# Vicinity-backed DocumentStore for Haystack.
from typing import Any, Dict, Iterable, List, Optional

import numpy as np
from vicinity import Vicinity

from haystack import logging
from haystack.dataclasses import Document
from haystack.document_stores.errors import DuplicateDocumentError
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import expit
from haystack.utils.filters import document_matches_filter

logger = logging.getLogger(__name__)
class VicinityDocumentStore:
    """Haystack-compatible DocumentStore backed by a Vicinity vector index.

    Documents are kept as Vicinity items (together with their embeddings when
    available), so one object serves metadata filtering, BM25 retrieval, and
    embedding-based retrieval.

    NOTE(review): ``bm25_retrieval`` reads ``self.bm25_algorithm`` and
    ``self.bm25_algorithm_inst`` which are not initialized anywhere in this
    file — they must be configured elsewhere; TODO confirm.
    """

    # Divisor used when squashing BM25 scores through the sigmoid (expit).
    # The name was referenced but never defined in the original file; the
    # value 8 mirrors Haystack's InMemoryDocumentStore — TODO confirm.
    BM25_SCALING_FACTOR = 8

    def __init__(self, backend_type: str = "basic", **kwargs):
        """Create an empty store.

        :param backend_type: Name of the Vicinity backend to use (e.g. "basic").
        :param kwargs: Extra options forwarded to the Vicinity backend.
        """
        # store_vectors=True keeps the raw vectors so the index can be
        # persisted and re-queried later.
        self.vicinity = Vicinity.from_vectors_and_items(
            vectors=[], items=[], backend_type=backend_type, store_vectors=True, **kwargs
        )

    def save_to_disk(self, path: str) -> None:
        """Persist the underlying Vicinity index to ``path``."""
        self.vicinity.save(path)

    def load_from_disk(self, path: str) -> None:
        """Replace the current index with one loaded from ``path``.

        Bug fix: ``Vicinity.load`` returns a new instance instead of mutating
        the receiver; the original discarded the result, so "loading" left the
        store unchanged.
        """
        self.vicinity = Vicinity.load(path)

    def save_to_hub(self, hub_name: str, hub_model_id: str) -> None:
        """Upload the underlying Vicinity index to the Hub."""
        self.vicinity.save_to_hub(hub_name, hub_model_id)

    def load_from_hub(self, hub_name: str, hub_model_id: str) -> None:
        """Replace the current index with one downloaded from the Hub.

        Bug fix: re-bind the loaded instance instead of discarding it
        (same issue as ``load_from_disk``) — TODO confirm the exact
        ``load_from_hub`` signature against the vicinity API.
        """
        self.vicinity = Vicinity.load_from_hub(hub_name, hub_model_id)

    def count_documents(self) -> int:
        """Return the number of documents currently stored."""
        return len(self.vicinity.items)

    def add_documents(self, documents: List[Document]) -> None:
        """Append ``documents`` to the index via Vicinity's bulk API.

        Bug fix: this was originally a *second* ``def count_documents`` which
        shadowed the counter above; renamed to match what the body does.
        """
        self.vicinity.add_documents(documents)

    def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """Return the documents that match the filters provided.

        For a detailed specification of the filters, refer to the
        DocumentStore.filter_documents() protocol documentation.

        :param filters: The filters to apply to the document list.
        :returns: A list of Documents that match the given filters.
        :raises ValueError: If ``filters`` does not use the documented syntax.
        """
        if filters:
            if "operator" not in filters and "conditions" not in filters:
                raise ValueError(
                    "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
                )
            return [doc for doc in self.vicinity.items if document_matches_filter(filters=filters, document=doc)]
        # No filters: return a copy so callers cannot mutate internal state.
        return list(self.vicinity.items)

    def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
        """Write ``documents`` into the store, honoring the duplicate policy.

        Refer to the DocumentStore.write_documents() protocol documentation.
        If ``policy`` is set to ``DuplicatePolicy.NONE`` it defaults to
        ``DuplicatePolicy.FAIL``.

        :param documents: Documents to write; each may carry an embedding.
        :param policy: How to treat documents whose id already exists.
        :returns: Number of documents actually written.
        :raises ValueError: If ``documents`` is not a list of Documents.
        :raises DuplicateDocumentError: On an existing id under FAIL policy.
        """
        if (
            not isinstance(documents, Iterable)
            or isinstance(documents, str)
            or any(not isinstance(doc, Document) for doc in documents)
        ):
            raise ValueError("Please provide a list of Documents.")

        if policy == DuplicatePolicy.NONE:
            policy = DuplicatePolicy.FAIL

        written_documents = len(documents)
        for document in documents:
            # Linear scan for an existing item with the same id.
            document_in_index = False
            for item in self.vicinity.items:
                if item.id == document.id:
                    document_in_index = True
                    break
            if policy != DuplicatePolicy.OVERWRITE and document_in_index:
                if policy == DuplicatePolicy.FAIL:
                    raise DuplicateDocumentError(f"ID '{document.id}' already exists.")
                if policy == DuplicatePolicy.SKIP:
                    logger.warning("ID '{document_id}' already exists", document_id=document.id)
                    written_documents -= 1
                    continue
            # Since the statistics are updated in an incremental manner,
            # we need to explicitly remove the existing document to revert
            # the statistics before updating them with the new document.
            if document_in_index:
                self.vicinity.delete([document])
            self.vicinity.insert([document], embedding=np.array(document.embedding))
        return written_documents

    def delete_documents(self, document_ids: List[str]) -> None:
        """Delete all documents with matching document_ids from the store.

        Unknown ids are silently ignored.

        :param document_ids: The object_ids to delete.
        """
        items_to_delete = []
        for doc_id in document_ids:
            for item in self.vicinity.items:
                if item.id == doc_id:
                    items_to_delete.append(item)
        self.vicinity.delete(items_to_delete)

    def bm25_retrieval(
        self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, scale_score: bool = False
    ) -> List[Document]:
        """Retrieve the documents most relevant to ``query`` using BM25.

        :param query: The query string.
        :param filters: A dictionary with filters to narrow down the search space.
        :param top_k: The number of top documents to retrieve. Default is 10.
        :param scale_score: Whether to scale the scores of the retrieved documents. Default is False.
        :returns: A list of the top_k documents most relevant to the query.
        :raises ValueError: On an empty query or invalid filter syntax.
        """
        if not query:
            raise ValueError("Query should be a non-empty string")

        # Only documents that actually have textual content can be ranked.
        content_type_filter = {
            "operator": "OR",
            "conditions": [
                {"field": "content", "operator": "!=", "value": None},
                {"field": "dataframe", "operator": "!=", "value": None},
            ],
        }
        if filters:
            if "operator" not in filters:
                raise ValueError(
                    "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
                )
            filters = {"operator": "AND", "conditions": [content_type_filter, filters]}
        else:
            filters = content_type_filter

        all_documents = self.filter_documents(filters=filters)
        if len(all_documents) == 0:
            logger.info("No documents found for BM25 retrieval. Returning empty list.")
            return []

        results = sorted(self.bm25_algorithm_inst(query, all_documents), key=lambda x: x[1], reverse=True)[:top_k]

        # BM25Okapi can return meaningful negative values, so they should not be filtered out when scale_score is False.
        # It's the only algorithm supported by rank_bm25 at the time of writing (2024) that can return negative scores.
        # see https://github.com/deepset-ai/haystack/pull/6889 for more context.
        negatives_are_valid = self.bm25_algorithm == "BM25Okapi" and not scale_score

        # Create documents with the BM25 score to return them
        return_documents = []
        for doc, score in results:
            if scale_score:
                score = expit(score / self.BM25_SCALING_FACTOR)
            if not negatives_are_valid and score <= 0.0:
                continue
            doc_fields = doc.to_dict()
            doc_fields["score"] = score
            return_documents.append(Document.from_dict(doc_fields))
        return return_documents

    def embedding_retrieval(  # pylint: disable=too-many-positional-arguments
        self,
        query_embedding: List[float],
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 10,
        scale_score: bool = False,
        return_embedding: bool = False,
    ) -> List[Document]:
        """Retrieve the documents most similar to ``query_embedding``.

        :param query_embedding: Embedding of the query.
        :param filters: A dictionary with filters to narrow down the search space.
        :param top_k: The number of top documents to retrieve. Default is 10.
        :param scale_score: Whether to scale the scores of the retrieved Documents. Default is False.
            NOTE(review): this flag is currently accepted but never applied —
            scores are returned unscaled; TODO confirm intended behavior.
        :param return_embedding: Whether to return the embedding of the retrieved Documents. Default is False.
        :returns: A list of the top_k documents most relevant to the query.
        :raises ValueError: If ``query_embedding`` is empty or not floats.
        """
        if len(query_embedding) == 0 or not isinstance(query_embedding[0], float):
            raise ValueError("query_embedding should be a non-empty list of floats.")

        k_nearest_neighbors = self.vicinity.query(np.array(query_embedding), top_k=top_k)
        all_documents = [self.vicinity.items[i[0]] for i in k_nearest_neighbors]
        distances = [i[1] for i in k_nearest_neighbors]

        # Filters are applied AFTER the top_k vector search, so fewer than
        # top_k documents may be returned when filters exclude neighbors.
        filters = filters or {}
        all_documents = [
            (doc, distance)
            for doc, distance in zip(all_documents, distances)
            if document_matches_filter(filters=filters, document=doc)
        ]

        # create Documents with the similarity score for the top k results
        top_documents = []
        for doc, score in sorted(all_documents, key=lambda x: x[1], reverse=True)[:top_k]:
            doc_fields = doc.to_dict()
            doc_fields["score"] = score
            if return_embedding is False:
                doc_fields["embedding"] = None
            top_documents.append(Document.from_dict(doc_fields))
        return top_documents
# NOTE: modeled after Haystack's InMemoryDocumentStore:
# https://docs.haystack.deepset.ai/docs/inmemorydocumentstore