From afc18390e0c02156f9ca5d7f43b66c0b4f4c399a Mon Sep 17 00:00:00 2001 From: OwenElliott <41710527+OwenPendrighElliott@users.noreply.github.com> Date: Wed, 7 Feb 2024 12:18:46 +1100 Subject: [PATCH] efSearch and approximate in Client search (#215) * adding efsearch and approximate to search in client * add tests for ef and approx * bump ver to 3.1.0 * back to 3.0.1 --- src/marqo/index.py | 9 +++- tests/v2_tests/test_tensor_search.py | 74 ++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/marqo/index.py b/src/marqo/index.py index b7393da9..641cc6c8 100644 --- a/src/marqo/index.py +++ b/src/marqo/index.py @@ -199,7 +199,8 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op highlights=None, device: Optional[str] = None, filter_string: str = None, show_highlights=True, reranker=None, image_download_headers: Optional[Dict] = None, attributes_to_retrieve: Optional[List[str]] = None, boost: Optional[Dict[str,List[Union[float, int]]]] = None, - context: Optional[dict] = None, score_modifiers: Optional[dict] = None, model_auth: Optional[dict] = None + context: Optional[dict] = None, score_modifiers: Optional[dict] = None, model_auth: Optional[dict] = None, + ef_search: Optional[int] = None, approximate: Optional[bool] = None ) -> Dict[str, Any]: """Search the index. @@ -229,6 +230,8 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op context: a dictionary to allow you to bring your own vectors and more into search. 
score_modifiers: a dictionary to modify the score based on field values, for tensor search only model_auth: authorisation that lets Marqo download a private model, if required + ef_search: the size of the list of candidates during graph traversal, for tensor search only + approximate: whether to use approximate nearest neighbors search or not, for tensor search only Returns: Dictionary with hits and other metadata """ @@ -266,6 +269,10 @@ def search(self, q: Optional[Union[str, dict]] = None, searchable_attributes: Op body["scoreModifiers"] = score_modifiers if model_auth is not None: body["modelAuth"] = model_auth + if ef_search is not None: + body["efSearch"] = ef_search + if approximate is not None: + body["approximate"] = approximate res = self.http.post( path=path_with_query_str, body=body, diff --git a/tests/v2_tests/test_tensor_search.py b/tests/v2_tests/test_tensor_search.py index d90c6444..7ef84de3 100644 --- a/tests/v2_tests/test_tensor_search.py +++ b/tests/v2_tests/test_tensor_search.py @@ -7,6 +7,7 @@ import math import time from tests.marqo_test import MarqoTestCase, CloudTestIndex +from marqo.errors import MarqoWebError from pytest import mark @@ -445,6 +446,79 @@ def test_multi_queries(self): for hit_position, _ in enumerate(res['hits']): assert res['hits'][hit_position]['_id'] == expected_ordering[hit_position] + @mark.fixed + def test_search_with_ef_search(self): + """Tests if the ef_search parameter works""" + for cloud_test_index_to_use, open_source_test_index_name in self.test_cases: + test_index_name = self.get_test_index_name( + cloud_test_index_to_use=cloud_test_index_to_use, + open_source_test_index_name=open_source_test_index_name + ) + d1 = { + "Title": "This is a title about some doc. ", + "Description": """The Guardian is a British daily newspaper. 
It was founded in 1821 as The Manchester Guardian, and changed its name in 1959.[5] Along with its sister papers The Observer and The Guardian Weekly, The Guardian is part of the Guardian Media Group, owned by the Scott Trust.[6] The trust was created in 1936 to "secure the financial and editorial independence of The Guardian in perpetuity and to safeguard the journalistic freedom and liberal values of The Guardian free from commercial or political interference".[7] The trust was converted into a limited company in 2008, with a constitution written so as to maintain for The Guardian the same protections as were built into the structure of the Scott Trust by its creators. Profits are reinvested in journalism rather than distributed to owners or shareholders.[7] It is considered a newspaper of record in the UK.[8][9] + The editor-in-chief Katharine Viner succeeded Alan Rusbridger in 2015.[10][11] Since 2018, the paper's main newsprint sections have been published in tabloid format. As of July 2021, its print edition had a daily circulation of 105,134.[4] The newspaper has an online edition, TheGuardian.com, as well as two international websites, Guardian Australia (founded in 2013) and Guardian US (founded in 2011). 
The paper's readership is generally on the mainstream left of British political opinion,[12][13][14][15] and the term "Guardian reader" is used to imply a stereotype of liberal, left-wing or "politically correct" views.[3] Frequent typographical errors during the age of manual typesetting led Private Eye magazine to dub the paper the "Grauniad" in the 1960s, a nickname still used occasionally by the editors for self-mockery.[16] + """ + } + _ = self.client.index(test_index_name).add_documents([d1], tensor_fields=["Title", "Description"]) + + if self.IS_MULTI_INSTANCE: + self.warm_request(self.client.index(test_index_name).search, + "title about some doc") + + # text basic search works + search_res = self.client.index(test_index_name).search("text", ef_search=200) + assert len(search_res["hits"]) == 1 + assert self.strip_marqo_fields(search_res["hits"][0]) == d1 + assert len(search_res["hits"][0]["_highlights"]) > 0 + assert ("Title" in search_res["hits"][0]["_highlights"][0]) or ("Description" in search_res["hits"][0]["_highlights"][0]) + + # test error on negative ef_search + with self.assertRaises(MarqoWebError): + self.client.index(test_index_name).search("text", ef_search=-100) + + # test error on ef_search and LEXICAL + with self.assertRaises(MarqoWebError): + self.client.index(test_index_name).search("text", search_method='LEXICAL', ef_search=1000) + + @mark.fixed + def test_search_with_approximate(self): + """Tests if the approximate parameter works""" + for cloud_test_index_to_use, open_source_test_index_name in self.test_cases: + test_index_name = self.get_test_index_name( + cloud_test_index_to_use=cloud_test_index_to_use, + open_source_test_index_name=open_source_test_index_name + ) + d1 = { + "Title": "This is a title about some doc. ", + "Description": """The Guardian is a British daily newspaper. 
It was founded in 1821 as The Manchester Guardian, and changed its name in 1959.[5] Along with its sister papers The Observer and The Guardian Weekly, The Guardian is part of the Guardian Media Group, owned by the Scott Trust.[6] The trust was created in 1936 to "secure the financial and editorial independence of The Guardian in perpetuity and to safeguard the journalistic freedom and liberal values of The Guardian free from commercial or political interference".[7] The trust was converted into a limited company in 2008, with a constitution written so as to maintain for The Guardian the same protections as were built into the structure of the Scott Trust by its creators. Profits are reinvested in journalism rather than distributed to owners or shareholders.[7] It is considered a newspaper of record in the UK.[8][9] + The editor-in-chief Katharine Viner succeeded Alan Rusbridger in 2015.[10][11] Since 2018, the paper's main newsprint sections have been published in tabloid format. As of July 2021, its print edition had a daily circulation of 105,134.[4] The newspaper has an online edition, TheGuardian.com, as well as two international websites, Guardian Australia (founded in 2013) and Guardian US (founded in 2011). 
The paper's readership is generally on the mainstream left of British political opinion,[12][13][14][15] and the term "Guardian reader" is used to imply a stereotype of liberal, left-wing or "politically correct" views.[3] Frequent typographical errors during the age of manual typesetting led Private Eye magazine to dub the paper the "Grauniad" in the 1960s, a nickname still used occasionally by the editors for self-mockery.[16] + """ + } + _ = self.client.index(test_index_name).add_documents([d1], tensor_fields=["Title", "Description"]) + + if self.IS_MULTI_INSTANCE: + self.warm_request(self.client.index(test_index_name).search, + "title about some doc") + + # text basic search works with approximate + search_res_approx = self.client.index(test_index_name).search("text", approximate=True) + search_res_exact = self.client.index(test_index_name).search("text", approximate=False) + + for search_res in [search_res_approx, search_res_exact]: + assert len(search_res["hits"]) == 1 + assert self.strip_marqo_fields(search_res["hits"][0]) == d1 + assert len(search_res["hits"][0]["_highlights"]) > 0 + assert ("Title" in search_res["hits"][0]["_highlights"][0]) or ("Description" in search_res["hits"][0]["_highlights"][0]) + + # test error on approximate=True with LEXICAL search + with self.assertRaises(MarqoWebError): + self.client.index(test_index_name).search("text", search_method='LEXICAL', approximate=True) + + # test error on approximate=False with LEXICAL search + with self.assertRaises(MarqoWebError): + self.client.index(test_index_name).search("text", search_method='LEXICAL', approximate=False) + @mark.fixed def test_escaped_non_tensor_field(self): """We need to make sure non tensor field escaping works properly.