[features] bring your own vectors (#381)

* updated * update mock * update mock * change sum to mean. * change sum to mean. * updated * updated * add query type check logic * add maximum number of vectors limit * add tests * updated * catch mainline * catch mainline * catch mainline * catch mainline * add test for vectors * add test for vectors * add test for vectors
marqo-ai · Mar 16, 2023 · 10fbf58 · 10fbf58
1 parent 6ec18eb
commit 10fbf58
Show file tree

Hide file tree

Showing 7 changed files with 327 additions and 15 deletions.
diff --git a/src/marqo/tensor_search/api.py b/src/marqo/tensor_search/api.py
@@ -129,7 +129,8 @@ def search(search_query: SearchQuery, index_name: str, device: str = Depends(api
         reranker=search_query.reRanker, 
         filter=search_query.filter, device=device,
         attributes_to_retrieve=search_query.attributesToRetrieve, boost=search_query.boost,
-        image_download_headers=search_query.image_download_headers
+        image_download_headers=search_query.image_download_headers,
+        context=search_query.context
     )
 
 

diff --git a/src/marqo/tensor_search/models/api_models.py b/src/marqo/tensor_search/models/api_models.py
@@ -22,6 +22,7 @@ class SearchQuery(BaseModel):
     attributesToRetrieve: Union[None, List[str]] = None
     boost: Optional[Dict] = None
     image_download_headers: Optional[Dict] = None
+    context: Optional[Dict] = None
 
     @pydantic.validator('searchMethod')
     def validate_search_method(cls, value):

diff --git a/src/marqo/tensor_search/models/context_object.py b/src/marqo/tensor_search/models/context_object.py
@@ -0,0 +1,31 @@
+context_schema = {
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "type": "object",
+  "properties": {
+    "tensor": {
+      "type": "array",
+      "minItems":1,
+      "maxItems" : 64,
+      "items":
+        {
+          "type": "object",
+          "properties": {
+            "vector": {
+              "type": "array",
+              "items": {"type": "number"}
+            },
+            "weight": {
+              "type": "number"
+            }
+          },
+          "required": [
+            "vector",
+            "weight"
+          ]
+        },
+    }
+  },
+  "required": [
+    "tensor"
+  ]
+}
diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py
@@ -994,13 +994,13 @@ def bulk_search(query: BulkSearchQuery, marqo_config: config.Config, verbose: bo
     Args:
         query: Set of search queries
         marqo_config:
-        verbose: 
-        device: 
+        verbose:
+        device:
 
     Notes:
         Current limitations:
           - Lexical and tensor search done in serial.
-          - A single error (e.g. validation errors) on any one of the search queries returns an error and does not 
+          - A single error (e.g. validation errors) on any one of the search queries returns an error and does not
             process non-erroring queries.
     """
     # TODO: Let non-errored docs to propagate.
@@ -1041,7 +1041,7 @@ def bulk_search(query: BulkSearchQuery, marqo_config: config.Config, verbose: bo
         s["limit"] = q.limit
         s["offset"] = q.offset
 
-        ## TODO: filter out highlights within `_lexical_search` 
+        ## TODO: filter out highlights within `_lexical_search`
         if not q.showHighlights:
             for hit in s["hits"]:
                 del hit["_highlights"]
@@ -1090,7 +1090,8 @@ def search(config: Config, index_name: str, text: Union[str, dict],
            reranker: Union[str, Dict] = None, simplified_format: bool = True, filter: str = None,
            attributes_to_retrieve: Optional[List[str]] = None,
            device=None, boost: Optional[Dict] = None,
-           image_download_headers: Optional[Dict] = None) -> Dict:
+           image_download_headers: Optional[Dict] = None,
+           context: Optional[Dict] = None) -> Dict:
     """The root search method. Calls the specific search method
 
     Validation should go here. Validations include:
@@ -1113,6 +1114,7 @@ def search(config: Config, index_name: str, text: Union[str, dict],
         num_highlights: number of highlights to return for each doc
         boost: boosters to re-weight the scores of individual fields
         image_download_headers: headers for downloading images
+        context: a dictionary to allow custom vectors in search, for tensor search only
     Returns:
 
     """
@@ -1164,7 +1166,7 @@ def search(config: Config, index_name: str, text: Union[str, dict],
             return_doc_ids=return_doc_ids, searchable_attributes=searchable_attributes, verbose=verbose,
             number_of_highlights=num_highlights, simplified_format=simplified_format,
             filter_string=filter, device=device, attributes_to_retrieve=attributes_to_retrieve, boost=boost,
-            image_download_headers=image_download_headers
+            image_download_headers=image_download_headers, context=context
         )
     elif search_method.upper() == SearchMethod.LEXICAL:
         search_result = _lexical_search(
@@ -1582,7 +1584,7 @@ def get_query_vectors_from_jobs(
     Handles multi-modal queries, by weighting and combining queries into a single vector
 
     Args:
-        - queries: Original search queries. 
+        - queries: Original search queries.
         - qidx_to_job: VectorisedJobPointer for each query
         - job_to_vectors: inference output from each VectorisedJob
         - config: standard Marqo config.
@@ -1667,15 +1669,15 @@ def create_empty_query_response(queries: List[BulkSearchQueryEntity]) -> List[Di
 
 
 def _bulk_vector_text_search(config: Config, queries: List[BulkSearchQueryEntity], device=None) -> List[Dict]:
-    """Resolve a batch of search queries in parallel. 
+    """Resolve a batch of search queries in parallel.
 
     Args:
-        - config: 
+        - config:
         - queries: A list of independent search queries. Can be across multiple indexes, but are all expected to have `searchMethod = "TENSOR"`
     Returns:
-        A list of search query responses (see `_format_ordered_docs_simple` for structure of individual entities). 
+        A list of search query responses (see `_format_ordered_docs_simple` for structure of individual entities).
     Note:
-        - Search results are in the same order as `queries`. 
+        - Search results are in the same order as `queries`.
     """
     if len(queries) == 0:
         return []
@@ -1765,7 +1767,8 @@ def _vector_text_search(
         verbose=0, raise_on_searchable_attribs=False, hide_vectors=True, k=500,
         simplified_format=True, filter_string: str = None, device=None,
         attributes_to_retrieve: Optional[List[str]] = None, boost: Optional[Dict] = None,
-        image_download_headers: Optional[Dict] = None):
+        image_download_headers: Optional[Dict] = None,
+        context: Optional[Dict] = None,):
     """
     Args:
         config:
@@ -1783,12 +1786,15 @@ def _vector_text_search(
             objects are printed out
         attributes_to_retrieve: if set, only returns these fields
         image_download_headers: headers for downloading images
+        context: a dictionary to allow custom vectors in search
     Returns:
+
     Note:
         - uses multisearch, which returns k results in each attribute. Not that much of a concern unless you have a
         ridiculous number of attributes
         - Should not be directly called by client - the search() method should
         be called. The search() method adds syncing
+
     Output format:
         [
             {
@@ -1802,6 +1808,17 @@ def _vector_text_search(
         - searching a non existent index should return a HTTP-type error
     """
     # SEARCH TIMER-LOGGER (pre-processing)
+    custom_tensors = None
+    if context is not None:
+        if isinstance(query, dict):
+            validation.validate_context_object(context_object=context)
+            custom_tensors = context.get("tensor", None)
+        elif isinstance(query, str):
+            raise errors.InvalidArgError(f"Marqo received a query = `{query}` with type =`{type(query).__name__}` "
+                                         f"and a context = `{context}`.\n"
+                                         f"This is not supported as the context only works when the query is a dictionary."
+                                         f"If you aim to search with your custom vectors, reformat the query as a dictionary.\n"
+                                         f"Please check `https://docs.marqo.ai/0.0.16/` for more information.")
     start_preprocess_time = timer()
     try:
         index_info = get_index_info(config=config, index_name=index_name)
@@ -1843,8 +1860,15 @@ def _vector_text_search(
                     if q in batch_dict:
                         vec = batch_dict[q]
                 weighted_vectors.append(np.asarray(vec) * weight)
-
-            vectorised_text = np.mean(weighted_vectors, axis=0)
+            if custom_tensors:
+                weighted_vectors += [np.asarray(v["vector"]) * v["weight"]for v in custom_tensors]
+            try:
+                vectorised_text = np.mean(weighted_vectors, axis=0)
+            except ValueError as e:
+                raise errors.InvalidArgError(f"The provided vectors are not in the same dimension of the index."
+                                             f"This causes the error when we do `numpy.mean()` over all the vectors.\n"
+                                             f"The original error is `{e}`.\n"
+                                             f"Please check `https://docs.marqo.ai/0.0.15/API-Reference/search/`.")
             if index_info.index_settings['index_defaults']['normalize_embeddings']:
                 norm = np.linalg.norm(vectorised_text, axis=-1, keepdims=True)
                 if norm > 0:

diff --git a/src/marqo/tensor_search/validation.py b/src/marqo/tensor_search/validation.py
@@ -16,6 +16,7 @@
 import jsonschema
 from marqo.tensor_search.models.settings_object import settings_schema
 from marqo.tensor_search.models.mappings_object import mappings_schema, multimodal_combination_schema
+from marqo.tensor_search.models.context_object import context_schema
 
 
 def validate_query(q: Union[dict, str], search_method: Union[str, SearchMethod]):
@@ -457,6 +458,23 @@ def validate_multimodal_combination(field_content, is_non_tensor_field, field_ma
     return True
 
 
+def validate_context_object(context_object: dict):
+    """validates the mappings object.
+        Returns
+            the given context_object if passed the validation
+
+        Raises an InvalidArgError if the context object is badly formatted
+        """
+    try:
+        jsonschema.validate(instance=context_object, schema=context_schema)
+        return context_object
+    except jsonschema.ValidationError as e:
+        raise InvalidArgError(
+            f"Error validating mappings object. Reason: \n{str(e)}"
+            f"\nRead about the mappings object here: https://docs.marqo.ai/0.0.16"
+        )
+
+
 def validate_mappings_object(mappings_object: dict):
     """validates the mappings object.
     Returns

diff --git a/tests/tensor_search/test_custom_vectors_search.py b/tests/tensor_search/test_custom_vectors_search.py
@@ -0,0 +1,111 @@
+import unittest.mock
+import pprint
+
+import torch
+
+import marqo.tensor_search.backend
+from marqo.errors import IndexNotFoundError, InvalidArgError
+from marqo.tensor_search import tensor_search
+from marqo.tensor_search.enums import TensorField, IndexSettingsField, SearchMethod
+from tests.marqo_test import MarqoTestCase
+from unittest.mock import patch
+import numpy as np
+
+
+class TestMultimodalTensorCombination(MarqoTestCase):
+
+    def setUp(self):
+        self.index_name_1 = "my-test-index-1"
+        self.endpoint = self.authorized_url
+
+        try:
+            tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
+        except IndexNotFoundError as e:
+            pass
+
+        tensor_search.create_vector_index(
+            index_name=self.index_name_1, config=self.config, index_settings={
+                IndexSettingsField.index_defaults: {
+                    IndexSettingsField.model: "ViT-B/32",
+                    IndexSettingsField.treat_urls_and_pointers_as_images: True,
+                    IndexSettingsField.normalize_embeddings: True
+                }
+            })
+        tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
+            {
+                "Title": "Horse rider",
+                "text_field": "A rider is riding a horse jumping over the barrier.",
+                "_id": "1"
+            }], auto_refresh=True)
+
+    def tearDown(self) -> None:
+        try:
+            tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
+        except:
+            pass
+
+    def test_search(self):
+        query = {
+            "A rider is riding a horse jumping over the barrier": 1,
+        }
+        res = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
+        {"tensor": [{"vector": [1, ] * 512, "weight": 2}, {"vector": [2, ] * 512, "weight": -1}], })
+
+    def test_search_with_incorrect_tensor_dimension(self):
+        query = {
+            "A rider is riding a horse jumping over the barrier": 1,
+        }
+        try:
+            res = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
+            {"tensor": [{"vector": [1, ] * 3, "weight": 0}, {"vector": [2, ] * 512, "weight": 0}], })
+            raise AssertionError
+        except InvalidArgError as e:
+            assert "This causes the error when we do `numpy.mean()` over" in e.message
+
+    def test_search_with_incorrect_query_format(self):
+        query = "A rider is riding a horse jumping over the barrier"
+        try:
+            res = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
+            {"tensor": [{"vector": [1, ] * 512, "weight": 0}, {"vector": [2, ] * 512, "weight": 0}], })
+            raise AssertionError
+        except InvalidArgError as e:
+            assert "This is not supported as the context only works when the query is a dictionary." in e.message
+
+    def test_search_score(self):
+        query = {
+            "A rider is riding a horse jumping over the barrier": 1,
+        }
+
+        res_1 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query)
+        res_2 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
+        {"tensor": [{"vector": [1, ] * 512, "weight": 0}, {"vector": [2, ] * 512, "weight": 0}], })
+        res_3 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
+        {"tensor": [{"vector": [1, ] * 512, "weight": -1}, {"vector": [1, ] * 512, "weight": 1}], })
+
+        assert res_1["hits"][0]["_score"] == res_2["hits"][0]["_score"]
+        assert res_1["hits"][0]["_score"] == res_3["hits"][0]["_score"]
+
+    def test_search_vectors(self):
+        with patch("numpy.mean", wraps = np.mean) as mock_mean:
+            query = {
+                "A rider is riding a horse jumping over the barrier": 1,
+            }
+            res_1 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query)
+
+            weight_1, weight_2, weight_3 = 2.5, 3.4, -1.334
+            vector_2 = [-1,] * 512
+            vector_3 = [1.3,] * 512
+            query = {
+                "A rider is riding a horse jumping over the barrier": weight_1,
+            }
+
+            res_2 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
+            {"tensor": [{"vector": vector_2, "weight": weight_2}, {"vector": vector_3, "weight": weight_3}], })
+
+            args_list = [args[0] for args in mock_mean.call_args_list]
+            vectorised_string = args_list[0][0][0]
+            weighted_vectors = args_list[1][0]
+
+            assert np.allclose(vectorised_string * weight_1, weighted_vectors[0], atol=1e-9)
+            assert np.allclose(np.array(vector_2) * weight_2, weighted_vectors[1], atol=1e-9)
+            assert np.allclose(np.array(vector_3) * weight_3, weighted_vectors[2], atol=1e-9)