Skip to content

Commit

Permalink
[features] bring your own vectors (#381)
Browse files Browse the repository at this point in the history
* updated

* update mock

* update mock

* change sum to mean.

* change sum to mean.

* updated

* updated

* add query type check logic

* add maximum number of vectors limit

* add tests

* updated

* catch mainline

* catch mainline

* catch mainline

* catch mainline

* add test for vectors

* add test for vectors

* add test for vectors
  • Loading branch information
wanliAlex authored Mar 16, 2023
1 parent 6ec18eb commit 10fbf58
Show file tree
Hide file tree
Showing 7 changed files with 327 additions and 15 deletions.
3 changes: 2 additions & 1 deletion src/marqo/tensor_search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ def search(search_query: SearchQuery, index_name: str, device: str = Depends(api
reranker=search_query.reRanker,
filter=search_query.filter, device=device,
attributes_to_retrieve=search_query.attributesToRetrieve, boost=search_query.boost,
image_download_headers=search_query.image_download_headers
image_download_headers=search_query.image_download_headers,
context=search_query.context
)


Expand Down
1 change: 1 addition & 0 deletions src/marqo/tensor_search/models/api_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class SearchQuery(BaseModel):
attributesToRetrieve: Union[None, List[str]] = None
boost: Optional[Dict] = None
image_download_headers: Optional[Dict] = None
context: Optional[Dict] = None

@pydantic.validator('searchMethod')
def validate_search_method(cls, value):
Expand Down
31 changes: 31 additions & 0 deletions src/marqo/tensor_search/models/context_object.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
context_schema = {
"$schema": "http://json-schema.org/draft-04/schema#",
"type": "object",
"properties": {
"tensor": {
"type": "array",
"minItems":1,
"maxItems" : 64,
"items":
{
"type": "object",
"properties": {
"vector": {
"type": "array",
"items": {"type": "number"}
},
"weight": {
"type": "number"
}
},
"required": [
"vector",
"weight"
]
},
}
},
"required": [
"tensor"
]
}
52 changes: 38 additions & 14 deletions src/marqo/tensor_search/tensor_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -994,13 +994,13 @@ def bulk_search(query: BulkSearchQuery, marqo_config: config.Config, verbose: bo
Args:
query: Set of search queries
marqo_config:
verbose:
device:
verbose:
device:
Notes:
Current limitations:
- Lexical and tensor search done in serial.
- A single error (e.g. validation errors) on any one of the search queries returns an error and does not
- A single error (e.g. validation errors) on any one of the search queries returns an error and does not
process non-erroring queries.
"""
# TODO: Let non-errored docs to propagate.
Expand Down Expand Up @@ -1041,7 +1041,7 @@ def bulk_search(query: BulkSearchQuery, marqo_config: config.Config, verbose: bo
s["limit"] = q.limit
s["offset"] = q.offset

## TODO: filter out highlights within `_lexical_search`
## TODO: filter out highlights within `_lexical_search`
if not q.showHighlights:
for hit in s["hits"]:
del hit["_highlights"]
Expand Down Expand Up @@ -1090,7 +1090,8 @@ def search(config: Config, index_name: str, text: Union[str, dict],
reranker: Union[str, Dict] = None, simplified_format: bool = True, filter: str = None,
attributes_to_retrieve: Optional[List[str]] = None,
device=None, boost: Optional[Dict] = None,
image_download_headers: Optional[Dict] = None) -> Dict:
image_download_headers: Optional[Dict] = None,
context: Optional[Dict] = None) -> Dict:
"""The root search method. Calls the specific search method
Validation should go here. Validations include:
Expand All @@ -1113,6 +1114,7 @@ def search(config: Config, index_name: str, text: Union[str, dict],
num_highlights: number of highlights to return for each doc
boost: boosters to re-weight the scores of individual fields
image_download_headers: headers for downloading images
context: a dictionary to allow custom vectors in search, for tensor search only
Returns:
"""
Expand Down Expand Up @@ -1164,7 +1166,7 @@ def search(config: Config, index_name: str, text: Union[str, dict],
return_doc_ids=return_doc_ids, searchable_attributes=searchable_attributes, verbose=verbose,
number_of_highlights=num_highlights, simplified_format=simplified_format,
filter_string=filter, device=device, attributes_to_retrieve=attributes_to_retrieve, boost=boost,
image_download_headers=image_download_headers
image_download_headers=image_download_headers, context=context
)
elif search_method.upper() == SearchMethod.LEXICAL:
search_result = _lexical_search(
Expand Down Expand Up @@ -1582,7 +1584,7 @@ def get_query_vectors_from_jobs(
Handles multi-modal queries, by weighting and combining queries into a single vector
Args:
- queries: Original search queries.
- queries: Original search queries.
- qidx_to_job: VectorisedJobPointer for each query
- job_to_vectors: inference output from each VectorisedJob
- config: standard Marqo config.
Expand Down Expand Up @@ -1667,15 +1669,15 @@ def create_empty_query_response(queries: List[BulkSearchQueryEntity]) -> List[Di


def _bulk_vector_text_search(config: Config, queries: List[BulkSearchQueryEntity], device=None) -> List[Dict]:
"""Resolve a batch of search queries in parallel.
"""Resolve a batch of search queries in parallel.
Args:
- config:
- config:
- queries: A list of independent search queries. Can be across multiple indexes, but are all expected to have `searchMethod = "TENSOR"`
Returns:
A list of search query responses (see `_format_ordered_docs_simple` for structure of individual entities).
A list of search query responses (see `_format_ordered_docs_simple` for structure of individual entities).
Note:
- Search results are in the same order as `queries`.
- Search results are in the same order as `queries`.
"""
if len(queries) == 0:
return []
Expand Down Expand Up @@ -1765,7 +1767,8 @@ def _vector_text_search(
verbose=0, raise_on_searchable_attribs=False, hide_vectors=True, k=500,
simplified_format=True, filter_string: str = None, device=None,
attributes_to_retrieve: Optional[List[str]] = None, boost: Optional[Dict] = None,
image_download_headers: Optional[Dict] = None):
image_download_headers: Optional[Dict] = None,
context: Optional[Dict] = None,):
"""
Args:
config:
Expand All @@ -1783,12 +1786,15 @@ def _vector_text_search(
objects are printed out
attributes_to_retrieve: if set, only returns these fields
image_download_headers: headers for downloading images
context: a dictionary to allow custom vectors in search
Returns:
Note:
- uses multisearch, which returns k results in each attribute. Not that much of a concern unless you have a
ridiculous number of attributes
- Should not be directly called by client - the search() method should
be called. The search() method adds syncing
Output format:
[
{
Expand All @@ -1802,6 +1808,17 @@ def _vector_text_search(
- searching a non existent index should return a HTTP-type error
"""
# SEARCH TIMER-LOGGER (pre-processing)
custom_tensors = None
if context is not None:
if isinstance(query, dict):
validation.validate_context_object(context_object=context)
custom_tensors = context.get("tensor", None)
elif isinstance(query, str):
raise errors.InvalidArgError(f"Marqo received a query = `{query}` with type =`{type(query).__name__}` "
f"and a context = `{context}`.\n"
f"This is not supported as the context only works when the query is a dictionary."
f"If you aim to search with your custom vectors, reformat the query as a dictionary.\n"
f"Please check `https://docs.marqo.ai/0.0.16/` for more information.")
start_preprocess_time = timer()
try:
index_info = get_index_info(config=config, index_name=index_name)
Expand Down Expand Up @@ -1843,8 +1860,15 @@ def _vector_text_search(
if q in batch_dict:
vec = batch_dict[q]
weighted_vectors.append(np.asarray(vec) * weight)

vectorised_text = np.mean(weighted_vectors, axis=0)
if custom_tensors:
weighted_vectors += [np.asarray(v["vector"]) * v["weight"]for v in custom_tensors]
try:
vectorised_text = np.mean(weighted_vectors, axis=0)
except ValueError as e:
raise errors.InvalidArgError(f"The provided vectors are not in the same dimension of the index."
f"This causes the error when we do `numpy.mean()` over all the vectors.\n"
f"The original error is `{e}`.\n"
f"Please check `https://docs.marqo.ai/0.0.15/API-Reference/search/`.")
if index_info.index_settings['index_defaults']['normalize_embeddings']:
norm = np.linalg.norm(vectorised_text, axis=-1, keepdims=True)
if norm > 0:
Expand Down
18 changes: 18 additions & 0 deletions src/marqo/tensor_search/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import jsonschema
from marqo.tensor_search.models.settings_object import settings_schema
from marqo.tensor_search.models.mappings_object import mappings_schema, multimodal_combination_schema
from marqo.tensor_search.models.context_object import context_schema


def validate_query(q: Union[dict, str], search_method: Union[str, SearchMethod]):
Expand Down Expand Up @@ -457,6 +458,23 @@ def validate_multimodal_combination(field_content, is_non_tensor_field, field_ma
return True


def validate_context_object(context_object: dict):
"""validates the mappings object.
Returns
the given context_object if passed the validation
Raises an InvalidArgError if the context object is badly formatted
"""
try:
jsonschema.validate(instance=context_object, schema=context_schema)
return context_object
except jsonschema.ValidationError as e:
raise InvalidArgError(
f"Error validating mappings object. Reason: \n{str(e)}"
f"\nRead about the mappings object here: https://docs.marqo.ai/0.0.16"
)


def validate_mappings_object(mappings_object: dict):
"""validates the mappings object.
Returns
Expand Down
111 changes: 111 additions & 0 deletions tests/tensor_search/test_custom_vectors_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import unittest.mock
import pprint

import torch

import marqo.tensor_search.backend
from marqo.errors import IndexNotFoundError, InvalidArgError
from marqo.tensor_search import tensor_search
from marqo.tensor_search.enums import TensorField, IndexSettingsField, SearchMethod
from tests.marqo_test import MarqoTestCase
from unittest.mock import patch
import numpy as np


class TestMultimodalTensorCombination(MarqoTestCase):

def setUp(self):
self.index_name_1 = "my-test-index-1"
self.endpoint = self.authorized_url

try:
tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
except IndexNotFoundError as e:
pass

tensor_search.create_vector_index(
index_name=self.index_name_1, config=self.config, index_settings={
IndexSettingsField.index_defaults: {
IndexSettingsField.model: "ViT-B/32",
IndexSettingsField.treat_urls_and_pointers_as_images: True,
IndexSettingsField.normalize_embeddings: True
}
})
tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
{
"Title": "Horse rider",
"text_field": "A rider is riding a horse jumping over the barrier.",
"_id": "1"
}], auto_refresh=True)

def tearDown(self) -> None:
try:
tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
except:
pass

def test_search(self):
query = {
"A rider is riding a horse jumping over the barrier": 1,
}
res = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
{"tensor": [{"vector": [1, ] * 512, "weight": 2}, {"vector": [2, ] * 512, "weight": -1}], })

def test_search_with_incorrect_tensor_dimension(self):
query = {
"A rider is riding a horse jumping over the barrier": 1,
}
try:
res = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
{"tensor": [{"vector": [1, ] * 3, "weight": 0}, {"vector": [2, ] * 512, "weight": 0}], })
raise AssertionError
except InvalidArgError as e:
assert "This causes the error when we do `numpy.mean()` over" in e.message

def test_search_with_incorrect_query_format(self):
query = "A rider is riding a horse jumping over the barrier"
try:
res = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
{"tensor": [{"vector": [1, ] * 512, "weight": 0}, {"vector": [2, ] * 512, "weight": 0}], })
raise AssertionError
except InvalidArgError as e:
assert "This is not supported as the context only works when the query is a dictionary." in e.message

def test_search_score(self):
query = {
"A rider is riding a horse jumping over the barrier": 1,
}

res_1 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query)
res_2 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
{"tensor": [{"vector": [1, ] * 512, "weight": 0}, {"vector": [2, ] * 512, "weight": 0}], })
res_3 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
{"tensor": [{"vector": [1, ] * 512, "weight": -1}, {"vector": [1, ] * 512, "weight": 1}], })

assert res_1["hits"][0]["_score"] == res_2["hits"][0]["_score"]
assert res_1["hits"][0]["_score"] == res_3["hits"][0]["_score"]

def test_search_vectors(self):
with patch("numpy.mean", wraps = np.mean) as mock_mean:
query = {
"A rider is riding a horse jumping over the barrier": 1,
}
res_1 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query)

weight_1, weight_2, weight_3 = 2.5, 3.4, -1.334
vector_2 = [-1,] * 512
vector_3 = [1.3,] * 512
query = {
"A rider is riding a horse jumping over the barrier": weight_1,
}

res_2 = tensor_search.search(config=self.config, index_name=self.index_name_1, text=query, context=
{"tensor": [{"vector": vector_2, "weight": weight_2}, {"vector": vector_3, "weight": weight_3}], })

args_list = [args[0] for args in mock_mean.call_args_list]
vectorised_string = args_list[0][0][0]
weighted_vectors = args_list[1][0]

assert np.allclose(vectorised_string * weight_1, weighted_vectors[0], atol=1e-9)
assert np.allclose(np.array(vector_2) * weight_2, weighted_vectors[1], atol=1e-9)
assert np.allclose(np.array(vector_3) * weight_3, weighted_vectors[2], atol=1e-9)
Loading

0 comments on commit 10fbf58

Please sign in to comment.