Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Single KNN Field Optimisation #530

Merged
merged 36 commits into from
Jul 16, 2023
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3362422
using a single kNN opensearch field
pandu-k Jun 16, 2023
77e5ea5
made end-to-end case work
pandu-k Jun 16, 2023
c8eeaf0
added debug statements for OpenSearch requests
pandu-k Jun 16, 2023
82847dc
Added brackets around combined opensearch filter
pandu-k Jun 19, 2023
8bcde85
fixed bug with no searchable attributes
vicilliar Jun 22, 2023
d3c271a
changed marqo version
vicilliar Jun 29, 2023
c6c70a5
single knn-field: test_search.py passes (#526)
pandu-k Jul 4, 2023
a883792
removed debug message in OS
vicilliar Jul 4, 2023
d26beab
fixed add docs, backend, create index unit tests
vicilliar Jul 5, 2023
8a3b224
pagination error handling
vicilliar Jul 6, 2023
8652643
removed debug message
vicilliar Jul 6, 2023
fd8e2bd
fixed merge conflicts with mainline
vicilliar Jul 6, 2023
f246e31
fixed bulk search and search tests
vicilliar Jul 6, 2023
8757e32
fixed broken tests
vicilliar Jul 7, 2023
b154b07
fixed index meta cache tests, add knn field validation
vicilliar Jul 7, 2023
80f5ddd
empty results for empty searchable attr lexical
vicilliar Jul 10, 2023
aaab3c1
finished pagination tests
vicilliar Jul 11, 2023
4f0c180
refactored filtering, get_model_properties, index creation tests
vicilliar Jul 11, 2023
1dfab3c
fixed generic model error tests
vicilliar Jul 11, 2023
941a253
added knn field tests
vicilliar Jul 12, 2023
fa17d6e
updated filtering tests
vicilliar Jul 12, 2023
a9516c0
added changes to contextualise_user_filter
pandu-k Jul 13, 2023
32e2f90
filtering unit tests pass, double backslash escape fixed
vicilliar Jul 13, 2023
44ebbf1
added backslash escaping to Lucene sanitise function
pandu-k Jul 13, 2023
3cd3bf3
Merge branch 'joshua/single-knn-field' into pandu/filtering-update
pandu-k Jul 13, 2023
a322c15
fixed contextualise edge cases with field at start of filter
vicilliar Jul 13, 2023
119b3a1
fixed contextualise edge cases with field at start of filter
vicilliar Jul 13, 2023
333e0c0
more edge cases for filtering
vicilliar Jul 13, 2023
ced42b2
edge case fixed in contextualise, also added draft regex solution
vicilliar Jul 14, 2023
9acd484
fixed wrong string length skip
vicilliar Jul 14, 2023
4425043
removed unused code, updated name of contextualise
vicilliar Jul 14, 2023
5b2a815
Merge branch 'mainline' into joshua/single-knn-field
vicilliar Jul 14, 2023
01d14ce
fixed bug when content has field name by adding colon req
vicilliar Jul 14, 2023
556b164
gitignore vscode directory
vicilliar Jul 14, 2023
2c7785a
removed vscode settings json
vicilliar Jul 14, 2023
38ce7da
improved function docstrings
Jul 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,5 @@ src/marqo/tensor_search/test_throttle_timing.txt
src/marqo/tensor_search/test_throttle_timing.csv
dump.rdb

# VSCode
.vscode/
19 changes: 2 additions & 17 deletions src/marqo/tensor_search/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_index_info(config: Config, index_name: str) -> IndexInfo:

def add_customer_field_properties(config: Config, index_name: str,
customer_field_names: Iterable[Tuple[str, enums.OpenSearchDataType]],
model_properties: dict, multimodal_combination_fields: Dict[str, Iterable[Tuple[str, enums.OpenSearchDataType]]]):
multimodal_combination_fields: Dict[str, Iterable[Tuple[str, enums.OpenSearchDataType]]]):
"""Adds new customer fields to index mapping.

Pushes the updated mapping to OpenSearch, and updates the local cache.
Expand All @@ -80,25 +80,11 @@ def add_customer_field_properties(config: Config, index_name: str,
"""
existing_info = get_cached_index_info(config=config, index_name=index_name)

# check if there is multimodal fie;ds and convert the fields name to a list with the same
# format of customer_field_names
knn_field_names = copy.deepcopy(customer_field_names)
if len(multimodal_combination_fields) > 0:
multimodal_customer_field_names = set([(field_name, "_") for field_name in list(multimodal_combination_fields)])
knn_field_names = knn_field_names.union(multimodal_customer_field_names)

body = {
"properties": {
enums.TensorField.chunks: {
"type": "nested",
"properties": {
validation.validate_vector_name(
utils.generate_vector_name(field_name[0])): {
"type": "knn_vector",
"dimension": model_properties["dimensions"],
"method": existing_info.get_ann_parameters()
} for field_name in knn_field_names
}
"properties": {}
}
}
}
Expand Down Expand Up @@ -146,7 +132,6 @@ def add_customer_field_properties(config: Config, index_name: str,
"type": type_to_set
}


for multimodal_field, child_fields in multimodal_combination_fields.items():
# update the new multimodal_field if it's not in it
if multimodal_field not in new_index_properties:
Expand Down
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_default_index_settings():
NsFields.ann_parameters: get_default_ann_parameters()
},
NsFields.number_of_shards: 5,
NsFields.number_of_replicas : 1,
NsFields.number_of_replicas: 1,
}

def get_default_ann_parameters():
Expand Down
11 changes: 11 additions & 0 deletions src/marqo/tensor_search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,14 @@
NON_TENSORISABLE_FIELD_TYPES = [int, float, bool, list]

ALLOWED_MULTIMODAL_FIELD_TYPES = [str]

# Characters with special meaning in Lucene query syntax. These must be
# backslash-escaped when we construct filter strings on the user's behalf
# (e.g. for searchable attributes). Note '||' and '&&' are two-char operators.
LUCENE_SPECIAL_CHARS = {
    '/', '*', '^', '\\', '!', '[', '||', '?',
    '&&', '"', ']', '-', '{', '~', '+', '}', ':', ')', '('
}

# these are chars that are not officially listed as Lucene special chars, but
# aren't treated as normal chars either (a space would otherwise split terms)
NON_OFFICIAL_LUCENE_SPECIAL_CHARS = {
    ' '
}
1 change: 1 addition & 0 deletions src/marqo/tensor_search/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class TensorField:
chunk_ids = "__chunk_ids"
# the prefix will have the customer's field name appended to the end of it
vector_prefix = "__vector_"
marqo_knn_field = "__vector_marqo_knn_field"
chunks = "__chunks"
output_highlights = "_highlights"
output_score = "_score"
Expand Down
142 changes: 142 additions & 0 deletions src/marqo/tensor_search/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import os
import typing
from timeit import default_timer as timer
from marqo import errors
from marqo.tensor_search import enums, configs, constants
from typing import (
List, Optional, Union, Callable, Iterable, Sequence, Dict, Tuple
)
from marqo.marqo_logging import logger
import copy
from marqo.tensor_search.enums import EnvVars
import re

def build_tensor_search_filter(
        filter_string: str, simple_properties: dict,
        searchable_attribs: Sequence):
    """Builds a Lucene-DSL filter string for OpenSearch that combines the
    user's filter string with the searchable_attributes restriction.

    Args:
        filter_string: the user-supplied filter string (may be None).
        simple_properties: simple (non-tensor) properties of the index.
        searchable_attribs: field names the search is restricted to (may be None).

    Returns:
        A combined filter string; each part is parenthesised and AND-ed when
        both are present, otherwise whichever part is non-empty is returned.
    """
    attribs_filter = ""
    if searchable_attribs is not None:
        # build_searchable_attributes_filter consumes its argument, so hand it a copy
        attribs_filter = build_searchable_attributes_filter(
            searchable_attribs=copy.deepcopy(searchable_attribs))

    user_filter = add_chunks_prefix_to_filter_string_fields(
        filter_string=filter_string, simple_properties=simple_properties)

    if attribs_filter and user_filter:
        return f"({attribs_filter}) AND ({user_filter})"
    return attribs_filter + user_filter


def build_searchable_attributes_filter(searchable_attribs: Sequence) -> str:
    """Constructs the filter used to narrow the search down to specific searchable attributes.

    Produces an `OR` of `__chunks.__field_name:(<attr>)` clauses, consuming
    `searchable_attribs` from the end (callers pass a deep copy).

    This is iterative rather than recursive: the previous implementation
    recursed once per attribute, so a very long searchable_attributes list
    could hit Python's recursion limit. Output and the argument-consuming
    side effect are unchanged.

    Args:
        searchable_attribs: field names to restrict the search to; may be
            None or empty, in which case an empty string is returned.

    Returns:
        The Lucene filter clause string ("" when there is nothing to filter on).
    """
    if searchable_attribs is None:
        return ""

    clauses = []
    while searchable_attribs:
        # brackets surround field name, in case it contains a space:
        sanitised_attr_name = f"({sanitise_lucene_special_chars(searchable_attribs.pop())})"
        clauses.append(
            f"{enums.TensorField.chunks}.{enums.TensorField.field_name}:{sanitised_attr_name}")
    return " OR ".join(clauses)


def sanitise_lucene_special_chars(to_be_sanitised: str) -> str:
    """Sanitises Lucene's special chars in a string by backslash-escaping them.

    We shouldn't apply this to the user's filter string, as they can choose to escape
    Lucene's special chars themselves.

    This should be used to sanitise a filter string constructed for users behind the
    scenes (such as for searchable attributes).

    See here for more info:
    https://lucene.apache.org/core/6_0_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Escaping_Special_Characters

    """
    # Backslashes are escaped first; handling them with the other special
    # chars would double-escape the backslashes we insert below.
    sanitised = to_be_sanitised.replace("\\", "\\\\")

    remaining_special_chars = (
        constants.LUCENE_SPECIAL_CHARS
        | constants.NON_OFFICIAL_LUCENE_SPECIAL_CHARS
    ) - {'\\'}

    for special_char in remaining_special_chars:
        sanitised = sanitised.replace(special_char, f'\\{special_char}')
    return sanitised


def add_chunks_prefix_to_filter_string_fields(filter_string: Optional[str], simple_properties: typing.Iterable) -> str:
    """adds the chunk prefix to the start of properties found in simple string (filter_string)
    This allows for filtering within chunks.

    `filter_string` is Optional: it is None when the user supplied no filter,
    and '' is returned in that case. `simple_properties` must never be None.

    Because this is a user-defined filter, if they want to filter on field names that contain
    special characters, we expect them to escape the special characters themselves.

    In order to search chunks we need to append the chunk prefix to the start of the field name.
    This will only work if they escape the special characters in the field names themselves in
    the exact same way that we do.

    Args:
        filter_string: the user defined filter string
        simple_properties: simple properties of an index (such as text or floats
            and bools)

    Returns:
        a string where the properties are referenced as children of a chunk.

    Raises:
        errors.InternalError: if simple_properties is None.
    """
    if simple_properties is None:
        # If an index has no simple properties, simple_properties should be {}, but never None
        raise errors.InternalError("simple properties of an index can never be None!")

    if filter_string is None:
        return ''

    prefixed_filter = filter_string

    for field in simple_properties:
        # escape the field name the same way we expect the user to have done
        escaped_field_name = sanitise_lucene_special_chars(field)
        if escaped_field_name in filter_string:
            # The field name MUST be followed by a colon.
            # we want to replace the field name that directly corresponds to the simple property,
            # not any other field names that contain the simple property as a substring.
            escaped_field_name_with_colon = f'{escaped_field_name}:'

            # case 0: field name is at the start of the filter string
            # it must be followed by a colon, otherwise it is a substring of another field name
            # edge case example: "field_a_excess_chars:a, escaped_field_name=field_a"
            if filter_string.startswith(escaped_field_name_with_colon):
                # add the chunk prefix ONCE to the start of the field name
                prefixed_filter = f'{enums.TensorField.chunks}.{prefixed_filter}'

            # next we check every occurrence of field name NOT at the start of the filter string
            # note: we do this even if it was also at the start (the inserted
            # prefix shifts that occurrence, so it is not matched again below)
            possible_chars_before_field_name = {" ", "("}
            i = 0
            while i < len(prefixed_filter):
                # find every occurrence of the field name in the filter string
                if prefixed_filter[i:i+len(escaped_field_name_with_colon)] == escaped_field_name_with_colon:
                    # check if it is preceded by a space or an opening parenthesis
                    # also check that the preceding char is NOT escaped
                    # (i == 1 means the preceding char is at index 0, which cannot itself be escaped)
                    if \
                            (i > 0 and prefixed_filter[i-1] in possible_chars_before_field_name) and \
                            (i == 1 or prefixed_filter[i-2] != "\\"):
                        # if so, add the chunk prefix to the start of the field name
                        prefixed_filter = prefixed_filter[:i] + f"{enums.TensorField.chunks}." + prefixed_filter[i:]
                        # skip checking the newly inserted part
                        i += len(f"{enums.TensorField.chunks}.")
                i += 1

    return prefixed_filter
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _clean_doc(doc: dict, doc_id=None, include_vectors: bool = False) -> dict:
if include_vectors:
copied[TensorField.tensor_facets] = [
{ch[TensorField.field_name]: ch[TensorField.field_content],
TensorField.embedding: ch[utils.generate_vector_name(ch[TensorField.field_name])]
TensorField.embedding: ch[TensorField.marqo_knn_field]
} for ch in copied[TensorField.chunks]
]
if TensorField.chunks in copied:
Expand Down
55 changes: 42 additions & 13 deletions src/marqo/tensor_search/models/index_info.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
import pprint
from typing import NamedTuple, Any, Dict
from marqo.tensor_search import enums
from marqo.tensor_search.enums import IndexSettingsField as NsFields
from marqo.tensor_search.enums import IndexSettingsField as NsField
from marqo.tensor_search import configs
from marqo.s2_inference import s2_inference
from marqo import errors
from marqo.s2_inference import errors as s2_inference_errors


# For use outside of this module
def get_model_properties_from_index_defaults(index_defaults: Dict, model_name: str):
    """Gets model_properties from index defaults if available.

    Falls back to the model registry when the index defaults carry no
    model_properties entry.

    Args:
        index_defaults: the index's default settings dict.
        model_name: name of the model to look up in the registry if needed.

    Raises:
        errors.InvalidArgError: if the model is unknown to the registry and no
            model_properties were provided in the index defaults.
    """
    if NsField.model_properties in index_defaults:
        return index_defaults[NsField.model_properties]
    try:
        return s2_inference.get_model_properties_from_registry(model_name)
    except s2_inference_errors.UnknownModelError:
        raise errors.InvalidArgError(
            f"Could not find model properties for model={model_name}. "
            f"Please check that the model name is correct. "
            f"Please provide model_properties if the model is a custom model and is not supported by default")


class IndexInfo(NamedTuple):
"""
model_name: name of the ML model used to encode the data
properties: keys are different index field names, values
provide info about the properties
index_settings: settings for the index
"""
model_name: str
properties: dict
Expand Down Expand Up @@ -36,6 +58,15 @@ def get_text_properties(self) -> dict:
bool fields.

"""
# get_text_properties will flatten the object properties
# Example: left-text_properties right-true_text_properties
# {'Description': {'type': 'text'}, {'Description': {'type': 'text'},
# 'Genre': {'type': 'text'}, 'Genre': {'type': 'text'},
# 'Title': {'type': 'text'}, 'Title': {'type': 'text'},
# 'my_combination_field': {'properties': {'lexical_field': {'type': 'text'}, -----> 'my_combination_field.lexical_field': {'type': 'text'},
# 'my_image': {'type': 'text'}, 'my_combination_field.my_image': {'type': 'text'},
# 'some_text': {'type': 'text'}}}} 'my_combination_field.some_text': {'type': 'text'}}

text_props_dict = {}
for text_field, text_props in self.properties.items():
if not text_field.startswith(enums.TensorField.vector_prefix) and not text_field in enums.TensorField.__dict__.values():
Expand All @@ -46,14 +77,12 @@ def get_text_properties(self) -> dict:
text_props_dict[f"{text_field}.{sub_field}"] = sub_field_props
return text_props_dict

# get_text_properties will flatten the object properties
# Example: left-text_properties right-true_text_properties
# {'Description': {'type': 'text'}, {'Description': {'type': 'text'},
# 'Genre': {'type': 'text'}, 'Genre': {'type': 'text'},
# 'Title': {'type': 'text'}, 'Title': {'type': 'text'},
# 'my_combination_field': {'properties': {'lexical_field': {'type': 'text'}, -----> 'my_combination_field.lexical_field': {'type': 'text'},
# 'my_image': {'type': 'text'}, 'my_combination_field.my_image': {'type': 'text'},
# 'some_text': {'type': 'text'}}}} 'my_combination_field.some_text': {'type': 'text'}}
def get_model_properties(self) -> dict:
index_defaults = self.index_settings["index_defaults"]
return get_model_properties_from_index_defaults(
index_defaults=index_defaults, model_name=self.model_name
)


def get_true_text_properties(self) -> dict:
"""returns a dict containing only names and properties of fields that
Expand All @@ -79,16 +108,16 @@ def get_ann_parameters(self) -> Dict[str, Any]:

"""
ann_default = configs.get_default_ann_parameters()
index_ann_defaults = self.index_settings[NsFields.index_defaults].get(NsFields.ann_parameters, {})
index_ann_defaults = self.index_settings[NsField.index_defaults].get(NsField.ann_parameters, {})

# index defaults override generic defaults
ann_params = {
**ann_default,
**index_ann_defaults
}
ann_params[NsFields.ann_method_parameters] = {
**ann_default[NsFields.ann_method_parameters],
**index_ann_defaults.get(NsFields.ann_method_parameters, {})
ann_params[NsField.ann_method_parameters] = {
**ann_default[NsField.ann_method_parameters],
**index_ann_defaults.get(NsField.ann_method_parameters, {})
}

return ann_params
Loading