Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add HNSW hyperparameters m, ef_construction to default index_settings config #386

Merged
merged 9 commits into from
Apr 3, 2023
12 changes: 2 additions & 10 deletions src/marqo/tensor_search/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def add_customer_field_properties(config: Config, index_name: str,
HTTP Response
"""
engine = "lucene"
existing_info = get_cached_index_info(config=config, index_name=index_name)

# check if there is multimodal fie;ds and convert the fields name to a list with the same
# format of customer_field_names
Expand All @@ -96,21 +97,12 @@ def add_customer_field_properties(config: Config, index_name: str,
utils.generate_vector_name(field_name[0])): {
"type": "knn_vector",
"dimension": model_properties["dimensions"],
"method": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could be "ann_method" for clarity, since it's handling ann parameters. but may not be important as this isn't exposed to users

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, this is explicitly defined by Opensearch, we must follow their convention.

"name": "hnsw",
"space_type": "cosinesimil",
"engine": engine,
"parameters": {
"ef_construction": 128,
"m": 16
}
}
"method": existing_info.get_ann_parameters()
} for field_name in knn_field_names
}
}
}
}
existing_info = get_cached_index_info(config=config, index_name=index_name)
new_index_properties = existing_info.properties.copy()

# copy fields to the chunk for prefiltering. If it is text, convert it to a keyword type to save space
Expand Down
16 changes: 15 additions & 1 deletion src/marqo/tensor_search/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,26 @@ def get_default_index_settings():
# TODO move these into a processing dict with sub-dicts
NsFields.image_preprocessing: {
NsFields.patch_method: None
}
},
NsFields.ann_parameters: get_default_ann_parameters()
},
NsFields.number_of_shards: 5,
NsFields.number_of_replicas : 1,
}

def get_default_ann_parameters():
return {
NsFields.ann_method_name: "hnsw",
NsFields.ann_metric: "cosinesimil",

# `ann_engine` not exposed to customer (via index settings).
NsFields.ann_engine: "lucene",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}


def default_env_vars() -> dict:
"""Returns a dict of default env vars.
Expand Down
11 changes: 11 additions & 0 deletions src/marqo/tensor_search/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ class IndexSettingsField:
number_of_shards = "number_of_shards"
number_of_replicas = "number_of_replicas"

ann_parameters = "ann_parameters"
ann_method = "method"
ann_method_name = "name"
ann_metric = "space_type"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why isn't this just ann_space_type

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, see above.

ann_engine = "engine"
ann_method_parameters = "parameters"

# method_parameters keys for "method"="hnsw"
hnsw_ef_construction = "ef_construction"
hnsw_m = "m"


class SplitMethod:
# consider moving this enum into processing
Expand Down
31 changes: 28 additions & 3 deletions src/marqo/tensor_search/models/index_info.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pprint
from typing import NamedTuple, Any
from typing import NamedTuple, Any, Dict
from marqo.tensor_search import enums

from marqo.tensor_search.enums import IndexSettingsField as NsFields
from marqo.tensor_search import configs

class IndexInfo(NamedTuple):
"""
Expand Down Expand Up @@ -66,4 +67,28 @@ def get_true_text_properties(self) -> dict:
true_text_props[text_field] = text_props
except KeyError:
continue
return true_text_props
return true_text_props

def get_ann_parameters(self) -> Dict[str, Any]:
"""Gets the ANN parameters to use as the default for the index.

Preferentially use index settings over generic defaults, when index settings exist.

Returns:
Dict of ann parameters. Structure can be seen at `configs.get_default_ann_parameters`.

"""
ann_default = configs.get_default_ann_parameters()
index_ann_defaults = self.index_settings[NsFields.index_defaults].get(NsFields.ann_parameters, {})

# index defaults override generic defaults
ann_params = {
**ann_default,
**index_ann_defaults
}
ann_params[NsFields.ann_method_parameters] = {
**ann_default[NsFields.ann_method_parameters],
**index_ann_defaults.get(NsFields.ann_method_parameters, {})
}

return ann_params
66 changes: 66 additions & 0 deletions src/marqo/tensor_search/models/settings_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,56 @@
"examples": [{
NsFields.patch_method: None
}]
},
NsFields.ann_parameters: {
"type": "object",
"required": [
# Non required for backwards compatibility
],
"properties": {
NsFields.ann_method: {
"type": "string",
"examples": [
"hnsw"
]
},
NsFields.ann_metric: {
"type": "string",
"examples": [
"cosinesimil"
]
},
NsFields.ann_method_parameters: {
"type": "object",
"required": [],
"properties": {
NsFields.hnsw_ef_construction: {
"type": "integer",
"examples": [
128
]
},
NsFields.hnsw_m: {
"type": "integer",
"examples": [
16
]
},
},
"examples": [{
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}]
}
},
"examples": [{
NsFields.ann_method: "hnsw",
NsFields.ann_metric: "cosinesimil",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}]
}
},
"examples": [{
Expand All @@ -105,6 +155,14 @@
},
NsFields.image_preprocessing: {
NsFields.patch_method: None
},
NsFields.ann_parameters: {
NsFields.ann_method: "hnsw",
NsFields.ann_metric: "cosinesimil",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}
}]
},
Expand Down Expand Up @@ -135,6 +193,14 @@
},
NsFields.image_preprocessing: {
NsFields.patch_method: None
},
NsFields.ann_parameters: {
NsFields.ann_method: "hnsw",
NsFields.ann_metric: "cosinesimil",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}
},
NsFields.number_of_shards: 5,
Expand Down
52 changes: 51 additions & 1 deletion tests/tensor_search/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import requests
from marqo.tensor_search import enums, backend, utils
from marqo.tensor_search import tensor_search
from marqo.tensor_search.configs import get_default_ann_parameters
from marqo.errors import MarqoApiError, IndexNotFoundError
from tests.marqo_test import MarqoTestCase
from unittest import mock
Expand Down Expand Up @@ -68,4 +69,53 @@ def run():
args, kwargs0 = mock__put.call_args_list[0]
sent_dict = json.loads(kwargs0["body"])
assert "lucene" == sent_dict["properties"][enums.TensorField.chunks
]["properties"][utils.generate_vector_name(field_name="f1")]["method"]["engine"]
]["properties"][utils.generate_vector_name(field_name="f1")]["method"]["engine"]

def test_add_customer_field_properties_default_ann_parameters(self):
mock_config = copy.deepcopy(self.config)
mock__put = mock.MagicMock()

tensor_search.create_vector_index(
config=mock_config, index_name=self.index_name_1)
@mock.patch("marqo._httprequests.HttpRequests.put", mock__put)
def run():
tensor_search.add_documents(config=mock_config, docs=[{"f1": "doc"}, {"f2":"C"}],
index_name=self.index_name_1, auto_refresh=True)
return True
assert run()
args, kwargs0 = mock__put.call_args_list[0]
sent_dict = json.loads(kwargs0["body"])
assert sent_dict["properties"][enums.TensorField.chunks]["properties"][utils.generate_vector_name(field_name="f1")]["method"] == get_default_ann_parameters()


def test_add_customer_field_properties_index_ann_parameters(self):
mock_config = copy.deepcopy(self.config)
mock__put = mock.MagicMock()

tensor_search.create_vector_index(
config=mock_config,
index_name=self.index_name_1,
index_settings={
enums.IndexSettingsField.index_defaults: {
enums.IndexSettingsField.ann_parameters: {
enums.IndexSettingsField.ann_method_parameters: {
enums.IndexSettingsField.hnsw_ef_construction: 1,
enums.IndexSettingsField.hnsw_m: 2
}
}
}
}
)
@mock.patch("marqo._httprequests.HttpRequests.put", mock__put)
def run():
tensor_search.add_documents(config=mock_config, docs=[{"f1": "doc"}, {"f2":"C"}],
index_name=self.index_name_1, auto_refresh=True)
return True
assert run()
args, kwargs0 = mock__put.call_args_list[0]
sent_dict = json.loads(kwargs0["body"])
assert sent_dict["properties"][enums.TensorField.chunks]["properties"][utils.generate_vector_name(field_name="f1")]["method"]['engine'] == "lucene"
assert sent_dict["properties"][enums.TensorField.chunks]["properties"][utils.generate_vector_name(field_name="f1")]["method"]["parameters"] == {
enums.IndexSettingsField.hnsw_ef_construction: 1,
enums.IndexSettingsField.hnsw_m: 2
}
11 changes: 10 additions & 1 deletion tests/tensor_search/test_create_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,16 @@ def test_create_vector_index_custom_index_settings(self):
IndexSettingsField.treat_urls_and_pointers_as_images: True,
IndexSettingsField.normalize_embeddings: False,
IndexSettingsField.text_preprocessing: default_text_preprocessing,
IndexSettingsField.image_preprocessing: default_image_preprocessing
IndexSettingsField.image_preprocessing: default_image_preprocessing,
IndexSettingsField.ann_parameters: {
IndexSettingsField.ann_engine: 'lucene',
IndexSettingsField.ann_method_name: 'hnsw',
IndexSettingsField.ann_method_parameters: {
IndexSettingsField.hnsw_ef_construction: 128,
IndexSettingsField.hnsw_m: 16
},
IndexSettingsField.ann_metric: 'cosinesimil'
},
},
IndexSettingsField.number_of_shards: default_settings[IndexSettingsField.number_of_shards],
IndexSettingsField.number_of_replicas: default_settings[IndexSettingsField.number_of_replicas]
Expand Down
62 changes: 61 additions & 1 deletion tests/tensor_search/test_index_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import unittest
from marqo.tensor_search.models.index_info import IndexInfo
from marqo.tensor_search.models import index_info
from marqo.tensor_search.enums import TensorField
from marqo.tensor_search.enums import IndexSettingsField as NsFields, TensorField
from marqo.tensor_search import configs


Expand Down Expand Up @@ -98,3 +98,63 @@ def test_get_text_properties_some_text_props(self):
index_settings=configs.get_default_index_settings()
)
assert {"some_text_prop": {1:2334}, "cat": {"hat": "ter"}} == ii.get_text_properties()

def test_get_ann_parameters__default_index_param(self):
ii = IndexInfo(
model_name='some model',
properties={},
index_settings=configs.get_default_index_settings()
)
assert ii.get_ann_parameters() == configs.get_default_ann_parameters()

def test_get_ann_parameters__without_default_ann_parameters__use_defaults(self):
index_settings = configs.get_default_index_settings()
del index_settings[NsFields.index_defaults][NsFields.ann_parameters]

ii = IndexInfo(
model_name='some model',
properties={},
index_settings=index_settings
)
assert ii.get_ann_parameters() == configs.get_default_ann_parameters()

def test_get_ann_parameters__use_specified_index_settings__overide_defaults(self):
index_settings = configs.get_default_index_settings()
index_settings[NsFields.index_defaults][NsFields.ann_parameters][NsFields.ann_method_name] = "not-hnsw"

ii = IndexInfo(
model_name='some model',
properties={},
index_settings=index_settings
)
actual = ii.get_ann_parameters()
default = configs.get_default_ann_parameters()
assert actual[NsFields.ann_method_name] == "not-hnsw"

del actual[NsFields.ann_method_name]
del default[NsFields.ann_method_name]

assert actual == default

def test_get_ann_parameters__use_specified_ann_method_parameters__default_unspecified_values(self):
index_settings = configs.get_default_index_settings()
index_settings[NsFields.index_defaults][NsFields.ann_parameters][NsFields.ann_method_parameters] = {
NsFields.hnsw_ef_construction: 1,
NsFields.hnsw_m: 2
}

ii = IndexInfo(
model_name='some model',
properties={},
index_settings=index_settings
)
default = configs.get_default_ann_parameters()
actual = ii.get_ann_parameters()
assert actual[NsFields.ann_method_parameters] == {
NsFields.hnsw_ef_construction: 1,
NsFields.hnsw_m: 2
}
del actual[NsFields.ann_method_parameters]
del default[NsFields.ann_method_parameters]

assert actual == default