Skip to content

Commit

Permalink
Add HNSW hyperparameters m, ef_construction to default `index_set…
Browse files Browse the repository at this point in the history
…tings` config (#386)

* add hnsw hyperparameters m, ef_construction to default index_settings config

* update from self-PR review

* add hnsw hyperparameters m, ef_construction to default index_settings config

* fix OS index information parameters

* Readme examples new features (#403)

* Updated readme with new examples from the documentation and reordered the examples to keep document deletion and index deletion at the bottom

* removed accidental duplication of example

* updated links in README examples to more stable versions

* add hnsw hyperparameters m, ef_construction to default index_settings config

* add ann_parameters defaults into test_create_vector_index_custom_index_settings

---------

Co-authored-by: OwenElliott <41710527+OwenPendrighElliott@users.noreply.github.com>
  • Loading branch information
Jeadie and OwenPendrighElliott authored Apr 3, 2023
1 parent 5452c07 commit 37b1231
Show file tree
Hide file tree
Showing 8 changed files with 244 additions and 17 deletions.
12 changes: 2 additions & 10 deletions src/marqo/tensor_search/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def add_customer_field_properties(config: Config, index_name: str,
HTTP Response
"""
engine = "lucene"
existing_info = get_cached_index_info(config=config, index_name=index_name)

# check if there is multimodal fie;ds and convert the fields name to a list with the same
# format of customer_field_names
Expand All @@ -96,21 +97,12 @@ def add_customer_field_properties(config: Config, index_name: str,
utils.generate_vector_name(field_name[0])): {
"type": "knn_vector",
"dimension": model_properties["dimensions"],
"method": {
"name": "hnsw",
"space_type": "cosinesimil",
"engine": engine,
"parameters": {
"ef_construction": 128,
"m": 16
}
}
"method": existing_info.get_ann_parameters()
} for field_name in knn_field_names
}
}
}
}
existing_info = get_cached_index_info(config=config, index_name=index_name)
new_index_properties = existing_info.properties.copy()

# copy fields to the chunk for prefiltering. If it is text, convert it to a keyword type to save space
Expand Down
16 changes: 15 additions & 1 deletion src/marqo/tensor_search/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,26 @@ def get_default_index_settings():
# TODO move these into a processing dict with sub-dicts
NsFields.image_preprocessing: {
NsFields.patch_method: None
}
},
NsFields.ann_parameters: get_default_ann_parameters()
},
NsFields.number_of_shards: 5,
NsFields.number_of_replicas : 1,
}

def get_default_ann_parameters():
return {
NsFields.ann_method_name: "hnsw",
NsFields.ann_metric: "cosinesimil",

# `ann_engine` not exposed to customer (via index settings).
NsFields.ann_engine: "lucene",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}


def default_env_vars() -> dict:
"""Returns a dict of default env vars.
Expand Down
11 changes: 11 additions & 0 deletions src/marqo/tensor_search/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ class IndexSettingsField:
number_of_shards = "number_of_shards"
number_of_replicas = "number_of_replicas"

ann_parameters = "ann_parameters"
ann_method = "method"
ann_method_name = "name"
ann_metric = "space_type"
ann_engine = "engine"
ann_method_parameters = "parameters"

# method_parameters keys for "method"="hnsw"
hnsw_ef_construction = "ef_construction"
hnsw_m = "m"


class SplitMethod:
# consider moving this enum into processing
Expand Down
31 changes: 28 additions & 3 deletions src/marqo/tensor_search/models/index_info.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pprint
from typing import NamedTuple, Any
from typing import NamedTuple, Any, Dict
from marqo.tensor_search import enums

from marqo.tensor_search.enums import IndexSettingsField as NsFields
from marqo.tensor_search import configs

class IndexInfo(NamedTuple):
"""
Expand Down Expand Up @@ -66,4 +67,28 @@ def get_true_text_properties(self) -> dict:
true_text_props[text_field] = text_props
except KeyError:
continue
return true_text_props
return true_text_props

def get_ann_parameters(self) -> Dict[str, Any]:
"""Gets the ANN parameters to use as the default for the index.
Preferentially use index settings over generic defaults, when index settings exist.
Returns:
Dict of ann parameters. Structure can be seen at `configs.get_default_ann_parameters`.
"""
ann_default = configs.get_default_ann_parameters()
index_ann_defaults = self.index_settings[NsFields.index_defaults].get(NsFields.ann_parameters, {})

# index defaults override generic defaults
ann_params = {
**ann_default,
**index_ann_defaults
}
ann_params[NsFields.ann_method_parameters] = {
**ann_default[NsFields.ann_method_parameters],
**index_ann_defaults.get(NsFields.ann_method_parameters, {})
}

return ann_params
66 changes: 66 additions & 0 deletions src/marqo/tensor_search/models/settings_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,56 @@
"examples": [{
NsFields.patch_method: None
}]
},
NsFields.ann_parameters: {
"type": "object",
"required": [
# Non required for backwards compatibility
],
"properties": {
NsFields.ann_method: {
"type": "string",
"examples": [
"hnsw"
]
},
NsFields.ann_metric: {
"type": "string",
"examples": [
"cosinesimil"
]
},
NsFields.ann_method_parameters: {
"type": "object",
"required": [],
"properties": {
NsFields.hnsw_ef_construction: {
"type": "integer",
"examples": [
128
]
},
NsFields.hnsw_m: {
"type": "integer",
"examples": [
16
]
},
},
"examples": [{
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}]
}
},
"examples": [{
NsFields.ann_method: "hnsw",
NsFields.ann_metric: "cosinesimil",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}]
}
},
"examples": [{
Expand All @@ -105,6 +155,14 @@
},
NsFields.image_preprocessing: {
NsFields.patch_method: None
},
NsFields.ann_parameters: {
NsFields.ann_method: "hnsw",
NsFields.ann_metric: "cosinesimil",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}
}]
},
Expand Down Expand Up @@ -135,6 +193,14 @@
},
NsFields.image_preprocessing: {
NsFields.patch_method: None
},
NsFields.ann_parameters: {
NsFields.ann_method: "hnsw",
NsFields.ann_metric: "cosinesimil",
NsFields.ann_method_parameters: {
NsFields.hnsw_ef_construction: 128,
NsFields.hnsw_m: 16
}
}
},
NsFields.number_of_shards: 5,
Expand Down
52 changes: 51 additions & 1 deletion tests/tensor_search/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import requests
from marqo.tensor_search import enums, backend, utils
from marqo.tensor_search import tensor_search
from marqo.tensor_search.configs import get_default_ann_parameters
from marqo.errors import MarqoApiError, IndexNotFoundError
from tests.marqo_test import MarqoTestCase
from unittest import mock
Expand Down Expand Up @@ -68,4 +69,53 @@ def run():
args, kwargs0 = mock__put.call_args_list[0]
sent_dict = json.loads(kwargs0["body"])
assert "lucene" == sent_dict["properties"][enums.TensorField.chunks
]["properties"][utils.generate_vector_name(field_name="f1")]["method"]["engine"]
]["properties"][utils.generate_vector_name(field_name="f1")]["method"]["engine"]

def test_add_customer_field_properties_default_ann_parameters(self):
mock_config = copy.deepcopy(self.config)
mock__put = mock.MagicMock()

tensor_search.create_vector_index(
config=mock_config, index_name=self.index_name_1)
@mock.patch("marqo._httprequests.HttpRequests.put", mock__put)
def run():
tensor_search.add_documents(config=mock_config, docs=[{"f1": "doc"}, {"f2":"C"}],
index_name=self.index_name_1, auto_refresh=True)
return True
assert run()
args, kwargs0 = mock__put.call_args_list[0]
sent_dict = json.loads(kwargs0["body"])
assert sent_dict["properties"][enums.TensorField.chunks]["properties"][utils.generate_vector_name(field_name="f1")]["method"] == get_default_ann_parameters()


def test_add_customer_field_properties_index_ann_parameters(self):
mock_config = copy.deepcopy(self.config)
mock__put = mock.MagicMock()

tensor_search.create_vector_index(
config=mock_config,
index_name=self.index_name_1,
index_settings={
enums.IndexSettingsField.index_defaults: {
enums.IndexSettingsField.ann_parameters: {
enums.IndexSettingsField.ann_method_parameters: {
enums.IndexSettingsField.hnsw_ef_construction: 1,
enums.IndexSettingsField.hnsw_m: 2
}
}
}
}
)
@mock.patch("marqo._httprequests.HttpRequests.put", mock__put)
def run():
tensor_search.add_documents(config=mock_config, docs=[{"f1": "doc"}, {"f2":"C"}],
index_name=self.index_name_1, auto_refresh=True)
return True
assert run()
args, kwargs0 = mock__put.call_args_list[0]
sent_dict = json.loads(kwargs0["body"])
assert sent_dict["properties"][enums.TensorField.chunks]["properties"][utils.generate_vector_name(field_name="f1")]["method"]['engine'] == "lucene"
assert sent_dict["properties"][enums.TensorField.chunks]["properties"][utils.generate_vector_name(field_name="f1")]["method"]["parameters"] == {
enums.IndexSettingsField.hnsw_ef_construction: 1,
enums.IndexSettingsField.hnsw_m: 2
}
11 changes: 10 additions & 1 deletion tests/tensor_search/test_create_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,16 @@ def test_create_vector_index_custom_index_settings(self):
IndexSettingsField.treat_urls_and_pointers_as_images: True,
IndexSettingsField.normalize_embeddings: False,
IndexSettingsField.text_preprocessing: default_text_preprocessing,
IndexSettingsField.image_preprocessing: default_image_preprocessing
IndexSettingsField.image_preprocessing: default_image_preprocessing,
IndexSettingsField.ann_parameters: {
IndexSettingsField.ann_engine: 'lucene',
IndexSettingsField.ann_method_name: 'hnsw',
IndexSettingsField.ann_method_parameters: {
IndexSettingsField.hnsw_ef_construction: 128,
IndexSettingsField.hnsw_m: 16
},
IndexSettingsField.ann_metric: 'cosinesimil'
},
},
IndexSettingsField.number_of_shards: default_settings[IndexSettingsField.number_of_shards],
IndexSettingsField.number_of_replicas: default_settings[IndexSettingsField.number_of_replicas]
Expand Down
62 changes: 61 additions & 1 deletion tests/tensor_search/test_index_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import unittest
from marqo.tensor_search.models.index_info import IndexInfo
from marqo.tensor_search.models import index_info
from marqo.tensor_search.enums import TensorField
from marqo.tensor_search.enums import IndexSettingsField as NsFields, TensorField
from marqo.tensor_search import configs


Expand Down Expand Up @@ -98,3 +98,63 @@ def test_get_text_properties_some_text_props(self):
index_settings=configs.get_default_index_settings()
)
assert {"some_text_prop": {1:2334}, "cat": {"hat": "ter"}} == ii.get_text_properties()

def test_get_ann_parameters__default_index_param(self):
ii = IndexInfo(
model_name='some model',
properties={},
index_settings=configs.get_default_index_settings()
)
assert ii.get_ann_parameters() == configs.get_default_ann_parameters()

def test_get_ann_parameters__without_default_ann_parameters__use_defaults(self):
index_settings = configs.get_default_index_settings()
del index_settings[NsFields.index_defaults][NsFields.ann_parameters]

ii = IndexInfo(
model_name='some model',
properties={},
index_settings=index_settings
)
assert ii.get_ann_parameters() == configs.get_default_ann_parameters()

def test_get_ann_parameters__use_specified_index_settings__overide_defaults(self):
index_settings = configs.get_default_index_settings()
index_settings[NsFields.index_defaults][NsFields.ann_parameters][NsFields.ann_method_name] = "not-hnsw"

ii = IndexInfo(
model_name='some model',
properties={},
index_settings=index_settings
)
actual = ii.get_ann_parameters()
default = configs.get_default_ann_parameters()
assert actual[NsFields.ann_method_name] == "not-hnsw"

del actual[NsFields.ann_method_name]
del default[NsFields.ann_method_name]

assert actual == default

def test_get_ann_parameters__use_specified_ann_method_parameters__default_unspecified_values(self):
index_settings = configs.get_default_index_settings()
index_settings[NsFields.index_defaults][NsFields.ann_parameters][NsFields.ann_method_parameters] = {
NsFields.hnsw_ef_construction: 1,
NsFields.hnsw_m: 2
}

ii = IndexInfo(
model_name='some model',
properties={},
index_settings=index_settings
)
default = configs.get_default_ann_parameters()
actual = ii.get_ann_parameters()
assert actual[NsFields.ann_method_parameters] == {
NsFields.hnsw_ef_construction: 1,
NsFields.hnsw_m: 2
}
del actual[NsFields.ann_method_parameters]
del default[NsFields.ann_method_parameters]

assert actual == default

0 comments on commit 37b1231

Please sign in to comment.