diff --git a/src/marqo/core/models/marqo_index.py b/src/marqo/core/models/marqo_index.py index b3b1d1fc5..d58a18a35 100644 --- a/src/marqo/core/models/marqo_index.py +++ b/src/marqo/core/models/marqo_index.py @@ -56,7 +56,7 @@ class DistanceMetric(Enum): Euclidean = 'euclidean' Angular = 'angular' DotProduct = 'dotproduct' - PrenormalizedAnguar = 'prenormalized-angular' + PrenormalizedAngular = 'prenormalized-angular' Geodegrees = 'geodegrees' Hamming = 'hamming' diff --git a/src/marqo/core/structured_vespa_index/common.py b/src/marqo/core/structured_vespa_index/common.py index 24156a5ad..0d36c3de8 100644 --- a/src/marqo/core/structured_vespa_index/common.py +++ b/src/marqo/core/structured_vespa_index/common.py @@ -17,19 +17,3 @@ SUMMARY_ALL_NON_VECTOR = 'all-non-vector-summary' SUMMARY_ALL_VECTOR = 'all-vector-summary' - -_DISTANCE_METRIC_MAP = { - DistanceMetric.Euclidean: 'euclidean', - DistanceMetric.Angular: 'angular', - DistanceMetric.DotProduct: 'dotproduct', - DistanceMetric.PrenormalizedAnguar: 'prenormalized-angular', - DistanceMetric.Geodegrees: 'geodegrees', - DistanceMetric.Hamming: 'hamming' -} - - -def get_distance_metric(marqo_distance_metric: DistanceMetric) -> str: - try: - return _DISTANCE_METRIC_MAP[marqo_distance_metric] - except KeyError: - raise ValueError(f'Unknown Marqo distance metric: {marqo_distance_metric}') diff --git a/src/marqo/core/structured_vespa_index/structured_vespa_schema.py b/src/marqo/core/structured_vespa_index/structured_vespa_schema.py index 868b9e5c8..5eec41640 100644 --- a/src/marqo/core/structured_vespa_index/structured_vespa_schema.py +++ b/src/marqo/core/structured_vespa_index/structured_vespa_schema.py @@ -129,7 +129,7 @@ def _generate_document_section(self, schema_name: str) -> (List[str], Structured document.append(f'field {embedding_field_name} type tensor(p{{}}, x[{model_dim}]) {{') document.append('indexing: attribute | index | summary') document.append( - f'attribute {{ distance-metric: {common.get_distance_metric(self._index_request.distance_metric)} }}') + f'attribute {{ distance-metric: {self._get_distance_metric(self._index_request.distance_metric)} }}') document.append('index { hnsw {') document.append(f'max-links-per-node: {self._index_request.hnsw_config.m}') document.append(f'neighbors-to-explore-at-insert: {self._index_request.hnsw_config.ef_construction}') diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_vespa_schema.py b/src/marqo/core/unstructured_vespa_index/unstructured_vespa_schema.py index b8ff0c249..0e83de7b9 100644 --- a/src/marqo/core/unstructured_vespa_index/unstructured_vespa_schema.py +++ b/src/marqo/core/unstructured_vespa_index/unstructured_vespa_schema.py @@ -152,7 +152,7 @@ def _generate_unstructured_schema(cls, marqo_index: UnstructuredMarqoIndex) -> s field {cls._EMBEDDINGS} type tensor(p{{}}, x[{dimension}]) {{ indexing: attribute | index | summary attribute {{ - distance-metric: prenormalized-angular + distance-metric: {cls._get_distance_metric(cls, marqo_index.distance_metric)} }} index {{ hnsw {{ diff --git a/src/marqo/core/vespa_schema.py b/src/marqo/core/vespa_schema.py index 2c5f7680d..d148721ac 100644 --- a/src/marqo/core/vespa_schema.py +++ b/src/marqo/core/vespa_schema.py @@ -5,6 +5,8 @@ from marqo.core.models.marqo_index_request import MarqoIndexRequest, StructuredMarqoIndexRequest, \ UnstructuredMarqoIndexRequest +from marqo.core.models.marqo_index import * + class VespaSchema(ABC): """ @@ -16,6 +18,21 @@ class VespaSchema(ABC): '-': '_01', } + _DISTANCE_METRIC_MAP = { + DistanceMetric.Euclidean: 'euclidean', + DistanceMetric.Angular: 'angular', + DistanceMetric.DotProduct: 'dotproduct', + DistanceMetric.PrenormalizedAngular: 'prenormalized-angular', + DistanceMetric.Geodegrees: 'geodegrees', + DistanceMetric.Hamming: 'hamming' + } + + def _get_distance_metric(self, marqo_distance_metric: DistanceMetric) -> str: + try: + return self._DISTANCE_METRIC_MAP[marqo_distance_metric] + except KeyError: + raise ValueError(f'Unknown Marqo distance metric: {marqo_distance_metric}') + @abstractmethod def generate_schema(self) -> (str, MarqoIndex): """ diff --git a/src/marqo/tensor_search/models/index_settings.py b/src/marqo/tensor_search/models/index_settings.py index 893f5734c..9f9470370 100644 --- a/src/marqo/tensor_search/models/index_settings.py +++ b/src/marqo/tensor_search/models/index_settings.py @@ -35,7 +35,7 @@ class IndexSettings(StrictBaseModel): ) vectorNumericType: core.VectorNumericType = core.VectorNumericType.Float annParameters: AnnParameters = AnnParameters( - spaceType=core.DistanceMetric.PrenormalizedAnguar, + spaceType=core.DistanceMetric.PrenormalizedAngular, parameters=core.HnswConfig( efConstruction=512, m=16 diff --git a/tests/core/index_management/test_index_management.py b/tests/core/index_management/test_index_management.py index 373d16d3f..359ba5df9 100644 --- a/tests/core/index_management/test_index_management.py +++ b/tests/core/index_management/test_index_management.py @@ -113,7 +113,7 @@ def test_create_index_settingsSchemaDoesNotExist_successful(self): marqo_index_request = self.structured_marqo_index_request( name=index_name, model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -156,7 +156,7 @@ def test_create_index_settingsSchemaExists_successful(self): marqo_index_request = self.structured_marqo_index_request( name=index_name_1, model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -203,7 +203,7 @@ def test_create_index_indexExists_fails(self): marqo_index_request = self.structured_marqo_index_request( name=index_name, model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ diff --git a/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_angular.sd b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_angular.sd new file mode 100644 index 000000000..bdb8dc953 --- /dev/null +++ b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_angular.sd @@ -0,0 +1,65 @@ +schema marqo__test_00structured_00schema_00distance_00metric { +document { +field marqo__id type string { +indexing: attribute | summary +attribute: fast-search +rank: filter +} +field title type string { +indexing: summary +} +field description type string { +indexing: summary +} +field marqo__chunks_title type array { +indexing: attribute | summary +} +field marqo__embeddings_title type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: angular } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__chunks_description type array { +indexing: attribute | summary +} +field marqo__embeddings_description type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: angular } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__vector_count type int { indexing: attribute | summary } +} +rank-profile embedding_similarity inherits default { +inputs { +query(marqo__query_embedding) tensor(x[512]) +query(title): 0 +query(description): 0 +} +first-phase { +expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0)) +} +match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description) +} +document-summary all-non-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +} +document-summary all-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +summary marqo__embeddings_title type tensor(p{}, x[512]) { } +summary marqo__embeddings_description type tensor(p{}, x[512]) { } +} +} \ No newline at end of file diff --git a/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_dotproduct.sd b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_dotproduct.sd new file mode 100644 index 000000000..cdfc57558 --- /dev/null +++ b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_dotproduct.sd @@ -0,0 +1,65 @@ +schema marqo__test_00structured_00schema_00distance_00metric { +document { +field marqo__id type string { +indexing: attribute | summary +attribute: fast-search +rank: filter +} +field title type string { +indexing: summary +} +field description type string { +indexing: summary +} +field marqo__chunks_title type array { +indexing: attribute | summary +} +field marqo__embeddings_title type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: dotproduct } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__chunks_description type array { +indexing: attribute | summary +} +field marqo__embeddings_description type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: dotproduct } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__vector_count type int { indexing: attribute | summary } +} +rank-profile embedding_similarity inherits default { +inputs { +query(marqo__query_embedding) tensor(x[512]) +query(title): 0 +query(description): 0 +} +first-phase { +expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0)) +} +match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description) +} +document-summary all-non-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +} +document-summary all-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +summary marqo__embeddings_title type tensor(p{}, x[512]) { } +summary marqo__embeddings_description type tensor(p{}, x[512]) { } +} +} \ No newline at end of file diff --git a/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_euclidean.sd b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_euclidean.sd new file mode 100644 index 000000000..f07fb50c7 --- /dev/null +++ b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_euclidean.sd @@ -0,0 +1,65 @@ +schema marqo__test_00structured_00schema_00distance_00metric { +document { +field marqo__id type string { +indexing: attribute | summary +attribute: fast-search +rank: filter +} +field title type string { +indexing: summary +} +field description type string { +indexing: summary +} +field marqo__chunks_title type array { +indexing: attribute | summary +} +field marqo__embeddings_title type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: euclidean } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__chunks_description type array { +indexing: attribute | summary +} +field marqo__embeddings_description type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: euclidean } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__vector_count type int { indexing: attribute | summary } +} +rank-profile embedding_similarity inherits default { +inputs { +query(marqo__query_embedding) tensor(x[512]) +query(title): 0 +query(description): 0 +} +first-phase { +expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0)) +} +match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description) +} +document-summary all-non-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +} +document-summary all-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +summary marqo__embeddings_title type tensor(p{}, x[512]) { } +summary marqo__embeddings_description type tensor(p{}, x[512]) { } +} +} \ No newline at end of file diff --git a/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_geodegrees.sd b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_geodegrees.sd new file mode 100644 index 000000000..bfdae9172 --- /dev/null +++ b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_geodegrees.sd @@ -0,0 +1,65 @@ +schema marqo__test_00structured_00schema_00distance_00metric { +document { +field marqo__id type string { +indexing: attribute | summary +attribute: fast-search +rank: filter +} +field title type string { +indexing: summary +} +field description type string { +indexing: summary +} +field marqo__chunks_title type array { +indexing: attribute | summary +} +field marqo__embeddings_title type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: geodegrees } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__chunks_description type array { +indexing: attribute | summary +} +field marqo__embeddings_description type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: geodegrees } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__vector_count type int { indexing: attribute | summary } +} +rank-profile embedding_similarity inherits default { +inputs { +query(marqo__query_embedding) tensor(x[512]) +query(title): 0 +query(description): 0 +} +first-phase { +expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0)) +} +match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description) +} +document-summary all-non-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +} +document-summary all-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +summary marqo__embeddings_title type tensor(p{}, x[512]) { } +summary marqo__embeddings_description type tensor(p{}, x[512]) { } +} +} \ No newline at end of file diff --git a/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_hamming.sd b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_hamming.sd new file mode 100644 index 000000000..354dcd0af --- /dev/null +++ b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_hamming.sd @@ -0,0 +1,65 @@ +schema marqo__test_00structured_00schema_00distance_00metric { +document { +field marqo__id type string { +indexing: attribute | summary +attribute: fast-search +rank: filter +} +field title type string { +indexing: summary +} +field description type string { +indexing: summary +} +field marqo__chunks_title type array { +indexing: attribute | summary +} +field marqo__embeddings_title type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: hamming } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__chunks_description type array { +indexing: attribute | summary +} +field marqo__embeddings_description type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: hamming } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__vector_count type int { indexing: attribute | summary } +} +rank-profile embedding_similarity inherits default { +inputs { +query(marqo__query_embedding) tensor(x[512]) +query(title): 0 +query(description): 0 +} +first-phase { +expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0)) +} +match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description) +} +document-summary all-non-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +} +document-summary all-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +summary marqo__embeddings_title type tensor(p{}, x[512]) { } +summary marqo__embeddings_description type tensor(p{}, x[512]) { } +} +} \ No newline at end of file diff --git a/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_prenormalized-angular.sd b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_prenormalized-angular.sd new file mode 100644 index 000000000..83261d5f9 --- /dev/null +++ b/tests/core/structured_vespa_index/test_schemas/structured_distance_metric_prenormalized-angular.sd @@ -0,0 +1,65 @@ +schema marqo__test_00structured_00schema_00distance_00metric { +document { +field marqo__id type string { +indexing: attribute | summary +attribute: fast-search +rank: filter +} +field title type string { +indexing: summary +} +field description type string { +indexing: summary +} +field marqo__chunks_title type array { +indexing: attribute | summary +} +field marqo__embeddings_title type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: prenormalized-angular } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__chunks_description type array { +indexing: attribute | summary +} +field marqo__embeddings_description type tensor(p{}, x[512]) { +indexing: attribute | index | summary +attribute { distance-metric: prenormalized-angular } +index { hnsw { +max-links-per-node: 16 +neighbors-to-explore-at-insert: 100 +}} +} +field marqo__vector_count type int { indexing: attribute | summary } +} +rank-profile embedding_similarity inherits default { +inputs { +query(marqo__query_embedding) tensor(x[512]) +query(title): 0 +query(description): 0 +} +first-phase { +expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0)) +} +match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description) +} +document-summary all-non-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +} +document-summary all-vector-summary { +summary marqo__id type string { } +summary title type string { source: title } +summary description type string { source: description } +summary marqo__chunks_title type array { } +summary marqo__chunks_description type array { } +summary marqo__embeddings_title type tensor(p{}, x[512]) { } +summary marqo__embeddings_description type tensor(p{}, x[512]) { } +} +} \ No newline at end of file diff --git a/tests/core/structured_vespa_index/test_structured_vespa_index.py b/tests/core/structured_vespa_index/test_structured_vespa_index.py index 9d39ac952..3b4e54b3c 100644 --- a/tests/core/structured_vespa_index/test_structured_vespa_index.py +++ b/tests/core/structured_vespa_index/test_structured_vespa_index.py @@ -16,7 +16,7 @@ def setUp(self) -> None: name='my_index', schema_name='my_index', model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ diff --git a/tests/core/structured_vespa_index/test_structured_vespa_schema.py b/tests/core/structured_vespa_index/test_structured_vespa_schema.py index 4a4f45e68..5c50b3673 100644 --- a/tests/core/structured_vespa_index/test_structured_vespa_schema.py +++ b/tests/core/structured_vespa_index/test_structured_vespa_schema.py @@ -16,7 +16,7 @@ def test_generate_schema_standardIndex_successful(self): marqo_index_request = self.structured_marqo_index_request( name='my_index', model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -51,7 +51,7 @@ def test_generate_schema_oneTensorField_successful(self): marqo_index_request = self.structured_marqo_index_request( name='my_index', model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -86,7 +86,7 @@ def test_generate_schema_FourTensorFields_successful(self): marqo_index_request = self.structured_marqo_index_request( name='my_index', model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -120,7 +120,7 @@ def test_generate_schema_noLexicalFields_successful(self): marqo_index_request = self.structured_marqo_index_request( name='my_index', model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -146,7 +146,7 @@ def test_generate_schema_noScoreModifierFields_successful(self): marqo_index_request = self.structured_marqo_index_request( name='my_index', model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -172,7 +172,7 @@ def test_generate_schema_noTensorFields_successful(self): marqo_index_request = self.structured_marqo_index_request( name='my_index', model=Model(name='ViT-B/32'), - distance_metric=DistanceMetric.PrenormalizedAnguar, + distance_metric=DistanceMetric.PrenormalizedAngular, vector_numeric_type=VectorNumericType.Float, hnsw_config=HnswConfig(ef_construction=100, m=16), fields=[ @@ -191,6 +191,33 @@ def test_generate_schema_noTensorFields_successful(self): self._remove_whitespace_in_schema(actual_schema) ) + def test_generate_schema_all_distance_metrics(self): + """A test for the structured Vespa schema generation with each of the distance metrics.""" + index_name = "test_structured_schema_distance_metric" + + for distance_metric in DistanceMetric: + with self.subTest(f"Structured index with distance metric: {distance_metric.value}"): + marqo_index_request = self.structured_marqo_index_request( + name=index_name, + model=Model(name='ViT-B/32'), + distance_metric=distance_metric, # Manually set distance metric to each one. + vector_numeric_type=VectorNumericType.Float, + hnsw_config=HnswConfig(ef_construction=100, m=16), + fields=[ + FieldRequest(name='title', type=FieldType.Text), + FieldRequest(name='description', type=FieldType.Text) + ], + tensor_fields=['title', 'description'] + ) + + actual_schema, _ = StructuredVespaSchema(marqo_index_request).generate_schema() + expected_schema = self._read_schema_from_file(f'test_schemas/structured_distance_metric_{distance_metric.value}.sd') + + self.assertEqual( + self._remove_whitespace_in_schema(expected_schema), + self._remove_whitespace_in_schema(actual_schema) + ) + def _read_schema_from_file(self, path: str) -> str: currentdir = os.path.dirname(os.path.abspath(__file__)) abspath = os.path.join(currentdir, path) diff --git a/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_angular.sd b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_angular.sd new file mode 100644 index 000000000..b75a0a416 --- /dev/null +++ b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_angular.sd @@ -0,0 +1,171 @@ +schema marqo__test_00unstructured_00schema_00distance_00metric { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__strings type array{ + indexing: index + index: enable-bm25 + } + + field marqo__long_string_fields type map { + indexing: summary + } + + field marqo__short_string_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__chunks type array { + indexing: summary + } + + field marqo__vector_count type int { + indexing: attribute | summary + } + + field marqo__embeddings type tensor(p{}, x[512]) { + indexing: attribute | index | summary + attribute { + distance-metric: angular + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + } + + fieldset default { + fields: marqo__strings + } + + rank-profile embedding_similarity inherits default { + inputs { + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: closeness(field, marqo__embeddings) + } + match-features: closest(marqo__embeddings) + } + + rank-profile bm25 inherits default { + first-phase { + expression: bm25(marqo__strings) + } + } + + rank-profile modifiers inherits default { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + function modify(score) { + expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum) + } + } + + rank-profile bm25_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + first-phase { + expression: modify(bm25(marqo__strings)) + } + } + + rank-profile embedding_similarity_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: modify(closeness(field, marqo__embeddings)) + } + match-features: closest(marqo__embeddings) + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + summary marqo__embeddings type tensor(p{}, x[512]) {} + } +} \ No newline at end of file diff --git a/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_dotproduct.sd b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_dotproduct.sd new file mode 100644 index 000000000..9309d85e5 --- /dev/null +++ b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_dotproduct.sd @@ -0,0 +1,171 @@ +schema marqo__test_00unstructured_00schema_00distance_00metric { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__strings type array{ + indexing: index + index: enable-bm25 + } + + field marqo__long_string_fields type map { + indexing: summary + } + + field marqo__short_string_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__chunks type array { + indexing: summary + } + + field marqo__vector_count type int { + indexing: attribute | summary + } + + field marqo__embeddings type tensor(p{}, x[512]) { + indexing: attribute | index | summary + attribute { + distance-metric: dotproduct + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + } + + fieldset default { + fields: marqo__strings + } + + rank-profile embedding_similarity inherits default { + inputs { + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: closeness(field, marqo__embeddings) + } + match-features: closest(marqo__embeddings) + } + + rank-profile bm25 inherits default { + first-phase { + expression: bm25(marqo__strings) + } + } + + rank-profile modifiers inherits default { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + function modify(score) { + expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum) + } + } + + rank-profile bm25_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + first-phase { + expression: modify(bm25(marqo__strings)) + } + } + + rank-profile embedding_similarity_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: modify(closeness(field, marqo__embeddings)) + } + match-features: closest(marqo__embeddings) + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + summary marqo__embeddings type tensor(p{}, x[512]) {} + } +} \ No newline at end of file diff --git a/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_euclidean.sd b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_euclidean.sd new file mode 100644 index 000000000..d90e0df72 --- /dev/null +++ b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_euclidean.sd @@ -0,0 +1,171 @@ +schema marqo__test_00unstructured_00schema_00distance_00metric { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__strings type array{ + indexing: index + index: enable-bm25 + } + + field marqo__long_string_fields type map { + indexing: summary + } + + field marqo__short_string_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__chunks type array { + indexing: summary + } + + field marqo__vector_count type int { + indexing: attribute | summary + } + + field marqo__embeddings type tensor(p{}, x[512]) { + indexing: attribute | index | summary + attribute { + distance-metric: euclidean + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + } + + fieldset default { + fields: marqo__strings + } + + rank-profile embedding_similarity inherits default { + inputs { + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: closeness(field, marqo__embeddings) + } + match-features: closest(marqo__embeddings) + } + + rank-profile bm25 inherits default { + first-phase { + expression: bm25(marqo__strings) + } + } + + rank-profile modifiers inherits default { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + function modify(score) { + expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum) + } + } + + rank-profile bm25_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + first-phase { + expression: modify(bm25(marqo__strings)) + } + } + + rank-profile embedding_similarity_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: modify(closeness(field, marqo__embeddings)) + } + match-features: closest(marqo__embeddings) + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + summary marqo__embeddings type tensor(p{}, x[512]) {} + } +} \ No newline at end of file diff --git a/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_geodegrees.sd b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_geodegrees.sd new file mode 100644 index 000000000..a75160421 --- /dev/null +++ b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_geodegrees.sd @@ -0,0 +1,171 @@ +schema marqo__test_00unstructured_00schema_00distance_00metric { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__strings type array{ + indexing: index + index: enable-bm25 + } + + field marqo__long_string_fields type map { + indexing: summary + } + + field marqo__short_string_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__chunks type array { + indexing: summary + } + + field marqo__vector_count type int { + indexing: attribute | summary + } + + field marqo__embeddings type tensor(p{}, x[512]) { + indexing: attribute | index | summary + attribute { + distance-metric: geodegrees + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + } + + fieldset default { + fields: marqo__strings + } + + rank-profile embedding_similarity inherits default { + inputs { + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: closeness(field, marqo__embeddings) + } + match-features: closest(marqo__embeddings) + } + + rank-profile bm25 inherits default { + first-phase { + expression: bm25(marqo__strings) + } + } + + rank-profile modifiers inherits default { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + function modify(score) { + expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum) + } + } + + rank-profile bm25_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + first-phase { + expression: modify(bm25(marqo__strings)) + } + } + + rank-profile embedding_similarity_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: modify(closeness(field, marqo__embeddings)) + } + match-features: closest(marqo__embeddings) + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + summary marqo__embeddings type tensor(p{}, x[512]) {} + } +} \ No newline at end of file diff --git a/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_hamming.sd b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_hamming.sd new file mode 100644 index 000000000..679b8b097 --- /dev/null +++ b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_hamming.sd @@ -0,0 +1,171 @@ +schema marqo__test_00unstructured_00schema_00distance_00metric { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__strings type array{ + indexing: index + index: enable-bm25 + } + + field marqo__long_string_fields type map { + indexing: summary + } + + field marqo__short_string_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__chunks type array { + indexing: summary + } + + field marqo__vector_count type int { + indexing: attribute | summary + } + + field marqo__embeddings type tensor(p{}, x[512]) { + indexing: attribute | index | summary + attribute { + distance-metric: hamming + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + } + + fieldset default { + fields: marqo__strings + } + + rank-profile embedding_similarity inherits default { + inputs { + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: closeness(field, marqo__embeddings) + } + match-features: closest(marqo__embeddings) + } + + rank-profile bm25 inherits default { + first-phase { + expression: bm25(marqo__strings) + } + } + + rank-profile modifiers inherits default { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + function modify(score) { + expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum) + } + } + + rank-profile bm25_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + first-phase { + expression: modify(bm25(marqo__strings)) + } + } + + rank-profile embedding_similarity_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: modify(closeness(field, marqo__embeddings)) + } + match-features: closest(marqo__embeddings) + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + summary marqo__embeddings type tensor(p{}, x[512]) {} + } +} \ No newline at end of file diff --git a/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_prenormalized-angular.sd b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_prenormalized-angular.sd new file mode 100644 index 000000000..a944b932a --- /dev/null +++ b/tests/core/unstructured_vespa_index/test_schemas/unstructured_vespa_index_schema_distance_metric_prenormalized-angular.sd @@ -0,0 +1,171 @@ +schema marqo__test_00unstructured_00schema_00distance_00metric { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__strings type array{ + indexing: index + index: enable-bm25 + } + + field marqo__long_string_fields type map { + indexing: summary + } + + field marqo__short_string_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__chunks type array { + indexing: summary + } + + field marqo__vector_count type int { + indexing: attribute | summary + } + + field marqo__embeddings type tensor(p{}, x[512]) { + indexing: attribute | index | summary + attribute { + distance-metric: prenormalized-angular + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + } + + fieldset default { + fields: marqo__strings + } + + rank-profile embedding_similarity inherits default { + inputs { + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: closeness(field, marqo__embeddings) + } + match-features: closest(marqo__embeddings) + } + + rank-profile bm25 inherits default { + first-phase { + expression: bm25(marqo__strings) + } + } + + rank-profile modifiers inherits default { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + function modify(score) { + expression: if (count(query(marqo__mult_weights)) == 0, 1, reduce(query(marqo__mult_weights) * attribute(marqo__score_modifiers), prod)) * score + reduce(query(marqo__add_weights) * attribute(marqo__score_modifiers), sum) + } + } + + rank-profile bm25_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + } + first-phase { + expression: modify(bm25(marqo__strings)) + } + } + + rank-profile embedding_similarity_modifiers inherits modifiers { + inputs { + query(marqo__mult_weights) tensor(p{}) + query(marqo__add_weights) tensor(p{}) + query(embedding_query) tensor(x[512]) + } + first-phase { + expression: modify(closeness(field, marqo__embeddings)) + } + match-features: closest(marqo__embeddings) + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__strings type array {} + summary marqo__long_string_fields type map {} + summary marqo__short_string_fields type map {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks type array {} + summary marqo__embeddings type tensor(p{}, x[512]) {} + } +} \ No newline at end of file diff --git a/tests/core/unstructured_vespa_index/test_unstructured_vespa_schema.py b/tests/core/unstructured_vespa_index/test_unstructured_vespa_schema.py index 9dec8fbcd..d34dbf23b 100644 --- a/tests/core/unstructured_vespa_index/test_unstructured_vespa_schema.py +++ b/tests/core/unstructured_vespa_index/test_unstructured_vespa_schema.py @@ -3,7 +3,8 @@ from marqo.core.models.marqo_index_request import MarqoIndexRequest from marqo.core.unstructured_vespa_index.unstructured_vespa_schema import UnstructuredVespaSchema -from marqo.tensor_search.models.index_settings import IndexSettings +from marqo.tensor_search.models.index_settings import * +from marqo.core.models.marqo_index import * from tests.marqo_test import MarqoTestCase @@ -55,4 +56,34 @@ def test_unstructured_index_schema_random_model(self): self.assertEqual( self._remove_whitespace_in_schema(expected_schema), self._remove_whitespace_in_schema(generated_schema) - ) \ No newline at end of file + ) + + def test_unstructured_index_schema_all_distance_metrics(self): + """A test for the unstructured Vespa schema generation with each of the distance metrics.""" + index_name = "test_unstructured_schema_distance_metric" + + for distance_metric in DistanceMetric: + with self.subTest(f"Unstructured index with distance metric: {distance_metric.value}"): + test_marqo_index_request: MarqoIndexRequest = IndexSettings( + type="unstructured", + model="ViT-B/32", + annParameters=AnnParameters( + spaceType=distance_metric.value, # Manually set distance metric to each one. + parameters=core.HnswConfig( + efConstruction=512, + m=16 + ) + ) + ).to_marqo_index_request(index_name) + + test_unstructured_schema_object = UnstructuredVespaSchema(test_marqo_index_request) + + generated_schema, _ = test_unstructured_schema_object.generate_schema() + + expected_schema = self._read_schema_from_file( + f'test_schemas/unstructured_vespa_index_schema_distance_metric_{distance_metric.value}.sd') + + self.assertEqual( + self._remove_whitespace_in_schema(expected_schema), + self._remove_whitespace_in_schema(generated_schema) + ) \ No newline at end of file