Skip to content

Commit

Permalink
Unstructured index distance metric fix (#772)
Browse files Browse the repository at this point in the history
  • Loading branch information
vicilliar authored Feb 28, 2024
1 parent 48b0ac2 commit e99b549
Show file tree
Hide file tree
Showing 22 changed files with 1,507 additions and 32 deletions.
2 changes: 1 addition & 1 deletion src/marqo/core/models/marqo_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class DistanceMetric(Enum):
Euclidean = 'euclidean'
Angular = 'angular'
DotProduct = 'dotproduct'
PrenormalizedAnguar = 'prenormalized-angular'
PrenormalizedAngular = 'prenormalized-angular'
Geodegrees = 'geodegrees'
Hamming = 'hamming'

Expand Down
16 changes: 0 additions & 16 deletions src/marqo/core/structured_vespa_index/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,3 @@

SUMMARY_ALL_NON_VECTOR = 'all-non-vector-summary'
SUMMARY_ALL_VECTOR = 'all-vector-summary'

# Lookup table from each Marqo DistanceMetric enum member to the string that
# get_distance_metric() returns for it. Members missing from this table are
# reported by get_distance_metric() as unknown metrics.
_DISTANCE_METRIC_MAP = {
DistanceMetric.Euclidean: 'euclidean',
DistanceMetric.Angular: 'angular',
DistanceMetric.DotProduct: 'dotproduct',
DistanceMetric.PrenormalizedAnguar: 'prenormalized-angular',
DistanceMetric.Geodegrees: 'geodegrees',
DistanceMetric.Hamming: 'hamming'
}


def get_distance_metric(marqo_distance_metric: DistanceMetric) -> str:
    """
    Translate a Marqo distance metric into its schema distance-metric string.

    Args:
        marqo_distance_metric: Metric to look up in ``_DISTANCE_METRIC_MAP``.

    Returns:
        The string mapped to the given metric.

    Raises:
        ValueError: If the metric has no entry in ``_DISTANCE_METRIC_MAP``.
    """
    try:
        return _DISTANCE_METRIC_MAP[marqo_distance_metric]
    except KeyError as err:
        # Chain explicitly so the traceback shows the failed lookup as the
        # direct cause instead of an error "during handling" of another one.
        raise ValueError(f'Unknown Marqo distance metric: {marqo_distance_metric}') from err
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def _generate_document_section(self, schema_name: str) -> (List[str], Structured
document.append(f'field {embedding_field_name} type tensor<float>(p{{}}, x[{model_dim}]) {{')
document.append('indexing: attribute | index | summary')
document.append(
f'attribute {{ distance-metric: {common.get_distance_metric(self._index_request.distance_metric)} }}')
f'attribute {{ distance-metric: {self._get_distance_metric(self._index_request.distance_metric)} }}')
document.append('index { hnsw {')
document.append(f'max-links-per-node: {self._index_request.hnsw_config.m}')
document.append(f'neighbors-to-explore-at-insert: {self._index_request.hnsw_config.ef_construction}')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def _generate_unstructured_schema(cls, marqo_index: UnstructuredMarqoIndex) -> s
field {cls._EMBEDDINGS} type tensor<float>(p{{}}, x[{dimension}]) {{
indexing: attribute | index | summary
attribute {{
distance-metric: prenormalized-angular
distance-metric: {cls._get_distance_metric(cls, marqo_index.distance_metric)}
}}
index {{
hnsw {{
Expand Down
17 changes: 17 additions & 0 deletions src/marqo/core/vespa_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from marqo.core.models.marqo_index_request import MarqoIndexRequest, StructuredMarqoIndexRequest, \
UnstructuredMarqoIndexRequest

from marqo.core.models.marqo_index import *


class VespaSchema(ABC):
"""
Expand All @@ -16,6 +18,21 @@ class VespaSchema(ABC):
'-': '_01',
}

# Lookup table from each Marqo DistanceMetric enum member to the string that
# _get_distance_metric() returns for it. Members missing from this table are
# reported by _get_distance_metric() as unknown metrics.
_DISTANCE_METRIC_MAP = {
DistanceMetric.Euclidean: 'euclidean',
DistanceMetric.Angular: 'angular',
DistanceMetric.DotProduct: 'dotproduct',
DistanceMetric.PrenormalizedAngular: 'prenormalized-angular',
DistanceMetric.Geodegrees: 'geodegrees',
DistanceMetric.Hamming: 'hamming'
}

def _get_distance_metric(self, marqo_distance_metric: DistanceMetric) -> str:
    """
    Translate a Marqo distance metric into its schema distance-metric string.

    The only state read from ``self`` is ``_DISTANCE_METRIC_MAP``.

    Args:
        marqo_distance_metric: Metric to look up in ``_DISTANCE_METRIC_MAP``.

    Returns:
        The string mapped to the given metric.

    Raises:
        ValueError: If the metric has no entry in ``_DISTANCE_METRIC_MAP``.
    """
    try:
        return self._DISTANCE_METRIC_MAP[marqo_distance_metric]
    except KeyError as err:
        # Chain explicitly so the traceback shows the failed lookup as the
        # direct cause instead of an error "during handling" of another one.
        raise ValueError(f'Unknown Marqo distance metric: {marqo_distance_metric}') from err

@abstractmethod
def generate_schema(self) -> (str, MarqoIndex):
"""
Expand Down
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/models/index_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class IndexSettings(StrictBaseModel):
)
vectorNumericType: core.VectorNumericType = core.VectorNumericType.Float
annParameters: AnnParameters = AnnParameters(
spaceType=core.DistanceMetric.PrenormalizedAnguar,
spaceType=core.DistanceMetric.PrenormalizedAngular,
parameters=core.HnswConfig(
efConstruction=512,
m=16
Expand Down
6 changes: 3 additions & 3 deletions tests/core/index_management/test_index_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def test_create_index_settingsSchemaDoesNotExist_successful(self):
marqo_index_request = self.structured_marqo_index_request(
name=index_name,
model=Model(name='ViT-B/32'),
distance_metric=DistanceMetric.PrenormalizedAnguar,
distance_metric=DistanceMetric.PrenormalizedAngular,
vector_numeric_type=VectorNumericType.Float,
hnsw_config=HnswConfig(ef_construction=100, m=16),
fields=[
Expand Down Expand Up @@ -156,7 +156,7 @@ def test_create_index_settingsSchemaExists_successful(self):
marqo_index_request = self.structured_marqo_index_request(
name=index_name_1,
model=Model(name='ViT-B/32'),
distance_metric=DistanceMetric.PrenormalizedAnguar,
distance_metric=DistanceMetric.PrenormalizedAngular,
vector_numeric_type=VectorNumericType.Float,
hnsw_config=HnswConfig(ef_construction=100, m=16),
fields=[
Expand Down Expand Up @@ -203,7 +203,7 @@ def test_create_index_indexExists_fails(self):
marqo_index_request = self.structured_marqo_index_request(
name=index_name,
model=Model(name='ViT-B/32'),
distance_metric=DistanceMetric.PrenormalizedAnguar,
distance_metric=DistanceMetric.PrenormalizedAngular,
vector_numeric_type=VectorNumericType.Float,
hnsw_config=HnswConfig(ef_construction=100, m=16),
fields=[
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
schema marqo__test_00structured_00schema_00distance_00metric {
document {
field marqo__id type string {
indexing: attribute | summary
attribute: fast-search
rank: filter
}
field title type string {
indexing: summary
}
field description type string {
indexing: summary
}
field marqo__chunks_title type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_title type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: angular }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__chunks_description type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_description type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: angular }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__vector_count type int { indexing: attribute | summary }
}
rank-profile embedding_similarity inherits default {
inputs {
query(marqo__query_embedding) tensor<float>(x[512])
query(title): 0
query(description): 0
}
first-phase {
expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0))
}
match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description)
}
document-summary all-non-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
}
document-summary all-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
summary marqo__embeddings_title type tensor<float>(p{}, x[512]) { }
summary marqo__embeddings_description type tensor<float>(p{}, x[512]) { }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
schema marqo__test_00structured_00schema_00distance_00metric {
document {
field marqo__id type string {
indexing: attribute | summary
attribute: fast-search
rank: filter
}
field title type string {
indexing: summary
}
field description type string {
indexing: summary
}
field marqo__chunks_title type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_title type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: dotproduct }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__chunks_description type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_description type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: dotproduct }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__vector_count type int { indexing: attribute | summary }
}
rank-profile embedding_similarity inherits default {
inputs {
query(marqo__query_embedding) tensor<float>(x[512])
query(title): 0
query(description): 0
}
first-phase {
expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0))
}
match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description)
}
document-summary all-non-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
}
document-summary all-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
summary marqo__embeddings_title type tensor<float>(p{}, x[512]) { }
summary marqo__embeddings_description type tensor<float>(p{}, x[512]) { }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
schema marqo__test_00structured_00schema_00distance_00metric {
document {
field marqo__id type string {
indexing: attribute | summary
attribute: fast-search
rank: filter
}
field title type string {
indexing: summary
}
field description type string {
indexing: summary
}
field marqo__chunks_title type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_title type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: euclidean }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__chunks_description type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_description type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: euclidean }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__vector_count type int { indexing: attribute | summary }
}
rank-profile embedding_similarity inherits default {
inputs {
query(marqo__query_embedding) tensor<float>(x[512])
query(title): 0
query(description): 0
}
first-phase {
expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0))
}
match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description)
}
document-summary all-non-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
}
document-summary all-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
summary marqo__embeddings_title type tensor<float>(p{}, x[512]) { }
summary marqo__embeddings_description type tensor<float>(p{}, x[512]) { }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
schema marqo__test_00structured_00schema_00distance_00metric {
document {
field marqo__id type string {
indexing: attribute | summary
attribute: fast-search
rank: filter
}
field title type string {
indexing: summary
}
field description type string {
indexing: summary
}
field marqo__chunks_title type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_title type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: geodegrees }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__chunks_description type array<string> {
indexing: attribute | summary
}
field marqo__embeddings_description type tensor<float>(p{}, x[512]) {
indexing: attribute | index | summary
attribute { distance-metric: geodegrees }
index { hnsw {
max-links-per-node: 16
neighbors-to-explore-at-insert: 100
}}
}
field marqo__vector_count type int { indexing: attribute | summary }
}
rank-profile embedding_similarity inherits default {
inputs {
query(marqo__query_embedding) tensor<float>(x[512])
query(title): 0
query(description): 0
}
first-phase {
expression: max(if(query(title) > 0, closeness(field, marqo__embeddings_title), 0), if(query(description) > 0, closeness(field, marqo__embeddings_description), 0))
}
match-features: closest(marqo__embeddings_title) closest(marqo__embeddings_description) distance(field, marqo__embeddings_title) distance(field, marqo__embeddings_description)
}
document-summary all-non-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
}
document-summary all-vector-summary {
summary marqo__id type string { }
summary title type string { source: title }
summary description type string { source: description }
summary marqo__chunks_title type array<string> { }
summary marqo__chunks_description type array<string> { }
summary marqo__embeddings_title type tensor<float>(p{}, x[512]) { }
summary marqo__embeddings_description type tensor<float>(p{}, x[512]) { }
}
}
Loading

0 comments on commit e99b549

Please sign in to comment.