diff --git a/src/marqo/api/configs.py b/src/marqo/api/configs.py index b5eeec762..bdc5a1345 100644 --- a/src/marqo/api/configs.py +++ b/src/marqo/api/configs.py @@ -52,6 +52,7 @@ def default_env_vars() -> dict: EnvVars.MARQO_INFERENCE_CACHE_TYPE: "LRU", EnvVars.MARQO_BEST_AVAILABLE_DEVICE: "cpu", # on_start_script will determine this. EnvVars.MARQO_MAX_TENSOR_FIELD_COUNT_UNSTRUCTURED: 100, + EnvVars.MARQO_MAX_STRING_ARRAY_FIELD_COUNT_UNSTRUCTURED: 100, EnvVars.MARQO_MAX_LEXICAL_FIELD_COUNT_UNSTRUCTURED: 100, EnvVars.MARQO_INDEX_DEPLOYMENT_LOCK_TIMEOUT: 5, # index operations acquire this distributed lock with a timeout EnvVars.ZOOKEEPER_CONNECTION_TIMEOUT: 15, diff --git a/src/marqo/core/constants.py b/src/marqo/core/constants.py index fb586e6a0..4d91a8586 100644 --- a/src/marqo/core/constants.py +++ b/src/marqo/core/constants.py @@ -6,6 +6,7 @@ MARQO_DOC_CHUNKS = 'chunks' MARQO_DOC_EMBEDDINGS = 'embeddings' MARQO_DOC_ID = '_id' +MARQO_FIELD_TYPES = "field_types" MARQO_SEARCH_METHOD_TENSOR = 'tensor' MARQO_SEARCH_METHOD_LEXICAL = 'lexical' @@ -29,4 +30,4 @@ QUERY_INPUT_SCORE_MODIFIERS_ADD_WEIGHTS_TENSOR = 'marqo__add_weights_tensor' QUERY_INPUT_SCORE_MODIFIERS_MULT_WEIGHTS_GLOBAL = 'marqo__mult_weights_global' QUERY_INPUT_SCORE_MODIFIERS_ADD_WEIGHTS_GLOBAL = 'marqo__add_weights_global' -MARQO_GLOBAL_SCORE_MODIFIERS = 'global' +MARQO_GLOBAL_SCORE_MODIFIERS = 'global' \ No newline at end of file diff --git a/src/marqo/core/document/document.py b/src/marqo/core/document/document.py index 6cb36a460..7e7e23ee2 100644 --- a/src/marqo/core/document/document.py +++ b/src/marqo/core/document/document.py @@ -1,25 +1,31 @@ from timeit import default_timer as timer from typing import Dict, List, Tuple, Optional +import semver + import marqo.api.exceptions as api_exceptions from marqo.core.constants import MARQO_DOC_ID from marqo.core.models.add_docs_params import AddDocsParams -from marqo.core.exceptions import UnsupportedFeatureError, ParsingError, InternalError +from 
marqo.core.exceptions import UnsupportedFeatureError, ParsingError, InternalError, MarqoDocumentParsingError from marqo.core.index_management.index_management import IndexManagement from marqo.core.models.marqo_add_documents_response import MarqoAddDocumentsResponse, MarqoAddDocumentsItem from marqo.core.models.marqo_index import IndexType, SemiStructuredMarqoIndex, StructuredMarqoIndex, \ UnstructuredMarqoIndex from marqo.core.models.marqo_update_documents_response import MarqoUpdateDocumentsResponse, MarqoUpdateDocumentsItem +from marqo.core.semi_structured_vespa_index.common import SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION, \ + VESPA_FIELD_ID, INT_FIELDS, FLOAT_FIELDS, VESPA_DOC_FIELD_TYPES, VESPA_DOC_CREATE_TIMESTAMP from marqo.core.semi_structured_vespa_index.semi_structured_add_document_handler import \ SemiStructuredAddDocumentsHandler, SemiStructuredFieldCountConfig from marqo.core.structured_vespa_index.structured_add_document_handler import StructuredAddDocumentsHandler from marqo.core.unstructured_vespa_index.unstructured_add_document_handler import UnstructuredAddDocumentsHandler from marqo.core.vespa_index.vespa_index import for_marqo_index as vespa_index_factory from marqo.logging import get_logger +from marqo.tensor_search.telemetry import RequestMetricsStore from marqo.vespa.models import UpdateDocumentsBatchResponse, VespaDocument from marqo.vespa.models.delete_document_response import DeleteAllDocumentsResponse from marqo.vespa.models.feed_response import FeedBatchResponse from marqo.vespa.vespa_client import VespaClient +from marqo.version import get_version logger = get_logger(__name__) @@ -95,7 +101,7 @@ def partial_update_documents(self, partial_documents: List[Dict], marqo_index) \ If the document does not exist, this document will error out and the error will be returned in the response. 
Args: - partial_documents: A list of documents to partially update + partial_documents: A list of documents to partially update received in the request marqo_index: The index object to partially update documents in Raises: @@ -104,11 +110,15 @@ def partial_update_documents(self, partial_documents: List[Dict], marqo_index) \ Return: MarqoUpdateDocumentsResponse containing the response of the partial update operation """ - if marqo_index.type in [IndexType.Unstructured, IndexType.SemiStructured]: + if marqo_index.type is IndexType.Unstructured: raise UnsupportedFeatureError("Partial document update is not supported for unstructured indexes. " "Please use add_documents with use_existing_tensor=True instead") - elif marqo_index.type == IndexType.Structured: + elif marqo_index.type is IndexType.Structured: pass + elif marqo_index.type is IndexType.SemiStructured: + if marqo_index.parsed_marqo_version() < SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION: # Partial updates for semi-structured indexes are only supported for Marqo version >= 2.16.0 + raise UnsupportedFeatureError("Partial document update is not supported for this index version. 
" + "Please upgrade the index version, or create a new index to use this feature.") else: raise ValueError(f"Invalid index type: {marqo_index.type}") @@ -118,24 +128,37 @@ def partial_update_documents(self, partial_documents: List[Dict], marqo_index) \ unsuccessful_docs: List[Tuple[int, MarqoUpdateDocumentsItem]] = [] # Remove duplicated documents based on _id - partial_documents, _ = self.remove_duplicated_documents(partial_documents) + partial_documents, doc_ids, documents_that_contain_maps = self.process_documents(partial_documents, + unsuccessful_docs, is_index_semi_structured=marqo_index.type is IndexType.SemiStructured) + existing_vespa_documents = {} + + if marqo_index.type is IndexType.SemiStructured and documents_that_contain_maps: # Only retrieve the document back if the partial update request contains maps and the index is semi-structured + get_batch_response = self.vespa_client.get_batch(ids = list(documents_that_contain_maps), fields = [ + VESPA_FIELD_ID, INT_FIELDS, FLOAT_FIELDS, VESPA_DOC_FIELD_TYPES, VESPA_DOC_CREATE_TIMESTAMP], schema = marqo_index.schema_name) + responses = get_batch_response.responses + for resp in responses: + existing_vespa_documents[resp.document.fields[VESPA_FIELD_ID]] = resp.document.dict() for index, doc in enumerate(partial_documents): try: - vespa_document = VespaDocument(**vespa_index.to_vespa_partial_document(doc)) + vespa_document = VespaDocument(**vespa_index.to_vespa_partial_document(doc, existing_vespa_documents.get(doc.get(MARQO_DOC_ID, ''), None))) vespa_documents.append(vespa_document) except ParsingError as e: unsuccessful_docs.append( (index, MarqoUpdateDocumentsItem(id=doc.get(MARQO_DOC_ID, ''), error=e.message, status=int(api_exceptions.InvalidArgError.status_code)))) - vespa_res: UpdateDocumentsBatchResponse = ( - self.vespa_client.update_documents_batch(vespa_documents, - marqo_index.schema_name, - vespa_id_field=vespa_index.get_vespa_id_field())) + with 
RequestMetricsStore.for_request().time("partial_update.vespa._bulk"): + vespa_res: UpdateDocumentsBatchResponse = ( + self.vespa_client.update_documents_batch(vespa_documents, + marqo_index.schema_name, + vespa_id_field=vespa_index.get_vespa_id_field())) + + with RequestMetricsStore.for_request().time("partial_update.postprocess"): + result = self._translate_update_document_response(vespa_res, unsuccessful_docs, + marqo_index.name, start_time) - return self._translate_update_document_response(vespa_res, unsuccessful_docs, - marqo_index.name, start_time) + return result def _translate_update_document_response(self, responses: UpdateDocumentsBatchResponse, unsuccessful_docs: List, index_name: str, start_time) \ @@ -170,34 +193,76 @@ def _translate_update_document_response(self, responses: UpdateDocumentsBatchRes return MarqoUpdateDocumentsResponse(errors=errors, index_name=index_name, items=items, processingTimeMs=(timer() - start_time) * 1000) - def remove_duplicated_documents(self, documents: List) -> Tuple[List, set]: - """Remove duplicated documents based on _id in the given list of documents. + def process_documents(self, documents: List[Dict], unsuccessful_docs: List[Tuple[int, MarqoUpdateDocumentsItem]], + is_index_semi_structured = False) -> Tuple[List, set, set]: + """Process documents to remove duplicates and identify documents containing maps. + + This method combines duplicate removal and map detection into a single pass through + the documents for better efficiency. - For a list of documents, if there exists duplicate _id, the last document will be used while the - previous ones will be removed from the list. - - This function does not validate the documents, it only removes the duplicates based on _id fields. 
+ Args: + is_index_semi_structured: Variable denoting if the index that is currently being processed is of type SemiStructured + unsuccessful_docs: A list of documents which were processed unsuccessfully + documents: List of document dictionaries to process + + Returns: + Tuple containing: + - List of deduplicated documents + - Set of unique document IDs + - Set of document IDs that contain dictionary values """ - # Deduplicate docs, keep the latest docs = [] doc_ids = set() + documents_with_maps = set() + + # Process documents in reverse to keep latest version of duplicates for i in range(len(documents) - 1, -1, -1): doc = documents[i] - - if isinstance(doc, dict) and '_id' in doc: - doc_id = doc['_id'] - try: - if doc_id is not None and doc_id in doc_ids: - logger.debug(f'Duplicate document ID {doc_id} found, keeping the latest') - continue - doc_ids.add(doc_id) - except TypeError as e: # Happens if ID is a non-hashable type -- ID validation will catch this later on - logger.debug(f'Could not hash document ID {doc_id}: {e}') - - docs.append(doc) - # Reverse to preserve order in request + + if not isinstance(doc, dict) or '_id' not in doc: + docs.append(doc) + continue + + doc_id = doc['_id'] + + try: + # Skip if we've already seen this ID + if doc_id is not None and doc_id in doc_ids: + logger.debug(f'Duplicate document ID {doc_id} found, keeping the latest') + continue + + # Check for dictionary values while processing doc to populate the documents_with_maps set. + # Only do it in case of semi-structured indexes. + if is_index_semi_structured: + for field_name, field_value in doc.items(): + if isinstance(field_value, dict): + if len(field_value) == 0: # If the dictionary is empty, get back the document so that we can update the doc with an empty dictionary (i.e remove the map from the doc).
+ documents_with_maps.add(doc_id) + else: + for key, val in field_value.items(): + if isinstance(val, (int, float)): + documents_with_maps.add(doc_id) + break + else: + raise MarqoDocumentParsingError( + f'Unsupported field type {type(val)} for field {field_name} in doc {doc_id}. We only support int and float types for map values when updating a document.' + ) + break + doc_ids.add(doc_id) + docs.append(doc) + + except TypeError as e: + logger.debug(f'Could not hash document ID {doc_id}: {e}') + docs.append(doc) + + except MarqoDocumentParsingError as e: + unsuccessful_docs.append((i, MarqoUpdateDocumentsItem(id=doc.get(MARQO_DOC_ID, ''), + error=e.message, + status=int(api_exceptions.InvalidArgError.status_code)))) + + # Reverse to preserve original order docs.reverse() - return docs, doc_ids + return docs, doc_ids, documents_with_maps def translate_add_documents_response(self, responses: Optional[FeedBatchResponse], index_name: str, diff --git a/src/marqo/core/index_management/index_management.py b/src/marqo/core/index_management/index_management.py index 64c9b6b2e..26c85e92c 100644 --- a/src/marqo/core/index_management/index_management.py +++ b/src/marqo/core/index_management/index_management.py @@ -231,7 +231,8 @@ def is_subset(dict_a, dict_b): return all(k in dict_b and dict_b[k] == v for k, v in dict_a.items()) if (is_subset(marqo_index.tensor_field_map, existing_index.tensor_field_map) and - is_subset(marqo_index.field_map, existing_index.field_map)): + is_subset(marqo_index.field_map, existing_index.field_map) and + is_subset(marqo_index.name_to_string_array_field_map, existing_index.name_to_string_array_field_map)): logger.debug(f'Another thread has updated the index {marqo_index.name} already.') return diff --git a/src/marqo/core/models/marqo_index.py b/src/marqo/core/models/marqo_index.py index 0b024961c..59f71735e 100644 --- a/src/marqo/core/models/marqo_index.py +++ b/src/marqo/core/models/marqo_index.py @@ -98,6 +98,12 @@ def check_all_fields(cls, 
values): return values +class StringArrayField(ImmutableStrictBaseModel): + name: str + type: FieldType + string_array_field_name: Optional[str] + features: List[FieldFeature] = [] + class TensorField(ImmutableStrictBaseModel): """ @@ -505,6 +511,7 @@ class SemiStructuredMarqoIndex(UnstructuredMarqoIndex): type: IndexType = IndexType.SemiStructured lexical_fields: List[Field] tensor_fields: List[TensorField] + string_array_fields: Optional[List[StringArrayField]] # This is required so that when saving a document containing string array fields, we can make changes to the schema on the fly. Ref: https://github.com/marqo-ai/marqo/blob/cfea70adea7039d1586c94e36adae8e66cabe306/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template_2_16.sd.jinja2#L83 def __init__(self, **data): super().__init__(**data) @@ -519,8 +526,32 @@ def field_map(self) -> Dict[str, Field]: A map from field name to the field. """ return self._cache_or_get('field_map', - lambda: {field.name: field for field in self.lexical_fields} - ) + lambda: {field.name: field for field in self.lexical_fields}) + + @property + def name_to_string_array_field_map(self): + """ + A map from a StringArrayField object's "name" property to corresponding StringArrayField object. + "Name" is the name of the StringArrayField object, which is passed by the user. It does not start with marqo__string_array prefix. + + Returns an empty dict if string_array_fields is None. + """ + + return self._cache_or_get('name_to_string_array_field_map', + lambda : {} if self.string_array_fields is None + else {field.name: field for field in self.string_array_fields}) + + @property + def string_array_field_name_to_string_array_field_map(self): + """ + A map from a StringArrayField object's "string_array_field_name" property to corresponding StringArrayField object.
+ A "string_array_field_name" is the name of a StringArrayField object, which is used in the index schema, and it starts with marqo__string_array prefix. + + Returns an empty dict if string_array_fields is None. + """ + return self._cache_or_get('string_array_field_map', + lambda : {} if self.string_array_fields is None + else {field.string_array_field_name: field for field in self.string_array_fields}) @property def lexical_field_map(self) -> Dict[str, Field]: diff --git a/src/marqo/core/semi_structured_vespa_index/common.py b/src/marqo/core/semi_structured_vespa_index/common.py index 40f273496..610ea4f42 100644 --- a/src/marqo/core/semi_structured_vespa_index/common.py +++ b/src/marqo/core/semi_structured_vespa_index/common.py @@ -1,3 +1,5 @@ +import semver + VESPA_FIELD_ID = "marqo__id" STRINGS = "marqo__strings" SHORT_STRINGS_FIELDS = "marqo__short_string_fields" @@ -20,6 +22,10 @@ MARQO_DOC_MULTIMODAL_PARAMS = "multimodal_params" VESPA_DOC_MULTIMODAL_PARAMS = "marqo__multimodal_params" +# A metadata field that's used to store a dictionary of key-value pairs where key is the field name and value is a string denoting the field type +VESPA_DOC_FIELD_TYPES = "marqo__field_types" +VESPA_DOC_CREATE_TIMESTAMP = "marqo__create_timestamp" + SUMMARY_ALL_NON_VECTOR = 'all-non-vector-summary' SUMMARY_ALL_VECTOR = 'all-vector-summary' @@ -50,4 +56,6 @@ QUERY_INPUT_HYBRID_FIELDS_TO_RANK_TENSOR = "marqo__fields_to_rank_tensor" VESPA_DOC_HYBRID_RAW_TENSOR_SCORE = 'marqo__raw_tensor_score' -VESPA_DOC_HYBRID_RAW_LEXICAL_SCORE = 'marqo__raw_lexical_score' \ No newline at end of file +VESPA_DOC_HYBRID_RAW_LEXICAL_SCORE = 'marqo__raw_lexical_score' + +SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION = semver.VersionInfo.parse("2.16.0") #Denotes the Marqo version from which partial update support for semi-structured index was added \ No newline at end of file diff --git a/src/marqo/core/semi_structured_vespa_index/marqo_field_types.py
b/src/marqo/core/semi_structured_vespa_index/marqo_field_types.py new file mode 100644 index 000000000..36c832c28 --- /dev/null +++ b/src/marqo/core/semi_structured_vespa_index/marqo_field_types.py @@ -0,0 +1,15 @@ +from enum import Enum + + +class MarqoFieldTypes(Enum): + """ + Enum class for Marqo field types. Used to specify the type of field in a Marqo index. + """ + BOOL = 'bool' + INT_MAP = 'int_map_entry' + FLOAT_MAP = 'float_map_entry' + INT = 'int' + FLOAT = 'float' + STRING_ARRAY = 'string_array' + STRING = 'string' + TENSOR = 'tensor' diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py index 00ee8b697..4f3a5d5b0 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py @@ -8,7 +8,9 @@ from marqo.core.exceptions import TooManyFieldsError from marqo.core.models.add_docs_params import AddDocsParams from marqo.core.index_management.index_management import IndexManagement -from marqo.core.models.marqo_index import SemiStructuredMarqoIndex, Field, FieldType, FieldFeature, TensorField +from marqo.core.models.marqo_index import SemiStructuredMarqoIndex, Field, FieldType, FieldFeature, TensorField, \ + StringArrayField +from marqo.core.semi_structured_vespa_index.common import SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION from marqo.core.semi_structured_vespa_index.semi_structured_vespa_index import SemiStructuredVespaIndex from marqo.core.semi_structured_vespa_index.semi_structured_vespa_schema import SemiStructuredVespaSchema from marqo.core.unstructured_vespa_index.unstructured_add_document_handler import UnstructuredAddDocumentsHandler @@ -26,7 +28,8 @@ class SemiStructuredFieldCountConfig(ImmutableStrictBaseModel): EnvVars.MARQO_MAX_LEXICAL_FIELD_COUNT_UNSTRUCTURED)) max_tensor_field_count: int = 
pydantic.Field(default_factory=lambda: read_env_vars_and_defaults_ints( EnvVars.MARQO_MAX_TENSOR_FIELD_COUNT_UNSTRUCTURED)) - + max_string_array_field_count: int = pydantic.Field(default_factory=lambda: read_env_vars_and_defaults_ints( + EnvVars.MARQO_MAX_STRING_ARRAY_FIELD_COUNT_UNSTRUCTURED)) class SemiStructuredAddDocumentsHandler(UnstructuredAddDocumentsHandler): def __init__(self, marqo_index: SemiStructuredMarqoIndex, add_docs_params: AddDocsParams, @@ -40,10 +43,30 @@ def __init__(self, marqo_index: SemiStructuredMarqoIndex, add_docs_params: AddDo self.field_count_config = field_count_config def _handle_field(self, marqo_doc, field_name, field_content): + """Handle a field in a Marqo document by processing it and updating the index schema if needed. + + Args: + marqo_doc: The Marqo document being processed + field_name: Name of the field + field_content: Content of the field + """ + # Process field using parent class handler super()._handle_field(marqo_doc, field_name, field_content) + + # Add lexical field if content is a string if isinstance(marqo_doc[field_name], str): self._add_lexical_field_to_index(field_name) + # Add string array field if content is list of strings and index version supports it + is_string_array = ( + isinstance(field_content, list) and + all(isinstance(elem, str) for elem in field_content) + ) + if (is_string_array and + self.marqo_index.parsed_marqo_version() >= SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION): #This is required so that we can update schema on the fly + self._add_string_array_field_to_index(field_name) + + def _to_vespa_doc(self, doc: Dict[str, Any]) -> VespaDocument: doc_tensor_fields = self.tensor_fields_container.get_tensor_field_content(doc[MARQO_DOC_ID]) processed_tensor_fields = dict() @@ -56,6 +79,7 @@ def _to_vespa_doc(self, doc: Dict[str, Any]) -> VespaDocument: if processed_tensor_fields: doc[constants.MARQO_DOC_TENSORS] = processed_tensor_fields + # doc here is Dict[str, Any], which will be converted 
to a VespaDocument return VespaDocument(**self.vespa_index.to_vespa_document(marqo_document=doc)) def _pre_persist_to_vespa(self): @@ -90,6 +114,25 @@ def _add_lexical_field_to_index(self, field_name): self.marqo_index.clear_cache() self.should_update_index = True + def _add_string_array_field_to_index(self, field_name): + if field_name in self.marqo_index.name_to_string_array_field_map: + return + + max_string_array_field_count = self.field_count_config.max_string_array_field_count + if len(self.marqo_index.string_array_fields) >= max_string_array_field_count: + raise TooManyFieldsError(f'Index {self.marqo_index.name} has {len(self.marqo_index.string_array_fields)} ' + f'string array fields. Your request to add {field_name} as a string array field is ' + f'rejected since it exceeds the limit of {max_string_array_field_count}. Please set ' + f'a larger limit in MARQO_MAX_STRING_ARRAY_FIELD_COUNT_UNSTRUCTURED environment variable.') + + logger.debug(f'Adding string array field {field_name} to index {self.marqo_index.name}') + + self.marqo_index.string_array_fields.append( + StringArrayField(name = field_name, type = FieldType.ArrayText, string_array_field_name = f'{SemiStructuredVespaSchema.FIELD_STRING_ARRAY_PREFIX}{field_name}', features=[FieldFeature.Filter]) + ) + self.marqo_index.clear_cache() + self.should_update_index = True + def _add_tensor_field_to_index(self, field_name): if field_name in self.marqo_index.tensor_field_map: return diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py index dd9579999..0c0974aeb 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_document.py @@ -1,5 +1,6 @@ import json -from typing import List, Dict, Any +import time +from typing import List, Dict, Any, Union from pydantic import Field @@ -7,25 +8,30 @@ from marqo.core import 
constants as index_constants, constants from marqo.core.exceptions import VespaDocumentParsingError, MarqoDocumentParsingError, InvalidFieldNameError, \ InvalidTensorFieldError -from marqo.core.models.marqo_index import SemiStructuredMarqoIndex +from marqo.core.models.marqo_index import SemiStructuredMarqoIndex, logger from marqo.core.semi_structured_vespa_index import common +from marqo.core.semi_structured_vespa_index.common import VESPA_DOC_FIELD_TYPES, STRING_ARRAY +from marqo.core.semi_structured_vespa_index.marqo_field_types import MarqoFieldTypes +from marqo.core.unstructured_vespa_index.common import MARQO_DOC_MULTIMODAL_PARAMS, MARQO_DOC_MULTIMODAL_PARAMS_WEIGHTS class SemiStructuredVespaDocumentFields(MarqoBaseModel): """A class with fields that are common to all Vespa documents.""" marqo__id: str = Field(alias=common.VESPA_FIELD_ID) + create_timestamp: float = Field(default_factory=time.time, alias=common.VESPA_DOC_CREATE_TIMESTAMP) short_string_fields: Dict[str, str] = Field(default_factory=dict, alias=common.SHORT_STRINGS_FIELDS) - string_arrays: List[str] = Field(default_factory=list, alias=common.STRING_ARRAY) + string_arrays: List[str] = Field(default_factory=list, alias=common.STRING_ARRAY) # Indexes created pre marqo version 2.16 will have string arrays stored as a list of strings int_fields: Dict[str, int] = Field(default_factory=dict, alias=common.INT_FIELDS) bool_fields: Dict[str, int] = Field(default_factory=dict, alias=common.BOOL_FIELDS) float_fields: Dict[str, float] = Field(default_factory=dict, alias=common.FLOAT_FIELDS) score_modifiers_fields: Dict[str, Any] = Field(default_factory=dict, alias=common.SCORE_MODIFIERS) - vespa_multimodal_params: Dict[str, str] = Field(default_factory=str, alias=common.VESPA_DOC_MULTIMODAL_PARAMS) + vespa_multimodal_params: Dict[str, str] = Field(default_factory=dict, alias=common.VESPA_DOC_MULTIMODAL_PARAMS) + field_types: Dict[str, str] = Field(default_factory=dict, alias=common.VESPA_DOC_FIELD_TYPES) 
class SemiStructuredVespaDocument(MarqoBaseModel): - """A helper class to handle the conversion between Vespa and Marqo documents for an semi-structured index. + """A helper class to handle the conversion between Vespa and Marqo documents for a semi-structured index. The object can be instantiated from a Marqo document using the from_marqo_document method, or can be instantiated from a Vespa document using the from_vespa_document method. """ @@ -35,6 +41,8 @@ class SemiStructuredVespaDocument(MarqoBaseModel): tensor_fields: dict = Field(default_factory=dict) vector_counts: int = Field(default=0, alias=common.FIELD_VECTOR_COUNT) match_features: Dict[str, Any] = Field(default_factory=dict, alias=common.VESPA_DOC_MATCH_FEATURES) + string_array_fields: Dict[str, List[str]] = Field(default_factory=dict) + index_supports_partial_updates: bool = False # For hybrid search raw_tensor_score: float = None @@ -52,23 +60,96 @@ def from_vespa_document(cls, document: Dict, marqo_index: SemiStructuredMarqoInd fields = document.get(cls._VESPA_DOC_FIELDS, {}) tensor_fields = {} text_fields = {} - for field_name in fields: + string_arrays_dict = {} + string_arrays_list = [] + + def process_field(field_name: str, fields: Dict) -> None: + """Helper function to process individual tensor fields, lexical fields and populate the appropriate dictionaries""" if field_name in marqo_index.tensor_subfield_map: tensor_fields[field_name] = fields[field_name] - elif field_name in marqo_index.lexical_field_map: # lexical fields are returned with prefixed name from get_by_ids + elif field_name in marqo_index.lexical_field_map: + # Lexical fields are returned with prefixed name from get_by_ids text_field_name = marqo_index.lexical_field_map[field_name].name text_fields[text_field_name] = fields[field_name] - elif field_name in marqo_index.field_map: # lexical fields are returned with the original name from search + elif field_name in marqo_index.field_map: + # Lexical fields are returned with 
original name from search text_fields[field_name] = fields[field_name] - return cls(id=document[cls._VESPA_DOC_ID], - fixed_fields=SemiStructuredVespaDocumentFields.construct(**fields), - tensor_fields=tensor_fields, - text_fields=text_fields, - raw_tensor_score=cls.extract_field(fields, common.VESPA_DOC_HYBRID_RAW_TENSOR_SCORE, None), - raw_lexical_score=cls.extract_field(fields, common.VESPA_DOC_HYBRID_RAW_LEXICAL_SCORE, None), - match_features=cls.extract_field(fields, common.VESPA_DOC_MATCH_FEATURES, dict()), - vector_counts=cls.extract_field(fields, common.FIELD_VECTOR_COUNT, 0)) + index_supports_partial_updates = marqo_index.parsed_marqo_version() >= common.SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION + + if index_supports_partial_updates: + for field_name in fields: + # Process tensor and text fields + process_field(field_name, fields) + + # Handle string arrays separately + # In case of indexes which support partial updates (i.e indexes created with Marqo version post 2.16.0), string arrays are stored in Vespa like + # 'marqo__string_array_field_name_1': ['element1', 'element2', ...] + # 'marqo__string_array_field_name_2': ['element3', 'element4', ...] + # Here we will collect all such string array fields and put them in string_arrays_dict, which will later be used to construct the SemiStructuredVespaDocument object. 
+ if field_name.startswith(common.STRING_ARRAY+'_'): + string_array_field_key = field_name.replace(common.STRING_ARRAY+'_', '') + string_array_field_value = fields[field_name] + string_arrays_dict[string_array_field_key] = string_array_field_value + + fixed_fields = SemiStructuredVespaDocumentFields.construct( + marqo__id=cls.extract_field(fields, common.VESPA_FIELD_ID, None), + create_timestamp=cls.extract_field(fields, common.VESPA_DOC_CREATE_TIMESTAMP, None), + short_string_fields=cls.extract_field(fields, common.SHORT_STRINGS_FIELDS, dict()), + string_arrays=cls.extract_field(fields, common.STRING_ARRAY, list()), + int_fields=cls.extract_field(fields, common.INT_FIELDS, dict()), + bool_fields=cls.extract_field(fields, common.BOOL_FIELDS, dict()), + float_fields=cls.extract_field(fields, common.FLOAT_FIELDS, dict()), + score_modifiers_fields=cls.extract_field(fields, common.SCORE_MODIFIERS, dict()), + vespa_multimodal_params=cls.extract_field(fields, common.VESPA_DOC_MULTIMODAL_PARAMS, dict()), + field_types=cls.extract_field(fields, VESPA_DOC_FIELD_TYPES, dict()) + ) + + return cls(id=document[cls._VESPA_DOC_ID], + fixed_fields=fixed_fields, + tensor_fields=tensor_fields, + text_fields=text_fields, + string_array_fields = string_arrays_dict, + raw_tensor_score=cls.extract_field(fields, common.VESPA_DOC_HYBRID_RAW_TENSOR_SCORE, None), + raw_lexical_score=cls.extract_field(fields, common.VESPA_DOC_HYBRID_RAW_LEXICAL_SCORE, None), + match_features=cls.extract_field(fields, common.VESPA_DOC_MATCH_FEATURES, dict()), + vector_counts=cls.extract_field(fields, common.FIELD_VECTOR_COUNT, 0), + index_supports_partial_updates=True) + else: + # For older versions, just process tensor and text fields + for field_name in fields: + process_field(field_name, fields) + + # Handle string arrays separately + # In case of indexes which don't support partial updates (i.e indexes created with Marqo version pre 2.16.0), string arrays are stored like + # 'marqo__string_array': 
['field_name_1::element1', 'field_name_1::element2', 'field_name_2::element3', 'field_name_2::element4', ...], inside the Vespa document. + # Here we will just take the list defined under marqo__string_array and put under string_arrays_list, which will later be used to construct the SemiStructuredVespaDocumentFields object. + # The SemiStructuredVespaDocumentFields object will later be used to construct SemiStructuredVespaDocument object. + # Later when converting SemiStructuredVespaDocument object to Marqo document (which is a dictionary of fields to be returned to the user), we will process this list & convert it + # to a dictionary by splitting the list element by '::', making the prefix before '::' as the key and suffix after '::' as the value. + if field_name == common.STRING_ARRAY: + string_arrays_list = fields[field_name] + fixed_fields = SemiStructuredVespaDocumentFields.construct( + marqo__id=cls.extract_field(fields, common.VESPA_FIELD_ID, None), + short_string_fields=cls.extract_field(fields, common.SHORT_STRINGS_FIELDS, dict()), + string_arrays=string_arrays_list, + int_fields=cls.extract_field(fields, common.INT_FIELDS, dict()), + bool_fields=cls.extract_field(fields, common.BOOL_FIELDS, dict()), + float_fields=cls.extract_field(fields, common.FLOAT_FIELDS, dict()), + score_modifiers_fields=cls.extract_field(fields, common.SCORE_MODIFIERS, dict()), + vespa_multimodal_params=cls.extract_field(fields, common.VESPA_DOC_MULTIMODAL_PARAMS, dict()), + field_types=cls.extract_field(fields, VESPA_DOC_FIELD_TYPES, dict()) + ) + + return cls(id=document[cls._VESPA_DOC_ID], + fixed_fields=fixed_fields, + tensor_fields=tensor_fields, + text_fields=text_fields, + raw_tensor_score=cls.extract_field(fields, common.VESPA_DOC_HYBRID_RAW_TENSOR_SCORE, None), + raw_lexical_score=cls.extract_field(fields, common.VESPA_DOC_HYBRID_RAW_LEXICAL_SCORE, None), + match_features=cls.extract_field(fields, common.VESPA_DOC_MATCH_FEATURES, dict()), + 
vector_counts=cls.extract_field(fields, common.FIELD_VECTOR_COUNT, 0), + index_supports_partial_updates=False) @classmethod def extract_field(cls, fields, name: str, default: Any): @@ -76,8 +157,31 @@ def extract_field(cls, fields, name: str, default: Any): @classmethod def from_marqo_document(cls, document: Dict, marqo_index: SemiStructuredMarqoIndex) -> "SemiStructuredVespaDocument": - """Instantiate an SemiStructuredVespaDocument from a valid Marqo document for feeding to Vespa""" - + """ + Creates a SemiStructuredVespaDocument object from a Marqo document. + + Args: + document (Dict): A dictionary representing a valid Marqo document. Must contain a '_id' field + and can include various field types like strings, booleans, numbers, arrays and tensors. + marqo_index (SemiStructuredMarqoIndex): The Marqo index object that this document belongs to. + Used to determine index capabilities and settings. + + Returns: + SemiStructuredVespaDocument: A new instance containing the document data structured for Vespa. + + Raises: + MarqoDocumentParsingError: If the document is missing required fields or contains invalid data. 
+ + Example: + doc = { + "_id": "doc1", + "title": "Sample Document", + "tags": ["tag1", "tag2"], + "rating": 4.5 + } + vespa_doc = SemiStructuredVespaDocument.from_marqo_document(doc, index) + """ + index_supports_partial_updates = (marqo_index.parsed_marqo_version() >= common.SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION) if index_constants.MARQO_DOC_ID not in document: # Please note we still use unstructured in the error message since it will be exposed to user raise MarqoDocumentParsingError( @@ -85,71 +189,156 @@ def from_marqo_document(cls, document: Dict, marqo_index: SemiStructuredMarqoInd f"This should be assigned for a valid document") doc_id = document[index_constants.MARQO_DOC_ID] - instance = cls(id=doc_id, fixed_fields=SemiStructuredVespaDocumentFields(marqo__id=doc_id)) + instance = cls(id=doc_id, fixed_fields=SemiStructuredVespaDocumentFields(marqo__id=doc_id), index_supports_partial_updates=index_supports_partial_updates) + + if common.VESPA_DOC_CREATE_TIMESTAMP in document: + instance.fixed_fields.create_timestamp = document[common.VESPA_DOC_CREATE_TIMESTAMP] + # Process regular fields + cls._process_regular_fields(document, instance, marqo_index, doc_id) + + # Process tensor fields if present + vector_count = cls._process_tensor_fields(document, instance, marqo_index) + instance.vector_counts = vector_count + + # Add multimodal params if present + instance.fixed_fields.vespa_multimodal_params = document.get(common.MARQO_DOC_MULTIMODAL_PARAMS, {}) + return instance + + @classmethod + def _process_regular_fields(cls, document: dict, instance, marqo_index: SemiStructuredMarqoIndex, doc_id: str): + """Process non-tensor fields in the document""" for field_name, field_content in document.items(): if field_name in [index_constants.MARQO_DOC_ID, constants.MARQO_DOC_TENSORS]: continue - if isinstance(field_content, str): - if field_name not in marqo_index.field_map: - # All string fields will be added to the index as lexical fields before this 
convertion happens - raise MarqoDocumentParsingError(f'Field {field_name} is not in index {marqo_index.name}') - field = marqo_index.field_map[field_name] - instance.text_fields[field.lexical_field_name] = field_content - if len(field_content) <= marqo_index.filter_string_max_length: - instance.fixed_fields.short_string_fields[field_name] = field_content - elif isinstance(field_content, bool): - instance.fixed_fields.bool_fields[field_name] = int(field_content) - elif isinstance(field_content, list) and all(isinstance(elem, str) for elem in field_content): - instance.fixed_fields.string_arrays.extend([f"{field_name}::{element}" for element in field_content]) - elif isinstance(field_content, int): - instance.fixed_fields.int_fields[field_name] = field_content - instance.fixed_fields.score_modifiers_fields[field_name] = field_content - elif isinstance(field_content, float): - instance.fixed_fields.float_fields[field_name] = field_content - instance.fixed_fields.score_modifiers_fields[field_name] = field_content - elif isinstance(field_content, dict): - for k, v in field_content.items(): - if isinstance(v, int): - instance.fixed_fields.int_fields[f"{field_name}.{k}"] = v - instance.fixed_fields.score_modifiers_fields[f"{field_name}.{k}"] = v - elif isinstance(v, float): - instance.fixed_fields.float_fields[f"{field_name}.{k}"] = float(v) - instance.fixed_fields.score_modifiers_fields[f"{field_name}.{k}"] = v - else: + try: + cls._handle_field_content(field_name, field_content, instance, marqo_index) + except Exception as e: raise MarqoDocumentParsingError( - f"In document {doc_id}, field {field_name} has an " - f"unsupported type {type(field_content)} which has not been validated in advance.") + f"Error processing field '{field_name}' in document {doc_id}: {str(e)}") - # Tensors - vector_count = 0 - if constants.MARQO_DOC_TENSORS in document: - for marqo_tensor_field in document[constants.MARQO_DOC_TENSORS]: - marqo_tensor_value = 
document[constants.MARQO_DOC_TENSORS][marqo_tensor_field] - - cls._verify_marqo_tensor_field_name(marqo_tensor_field, marqo_index) - cls._verify_marqo_tensor_field(marqo_tensor_field, marqo_tensor_value) + @classmethod + def _handle_field_content(cls, field_name: str, field_content: Union[str, bool, list, int, float, dict], instance, + marqo_index: SemiStructuredMarqoIndex): + """Handle different field content types""" + if isinstance(field_content, str): + cls._handle_string_field(field_name, field_content, instance, marqo_index) + elif isinstance(field_content, bool): + cls._handle_bool_field(field_name, field_content, instance) + elif isinstance(field_content, list) and all(isinstance(elem, str) for elem in field_content): + cls._handle_string_array_field(field_name, field_content, instance) + elif isinstance(field_content, (int, float)): + cls._handle_numeric_field(field_name, field_content, instance) + elif isinstance(field_content, dict): + cls._handle_dict_field(field_name, field_content, instance) + else: + raise MarqoDocumentParsingError(f"Unsupported type {type(field_content)}") - # If chunking an image, chunks will be a list of tuples, hence the str(c) - chunks = [str(c) for c in marqo_tensor_value[constants.MARQO_DOC_CHUNKS]] - embeddings = marqo_tensor_value[constants.MARQO_DOC_EMBEDDINGS] - vector_count += len(embeddings) + @classmethod + def _handle_string_field(cls, field_name: str, field_content: str, instance, marqo_index: SemiStructuredMarqoIndex): + if field_name not in marqo_index.field_map: + raise MarqoDocumentParsingError(f'Field {field_name} is not in index {marqo_index.name}') + + field = marqo_index.field_map[field_name] + instance.text_fields[field.lexical_field_name] = field_content + + if len(field_content) <= marqo_index.filter_string_max_length: + instance.fixed_fields.short_string_fields[field_name] = field_content + + if instance.index_supports_partial_updates: + instance.fixed_fields.field_types[field_name] = 
MarqoFieldTypes.STRING.value - index_tensor_field = marqo_index.tensor_field_map[marqo_tensor_field] + @classmethod + def _handle_bool_field(cls, field_name: str, field_content: bool, instance): + instance.fixed_fields.bool_fields[field_name] = int(field_content) + if instance.index_supports_partial_updates: + instance.fixed_fields.field_types[field_name] = MarqoFieldTypes.BOOL.value - instance.tensor_fields[index_tensor_field.chunk_field_name] = chunks - instance.tensor_fields[index_tensor_field.embeddings_field_name] = \ - {f'{i}': embeddings[i] for i in range(len(embeddings))} + @classmethod + def _handle_string_array_field(cls, field_name: str, field_content: List[str], instance): + if instance.index_supports_partial_updates: + instance.string_array_fields[field_name] = field_content + instance.fixed_fields.field_types[field_name] = MarqoFieldTypes.STRING_ARRAY.value + else: + instance.fixed_fields.string_arrays.extend([f"{field_name}::{element}" for element in field_content]) - instance.vector_counts = vector_count + @classmethod + def _handle_numeric_field(cls, field_name: str, field_content: Union[int, float], instance): + if isinstance(field_content, int): + instance.fixed_fields.int_fields[field_name] = field_content + if instance.index_supports_partial_updates: + instance.fixed_fields.field_types[field_name] = MarqoFieldTypes.INT.value + else: # float + instance.fixed_fields.float_fields[field_name] = field_content + if instance.index_supports_partial_updates: + instance.fixed_fields.field_types[field_name] = MarqoFieldTypes.FLOAT.value + instance.fixed_fields.score_modifiers_fields[field_name] = field_content - instance.fixed_fields.vespa_multimodal_params = document.get(common.MARQO_DOC_MULTIMODAL_PARAMS, {}) + @classmethod + def _handle_dict_field(cls, field_name: str, field_content: Dict[str, Union[int, float]], instance): + for k, v in field_content.items(): + field_key = f"{field_name}.{k}" # field_key is the flattened field name for a dict field. 
+ if isinstance(v, int): + instance.fixed_fields.int_fields[field_key] = v + instance.fixed_fields.score_modifiers_fields[field_key] = v + if instance.index_supports_partial_updates: + instance.fixed_fields.field_types[field_key] = MarqoFieldTypes.INT_MAP.value + instance.fixed_fields.field_types[field_name] = MarqoFieldTypes.INT_MAP.value # Marking the overall dict field as a int_map_entry field as well + elif isinstance(v, float): + instance.fixed_fields.float_fields[field_key] = float(v) + instance.fixed_fields.score_modifiers_fields[field_key] = v + if instance.index_supports_partial_updates: + instance.fixed_fields.field_types[field_key] = MarqoFieldTypes.FLOAT_MAP.value + instance.fixed_fields.field_types[field_name] = MarqoFieldTypes.FLOAT_MAP.value # Marking the overall dict field as a float_map_entry field as well - return instance + @classmethod + def _process_tensor_fields(cls, document: Dict, instance, marqo_index: SemiStructuredMarqoIndex): + """Process tensor fields in the document""" + vector_count = 0 + if constants.MARQO_DOC_TENSORS in document: + for marqo_tensor_field, tensor_value in document[constants.MARQO_DOC_TENSORS].items(): + if instance.index_supports_partial_updates: + instance.fixed_fields.field_types[marqo_tensor_field] = MarqoFieldTypes.TENSOR.value # Set field_types as tensor for tensor fields + + multimodal_params = document.get(MARQO_DOC_MULTIMODAL_PARAMS) + if multimodal_params is not None and multimodal_params.get(marqo_tensor_field) is not None: # Set field_types as tensor for sub-fields of multimodal combo fields + try: + multimodal_params = json.loads(document.get(MARQO_DOC_MULTIMODAL_PARAMS).get(marqo_tensor_field)) + multimodal_combo_sub_fields = multimodal_params.get(MARQO_DOC_MULTIMODAL_PARAMS_WEIGHTS).keys() + for sub_field in multimodal_combo_sub_fields: + instance.fixed_fields.field_types[sub_field] = MarqoFieldTypes.TENSOR.value + except json.JSONDecodeError as e: + raise MarqoDocumentParsingError(f"Error parsing 
multimodal params for field {marqo_tensor_field}: {str(e)}") + + cls._verify_marqo_tensor_field_name(marqo_tensor_field, marqo_index) + cls._verify_marqo_tensor_field(marqo_tensor_field, tensor_value) + + # If chunking an image, chunks will be a list of tuples, hence the str(c) + chunks = [str(c) for c in tensor_value[constants.MARQO_DOC_CHUNKS]] + embeddings = tensor_value[constants.MARQO_DOC_EMBEDDINGS] + vector_count += len(embeddings) + + index_tensor_field = marqo_index.tensor_field_map[marqo_tensor_field] + instance.tensor_fields[index_tensor_field.chunk_field_name] = chunks + instance.tensor_fields[index_tensor_field.embeddings_field_name] = \ + {f'{i}': embeddings[i] for i in range(len(embeddings))} + + return vector_count def to_vespa_document(self) -> Dict[str, Any]: - """Convert VespaDocumentObject to a Vespa document. - Empty fields are removed from the document.""" + """ + Converts this SemiStructuredVespaDocument object to a Vespa document format. + + @return: Dictionary containing the Vespa document representation with document ID and fields + + The returned document will have empty fields removed. 
The document structure follows Vespa's + expected format with a document ID and fields dictionary containing: + - Fixed fields (integers, floats, booleans etc) + - Text fields + - Tensor fields + - Vector count + - String arrays (handled differently based on partial update support) + """ vespa_fields = { **{k: v for k, v in self.fixed_fields.dict(exclude_none=True, by_alias=True).items() if v or v == 0}, **self.text_fields, @@ -157,16 +346,43 @@ def to_vespa_document(self) -> Dict[str, Any]: common.FIELD_VECTOR_COUNT: self.vector_counts, } + if self.index_supports_partial_updates: + if self.string_array_fields is not None: + for string_array_key, string_array_value in self.string_array_fields.items(): + key = f'{STRING_ARRAY}_{string_array_key}' + vespa_fields[key] = string_array_value + else: + vespa_fields[common.STRING_ARRAY] = self.fixed_fields.string_arrays + return {self._VESPA_DOC_ID: self.id, self._VESPA_DOC_FIELDS: vespa_fields} def to_marqo_document(self, marqo_index: SemiStructuredMarqoIndex) -> Dict[str, Any]: - """Convert VespaDocumentObject to marqo document document structure.""" + """ + Convert SemiStructuredVespaDocument object to marqo document structure. 
+ + Args: + marqo_index: The SemiStructuredMarqoIndex instance containing index configuration + + Returns: + Dict[str, Any]: A dictionary representing the marqo document format containing: + - String array fields (handled differently for pre/post 2.16 indexes) + - Integer and float fields + - Boolean fields + - Document ID + - Text fields + - Tensor fields with chunks and embeddings + """ marqo_document = {} - for string_array in self.fixed_fields.string_arrays: - key, value = string_array.split("::", 1) - if key not in marqo_document: - marqo_document[key] = [] - marqo_document[key].append(value) + + if self.index_supports_partial_updates and self.string_array_fields: + # self.string_array_fields is a dictionary, Post 2.16 indexes will have string arrays stored as a map of string to list of strings. + marqo_document.update(self.string_array_fields) + else: # Pre 2.16 indexes will have string arrays stored as a list of strings in the SemiStructuredVespaDocumentsFields object under a field called "string_arrays". + for string_array in self.fixed_fields.string_arrays: + string_array_key, string_array_value = string_array.split("::", 1) # String_array_key will be string in this case, and string_array_value will be a single string in this case. + if string_array_key not in marqo_document: + marqo_document[string_array_key] = [] + marqo_document[string_array_key].append(string_array_value) # Add int and float fields back # Please note that int-map and float-map fields are flattened in the result. The correct behaviour is to convert @@ -176,6 +392,8 @@ def to_marqo_document(self, marqo_index: SemiStructuredMarqoIndex) -> Dict[str, marqo_document.update({k: bool(v) for k, v in self.fixed_fields.bool_fields.items()}) marqo_document[index_constants.MARQO_DOC_ID] = self.fixed_fields.marqo__id + # Note: We are not adding field_types & create_timestamp to the document because + # it's a field for internal Marqo use only. 
# text fields for field_name, field_content in self.text_fields.items(): diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py index e28624959..2e3266027 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_index.py @@ -1,23 +1,33 @@ -from typing import Dict, Any, Optional, cast +import json +from typing import Dict, Any, List, Optional, Type, Union, cast -from marqo.core.constants import MARQO_DOC_HIGHLIGHTS +from marqo.core.constants import MARQO_DOC_HIGHLIGHTS, MARQO_DOC_ID +from marqo.core.exceptions import MarqoDocumentParsingError from marqo.core.models import MarqoQuery from marqo.core.models.marqo_index import SemiStructuredMarqoIndex from marqo.core.models.marqo_query import MarqoTensorQuery, MarqoLexicalQuery, MarqoHybridQuery +from marqo.core.search import search_filter from marqo.core.semi_structured_vespa_index import common +from marqo.core.semi_structured_vespa_index.common import VESPA_FIELD_ID, BOOL_FIELDS, SHORT_STRINGS_FIELDS, \ + STRING_ARRAY, INT_FIELDS, FLOAT_FIELDS from marqo.core.semi_structured_vespa_index.semi_structured_document import SemiStructuredVespaDocument +from marqo.core.semi_structured_vespa_index.semi_structured_vespa_schema import SemiStructuredVespaSchema from marqo.core.structured_vespa_index.structured_vespa_index import StructuredVespaIndex +from marqo.core.unstructured_vespa_index.unstructured_validation import validate_field_name from marqo.core.unstructured_vespa_index.unstructured_vespa_index import UnstructuredVespaIndex -from marqo.exceptions import InternalError +from marqo.core.semi_structured_vespa_index.marqo_field_types import MarqoFieldTypes +from marqo.exceptions import InternalError, InvalidArgumentError class SemiStructuredVespaIndex(StructuredVespaIndex, UnstructuredVespaIndex): """ An 
implementation of VespaIndex for SemiStructured indexes. """ + index_supports_partial_updates: bool = False def __init__(self, marqo_index: SemiStructuredMarqoIndex): super().__init__(marqo_index) + self.index_supports_partial_updates = self._marqo_index_version >= SemiStructuredVespaSchema.SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION def get_marqo_index(self) -> SemiStructuredMarqoIndex: if isinstance(self._marqo_index, SemiStructuredMarqoIndex): @@ -45,13 +55,22 @@ def to_vespa_query(self, marqo_query: MarqoQuery) -> Dict[str, Any]: # Verify attributes to retrieve, if defined if marqo_query.attributes_to_retrieve is not None: if len(marqo_query.attributes_to_retrieve) > 0: - # Retrieve static fields content to extract non-string values from combined fields - marqo_query.attributes_to_retrieve.extend([ - common.STRING_ARRAY, - common.INT_FIELDS, - common.FLOAT_FIELDS, - common.BOOL_FIELDS, - ]) + if self.index_supports_partial_updates: + # Retrieve static fields content to extract non-string values from combined fields + marqo_query.attributes_to_retrieve.extend([ + common.INT_FIELDS, + common.FLOAT_FIELDS, + common.BOOL_FIELDS, + ]) + string_array_attributes_to_retrieve = self._get_string_array_attributes_to_retrieve(marqo_query.attributes_to_retrieve) + marqo_query.attributes_to_retrieve.extend(string_array_attributes_to_retrieve) + else: + marqo_query.attributes_to_retrieve.extend([ + common.STRING_ARRAY, + common.INT_FIELDS, + common.FLOAT_FIELDS, + common.BOOL_FIELDS, + ]) marqo_query.attributes_to_retrieve.append(common.VESPA_FIELD_ID) @@ -72,7 +91,561 @@ def to_vespa_query(self, marqo_query: MarqoQuery) -> Dict[str, Any]: else: raise InternalError(f'Unknown query type {type(marqo_query)}') - @classmethod - def _get_filter_term(cls, marqo_query: MarqoQuery) -> Optional[str]: + def _get_string_array_attributes_to_retrieve(self, attributes_to_retrieve: List) -> List[str]: + name_to_string_array_field_map = 
self.get_marqo_index().name_to_string_array_field_map + return [name_to_string_array_field_map[att].string_array_field_name for att in attributes_to_retrieve if + name_to_string_array_field_map.get(att)] + + def _get_filter_term(self, marqo_query: MarqoQuery) -> Optional[str]: # Reuse logic in UnstructuredVespaIndex to create filter term - return UnstructuredVespaIndex._get_filter_term(marqo_query) + def escape(s: str) -> str: + return s.replace('\\', '\\\\').replace('"', '\\"') + + def generate_equality_filter_string(node: search_filter.EqualityTerm) -> str: + filter_parts = [] + + # Filter on `_id` + if node.field == MARQO_DOC_ID: + return f'({VESPA_FIELD_ID} contains "{escape(node.value)}")' + + # Bool Filter + if node.value.lower() in self._FILTER_STRING_BOOL_VALUES: + filter_value = int(True if node.value.lower() == "true" else False) + bool_filter_string = (f'({BOOL_FIELDS} contains ' + f'sameElement(key contains "{node.field}", value = {filter_value}))') + filter_parts.append(bool_filter_string) + + # Short String Filter + short_string_filter_string = (f'({SHORT_STRINGS_FIELDS} ' + f'contains sameElement(key contains "{node.field}", ' + f'value contains "{escape(node.value)}"))') + filter_parts.append(short_string_filter_string) + + # String Array Filter + if self.index_supports_partial_updates: + if node.field in self.get_marqo_index().name_to_string_array_field_map: + string_array_field_name = f'{STRING_ARRAY}_{node.field}' + string_array_filter_string = (f'({string_array_field_name} contains ' + f'"{escape(node.value)}")') + filter_parts.append(string_array_filter_string) + else: + string_array_filter_string = (f'({STRING_ARRAY} contains ' + f'"{node.field}::{escape(node.value)}")') + filter_parts.append(string_array_filter_string) + + # Numeric Filter + numeric_filter_string = "" + try: + numeric_value = int(node.value) + numeric_filter_string = ( + f'({INT_FIELDS} contains sameElement(key contains "{node.field}", value = {numeric_value})) ' + f'OR 
({FLOAT_FIELDS} contains sameElement(key contains "{node.field}", value = {numeric_value}))') + except ValueError: + try: + numeric_value = float(node.value) + numeric_filter_string = f'({FLOAT_FIELDS} contains sameElement(key contains "{node.field}", value = {numeric_value}))' + except ValueError: + pass + + if numeric_filter_string: + filter_parts.append(numeric_filter_string) + + # Final Filter String + final_filter_string = f"({' OR '.join(filter_parts)})" + return final_filter_string + + def generate_range_filter_string(node: search_filter.RangeTerm) -> str: + lower = f'value >= {node.lower}' if node.lower is not None else "" + higher = f'value <= {node.upper}' if node.upper is not None else "" + bound = f'{lower}, {higher}' if lower and higher else f'{lower}{higher}' + if not bound: + raise InternalError('RangeTerm has no lower or upper bound') + + float_field_string = (f'({FLOAT_FIELDS} contains ' + f'sameElement(key contains "{node.field}", {bound}))') + + int_field_string = (f'({INT_FIELDS} contains ' + f'sameElement(key contains "{node.field}", {bound}))') + + return f'({float_field_string} OR {int_field_string})' + + def tree_to_filter_string(node: search_filter.Node) -> str: + if isinstance(node, search_filter.Operator): + if isinstance(node, search_filter.And): + operator = 'AND' + elif isinstance(node, search_filter.Or): + operator = 'OR' + else: + raise InternalError(f'Unknown operator type {type(node)}') + return f'({tree_to_filter_string(node.left)} {operator} {tree_to_filter_string(node.right)})' + elif isinstance(node, search_filter.Modifier): + if isinstance(node, search_filter.Not): + return f'!({tree_to_filter_string(node.modified)})' + else: + raise InternalError(f'Unknown modifier type {type(node)}') + elif isinstance(node, search_filter.Term): + if isinstance(node, search_filter.EqualityTerm): + return generate_equality_filter_string(node) + elif isinstance(node, search_filter.RangeTerm): + return generate_range_filter_string(node) + elif 
isinstance(node, search_filter.InTerm): + raise InvalidArgumentError("The 'IN' filter keyword is not yet supported for unstructured indexes") + raise InternalError(f'Unknown node type {type(node)}') + + if marqo_query.filter is not None: + return tree_to_filter_string(marqo_query.filter.root) + + def _extract_document_id(self, document: Dict[str, Any]) -> str: + """Extract and validate document ID.""" + if "_id" not in document: + raise MarqoDocumentParsingError("'_id' is a required field") + doc_id = document["_id"] + self._verify_id_field(doc_id) + return doc_id + + def to_vespa_partial_document(self, marqo_document: Dict[str, Any], existing_vespa_document: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Convert a Marqo document to Vespa partial document format for updates. + + This method transforms a Marqo document into the format required by Vespa for partial document updates. + It processes each field in the document according to its type and creates the appropriate Vespa field + representations. + + Args: + marqo_document: A dictionary containing the Marqo document to be converted. Must contain an '_id' field. 
+ existing_vespa_document: Optional existing Vespa document to be compared against while creating the update statement + + Returns: + Dict containing the Vespa partial document format with: + - 'id': Document ID + - 'field_types': Field name to type mapping + - 'fields': Field values + - 'create_timestamp': Original document timestamp if it exists + + Raises: + MarqoDocumentParsingError: If '_id' field is missing + InvalidFieldNameError: If any field name is invalid + """ + doc_id = self._extract_document_id(marqo_document) + + # Convert existing document if provided + original_doc = None + if existing_vespa_document: + original_doc = SemiStructuredVespaDocument.from_vespa_document( + existing_vespa_document, + marqo_index=self.get_marqo_index() + ) + + # Initialize tracking dictionaries + vespa_fields = {} + vespa_field_types = {} + + # Initialize dictionary to be later used for updating score modifiers. + numeric_fields = {} + + numeric_field_map: Dict[str, Any] = dict() # This map is used to store the numeric fields in the document. It is used to update the numeric fields & score modifiers later + if original_doc: + numeric_field_map.update(original_doc.fixed_fields.int_fields) + numeric_field_map.update(original_doc.fixed_fields.float_fields) + + # Process each field in the document + for field_name, value in marqo_document.items(): + if field_name == MARQO_DOC_ID: + continue + + validate_field_name(field_name) + + # This method broadly processes the field based on its type and updates the vespa_fields, + # vespa_field_types, numeric_fields, numeric_field_map dictionaries. Numeric fields and numeric field maps + # are special cases and are processed later. + self._process_field( + field_name=field_name, + value=value, + fields=vespa_fields, + field_types=vespa_field_types, + numeric_fields=numeric_fields, + numeric_field_map=numeric_field_map, + doc_id=doc_id + ) + + # This method creates the update statement for updating int fields / int map fields. 
+ int_fields_changed = self._update_numeric_field( + int, numeric_field_map, original_doc, vespa_fields, vespa_field_types + ) + # This method creates the update statement for float numeric fields / float map fields. + float_fields_changed = self._update_numeric_field( + float, numeric_field_map, original_doc, vespa_fields, vespa_field_types + ) + + # Handle score modifier updates + if int_fields_changed or float_fields_changed: + self._update_score_modifiers( + original_doc=original_doc, + numeric_field_map=numeric_field_map, + vespa_fields=vespa_fields + ) + + return { + "id": doc_id, + "fields": vespa_fields, + "field_types": vespa_field_types, + "create_timestamp": original_doc.fixed_fields.create_timestamp if original_doc else None + } + + def _update_score_modifiers(self, original_doc: Optional[SemiStructuredVespaDocument], + numeric_field_map: Dict[str, Any], + vespa_fields: Dict[str, Any]) -> None: + """Updates score modifiers for numeric fields in Vespa documents. + + This method handles the updating of score modifiers for numeric fields during partial document updates. + It identifies which score modifiers need to be removed (fields that existed in the original document + but are no longer present) and which ones need to be modified (fields with new values). + + Args: + original_doc: The original document before the update, if it exists + numeric_field_map: Dictionary mapping field names to their numeric values (both int and float) + vespa_fields: Dictionary to store the update statements for Vespa fields + + Returns: + None + + Note: + Score modifiers are only updated if there are changes to numeric fields. + The method creates a Vespa update operation that either replaces or removes score modifiers. 
+ """ + + original_fields = {} + # Find score modifiers to remove + score_modifier_to_be_removed = [] + if original_doc: + original_fields.update(original_doc.fixed_fields.int_fields) + original_fields.update(original_doc.fixed_fields.float_fields) + score_modifier_to_be_removed = [ + {"p": field} for field in original_fields + if field not in numeric_field_map + and original_doc.fixed_fields.field_types.get(field) in (MarqoFieldTypes.INT_MAP.value, MarqoFieldTypes.FLOAT_MAP.value) + ] + + + if len(score_modifier_to_be_removed) > 0 or len(score_modifier_to_be_removed) > 0: + vespa_fields[common.SCORE_MODIFIERS] = { + "modify": { + "operation": "replace", + "create": True, + "cells": numeric_field_map + } if len(numeric_field_map) > 0 else None, + "remove": { + "addresses": score_modifier_to_be_removed + } if len(score_modifier_to_be_removed) > 0 else None + } + + def _process_field( + self, + field_name: str, + value: Any, + fields: Dict[str, Any], + field_types: Dict[str, Any], + numeric_fields: Dict[str, Any], + numeric_field_map: Dict[str, Any], + doc_id: str + ) -> None: + """Process a single field from a document based on its type. + + This method determines the type of the field value and delegates processing to the appropriate handler method. + The field value is processed and added to the fields, field_types, and numeric_fields dictionaries as needed. 
+ + Args: + field_name: The name of the field being processed + value: The value of the field, can be of the type bool, dict, int, float, list, or str + fields: Dictionary to store the update statements corresponding to the processed fields + field_types: Dictionary mapping field names to their Marqo field types + numeric_fields: Dictionary storing numeric field values for being later used to update score modifier + doc_id: The ID of the document containing this field + + Raises: + MarqoDocumentParsingError: If the field value is of an unsupported type + """ + if isinstance(value, bool): + self._handle_boolean_field(field_name, value, fields, field_types) + elif isinstance(value, dict): + self._handle_dict_field(field_name, value, doc_id, numeric_field_map) + elif isinstance(value, (int, float)): + numeric_field_map[field_name] = value # sets information about numeric fields in a map so it the numeric field + score modifiers can be updated later + elif isinstance(value, list): + self._handle_string_array_field(field_name, value, fields, field_types) + elif isinstance(value, str): + self._handle_string_field(field_name, value, fields, field_types) + else: + raise MarqoDocumentParsingError( + f'Unsupported field type {type(value)} for field {field_name} in doc {doc_id}' + ) + + def _update_numeric_field( + self, + numeric_type: Type[Union[int, float]], + numeric_field_map: Dict[str, Union[int, float]], + original_doc: Optional[SemiStructuredVespaDocument], + vespa_fields: Dict[str, Any], + vespa_field_types: Dict[str, Any] + ) -> bool: + """Updates numeric fields (int/float) in Vespa documents. + + This method processes numeric fields (integers or floats) for partial document updates. + It compares the new values with the original document (if it exists) and only updates + fields that have changed or are new. It also handles the field type metadata appropriately, + distinguishing between regular numeric fields and map types. 
+ + Args: + numeric_type: The type of numeric field (int or float) to process + numeric_field_map: Dictionary mapping field names to their numeric values + original_doc: The original Vespa document if it exists, used for comparison + vespa_fields: Dictionary to store the update statements for Vespa fields + vespa_field_types: Dictionary to store the field type metadata updates + + Returns: + bool: True if any fields were changed, False otherwise + + Note: + - Fields that exist in the original document but not in the update request are preserved + - Map type fields that are no longer present will be removed + - Field type metadata is updated to maintain consistency with the field values + """ + fields_changed = False + field_prefix = common.INT_FIELDS if numeric_type is int else common.FLOAT_FIELDS + + original_fields = {} + if original_doc is not None: + original_fields = (original_doc.fixed_fields.int_fields # Get original fields if document exists + if numeric_type is int + else original_doc.fixed_fields.float_fields) + + # Process fields in update request + for field_name, value in numeric_field_map.items(): + if not isinstance(value, numeric_type): + continue + + vespa_field_name = f'{field_prefix}{{{field_name}}}' + vespa_field_types_field_name = f'{common.VESPA_DOC_FIELD_TYPES}{{{field_name}}}' + + # Only set field value if it doesn't exist in the original set of fields or has changed + field_exists = original_doc is not None and field_name in original_fields + field_value_changed = field_exists and original_fields[field_name] != value + + if not field_exists or field_value_changed: + vespa_fields[vespa_field_name] = {"assign": value} + + # Determine if field is a map type based on original document + is_map_type = (original_doc is not None and + original_doc.fixed_fields.field_types.get(field_name) in + (MarqoFieldTypes.INT_MAP.value, MarqoFieldTypes.FLOAT_MAP.value)) + + # Set appropriate field type based on numeric_type and whether it's a map + field_type 
 = (MarqoFieldTypes.INT_MAP if numeric_type is int else MarqoFieldTypes.FLOAT_MAP) if is_map_type else (MarqoFieldTypes.INT if numeric_type is int else MarqoFieldTypes.FLOAT) + + # Set field type metadata by creating an assign statement + vespa_fields[vespa_field_types_field_name] = {"assign": field_type.value} + + # Update field type metadata dictionary, so we can use it later when defining the update pre-condition to send to vespa + vespa_field_types[field_name] = field_type.value + fields_changed = True + + # Remove fields no longer in map + + for original_field_name in original_fields: + if (original_field_name not in numeric_field_map and + original_doc.fixed_fields.field_types.get(original_field_name) in (MarqoFieldTypes.INT_MAP.value, MarqoFieldTypes.FLOAT_MAP.value)): + + vespa_field_name = f'{field_prefix}{{{original_field_name}}}' + vespa_field_types_field_name = f'{common.VESPA_DOC_FIELD_TYPES}{{{original_field_name}}}' + + vespa_fields[vespa_field_name] = {"remove": 0} + vespa_fields[vespa_field_types_field_name] = {"remove": 0} + vespa_field_types.pop(original_field_name, None) + fields_changed = True + + return fields_changed + + def _handle_boolean_field( + self, + field_name: str, + value: bool, + fields: Dict[str, Any], + field_types: Dict[str, Any] + ) -> None: + """Handle boolean field processing for document updates. + + This method processes a boolean field by: + 1. Creating an update statement for the field value + 2. Setting the field type metadata to BOOL + 3. 
Creating an update statement for the field type metadata + + Args: + field_name: The name of the boolean field + value: The boolean value to be stored + fields: Dictionary to store the update statements for fields + field_types: Dictionary mapping field names to their Marqo field types + """ + self._create_update_statement_for_updating_field(fields, field_name, value) + field_types[field_name] = MarqoFieldTypes.BOOL.value + self._create_update_statement_for_updating_field_type_metadata(fields, field_types, field_name) + + def _handle_dict_field( + self, + field_name: str, + value: Dict[str, Any], + doc_id: str, + numeric_field_map: Dict[str, Any] + ) -> None: + """Handle dictionary field processing for document updates. + + This method processes a dictionary field by: + 1. Removing any existing entries for this field from the numeric field map + 2. Adding new entries for numeric values (int, float) in the dictionary + 3. Validating that all dictionary values are of supported types + + Args: + field_name: The name of the dictionary field + value: The dictionary to be processed + doc_id: The ID of the document being updated + numeric_field_map: Dictionary mapping flattened field names to their numeric values + + Raises: + MarqoDocumentParsingError: If any value in the dictionary is not a supported numeric type + """ + keys_to_remove = [ + key for key in numeric_field_map.keys() + if key.startswith(f'{field_name}.') + ] + for key in keys_to_remove: #remove existing entries for this specific map field + del numeric_field_map[key] + + # Add new entries + for k, v in value.items(): + if isinstance(v, (int, float)): + numeric_field_map[f'{field_name}.{k}'] = v + else: + raise MarqoDocumentParsingError(f'Unsupported field type {type(v)} for field {field_name} in doc {doc_id}') + + def _handle_string_array_field( + self, + field_name: str, + value: List[Any], + fields: Dict[str, Any], + field_types: Dict[str, Any] + ) -> None: + """Handle string array field processing for 
document updates. + + This method processes a string array field by: + 1. Validating that all array elements are strings + 2. Setting the field type to STRING_ARRAY + 3. Creating update statements for: + - The field value + - Field type metadata + + Args: + field_name: The name of the string array field + value: The list of strings to be processed + fields: Dictionary to store the update statements for fields + field_types: Dictionary mapping field names to their Marqo field types + + Raises: + MarqoDocumentParsingError: If any element in the array is not a string, or if the field is not an existing string array field in the index + """ + if not all(isinstance(v, str) for v in value) or self.get_marqo_index().name_to_string_array_field_map.get(field_name) is None: + raise MarqoDocumentParsingError('Unstructured index updates only support updating existing string array fields') + field_types[field_name] = MarqoFieldTypes.STRING_ARRAY.value # setting field types for later creating pre-conditions + self._create_update_statement_for_updating_field(fields, field_name, value) # To create update statement for updating the actual field + self._create_update_statement_for_updating_field_type_metadata(fields, field_types, field_name) # To create update statement for updating 'field type' metadata + + def _handle_string_field( + self, + field_name: str, + value: str, + fields: Dict[str, Any], + field_types: Dict[str, Any] + ) -> None: + """Handle string field processing for document updates. + + This method processes a string field by: + 1. Validating that the field exists in the lexical field map + 2. 
Creating update statements for: + - The lexical field value + - Short string field value (if string length is within limit) + - Field type metadata + + Args: + field_name: The name of the string field + value: The string value to be processed + fields: Dictionary to store the update statements for fields + field_types: Dictionary mapping field names to their Marqo field types + + Raises: + MarqoDocumentParsingError: If the field does not exist in the lexical field map + """ + lexical_field_name = f'{SemiStructuredVespaSchema.FIELD_INDEX_PREFIX}{field_name}' + if lexical_field_name not in self.get_marqo_index().lexical_field_map: + raise MarqoDocumentParsingError( + f'{field_name} of type str does not exist in the original document. ' + 'We do not support adding new lexical fields in partial updates' + ) + + fields[lexical_field_name] = {"assign": value} # To create update statement for updating the lexical fields + + short_string_field = f'{common.SHORT_STRINGS_FIELDS}{{{field_name}}}' + if len(value) <= self.get_marqo_index().filter_string_max_length: + fields[short_string_field] = {"assign": value} # To create update statement for updating the actual field + else: + fields[short_string_field] = {"remove": 0} + + field_types[field_name] = MarqoFieldTypes.STRING.value + self._create_update_statement_for_updating_field_type_metadata(fields, field_types, field_name) # To create update statement for updating 'field type' metadata + + def _create_update_statement_for_updating_field_type_metadata(self, update_statement_fields, field_types, + field_key): + """Create update statement for updating field type metadata. + + This method creates an update statement to modify the field type metadata in Vespa. + It assigns the field type value from field_types to a metadata field in the update statement. 
+ + Args: + update_statement_fields: Dictionary containing the update statements for fields + field_types: Dictionary mapping field names to their Marqo field types + field_key: The field name whose type metadata needs to be updated + + Example: + If field_key is "title" and field_types["title"] is "string", this will add: + {"marqo__field_types{title}": {"assign": "string"}} to update_statement_fields + """ + update_field_type_metadata_key = f'{common.VESPA_DOC_FIELD_TYPES}{{{field_key}}}' + update_statement_fields[update_field_type_metadata_key] = {"assign": field_types[field_key]} + + def _create_update_statement_for_updating_field(self, fields, key, val): + """Create update statement for updating a field in Vespa. + + This method creates an update statement for a field based on its value type. + For boolean values, it converts them to integers (0/1) before assigning. + For list values, it assigns the value directly. Note that only bool and list values produce a Vespa field name here; other types would be assigned under an empty field name. + + Args: + fields: Dictionary containing the update statements for fields + key: The field name to be updated + val: The value to assign to the field. Expected to be bool or list. 
+ + Example: + For a boolean field "active" with value True: + fields["marqo__bool_fields{active}"] = {"assign": 1} + + For a string array field "string_array_1" with value ['a', 'b', 'c']: + fields["marqo__string_array_string_array_1"] = {"assign": ['a', 'b', 'c']} + """ + vespa_doc_field_name = "" + # Create the vespa doc field name + if isinstance(val, bool): + vespa_doc_field_name = f'{common.BOOL_FIELDS}{{{key}}}' + elif isinstance(val, list): + vespa_doc_field_name = f'{common.STRING_ARRAY}_{key}' + + # Create the update statement + if isinstance(val, bool): + fields[vespa_doc_field_name] = {"assign": int(val)} + else: + fields[vespa_doc_field_name] = {"assign": val} diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema.py index 5c6ca4df7..baf84f138 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema.py @@ -1,5 +1,6 @@ import os +import semver from jinja2 import Environment, FileSystemLoader from marqo.core.models.marqo_index import SemiStructuredMarqoIndex, MarqoIndex @@ -11,6 +12,8 @@ class SemiStructuredVespaSchema(VespaSchema): FIELD_INDEX_PREFIX = 'marqo__lexical_' FIELD_CHUNKS_PREFIX = 'marqo__chunks_' FIELD_EMBEDDING_PREFIX = 'marqo__embeddings_' + FIELD_STRING_ARRAY_PREFIX = 'marqo__string_array_' + SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION = semver.VersionInfo.parse("2.16.0") def __init__(self, index_request: UnstructuredMarqoIndexRequest): self._index_request = index_request @@ -25,7 +28,10 @@ def generate_schema(self) -> (str, MarqoIndex): def generate_vespa_schema(cls, marqo_index: SemiStructuredMarqoIndex) -> str: template_path = str(os.path.dirname(os.path.abspath(__file__))) environment = Environment(loader=FileSystemLoader(template_path)) - vespa_schema_template = 
environment.get_template("semi_structured_vespa_schema_template.sd.jinja2") + if marqo_index.parsed_marqo_version() >= SemiStructuredVespaSchema.SEMISTRUCTURED_INDEX_PARTIAL_UPDATE_SUPPORT_VERSION: + vespa_schema_template = environment.get_template("semi_structured_vespa_schema_template_2_16.sd.jinja2") + else: + vespa_schema_template = environment.get_template("semi_structured_vespa_schema_template.sd.jinja2") return vespa_schema_template.render(index=marqo_index, dimension=str(marqo_index.model.get_dimension())) def _generate_marqo_index(self, schema_name: str) -> SemiStructuredMarqoIndex: @@ -46,6 +52,7 @@ def _generate_marqo_index(self, schema_name: str) -> SemiStructuredMarqoIndex: updated_at=self._index_request.updated_at, lexical_fields=[], tensor_fields=[], + string_array_fields=[], filter_string_max_length=self._index_request.filter_string_max_length, treat_urls_and_pointers_as_images=self._index_request.treat_urls_and_pointers_as_images, treat_urls_and_pointers_as_media=self._index_request.treat_urls_and_pointers_as_media, diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template.sd.jinja2 b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template.sd.jinja2 index 8d66472b3..ae311ff9e 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template.sd.jinja2 +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template.sd.jinja2 @@ -6,7 +6,6 @@ schema {{ index.schema_name }} { attribute: fast-search rank: filter } - {# Reserved fields for bool, int and float fields -#} field marqo__int_fields type map { indexing: summary diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template_2_16.sd.jinja2 b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template_2_16.sd.jinja2 new file mode 100644 index 000000000..15d55197c --- /dev/null +++ 
b/src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template_2_16.sd.jinja2 @@ -0,0 +1,307 @@ +{# semi_structured_vespa_schema_template_2_16.sd.jinja2 -#} +schema {{ index.schema_name }} { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + {# Reserved fields for bool, int and float fields -#} + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + {#- indexing:summary is left out here since each short string has a corresponding lexical field #} + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + {# All int and float fields will be added score modifiers -#} + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + {% for lexical_field 
in index.lexical_fields -%} + field {{ lexical_field.lexical_field_name }} type string { + indexing: index | summary + index: enable-bm25 + } + {% endfor -%} + + {% for string_array_field in index.string_array_fields -%} + field {{ string_array_field.string_array_field_name }} type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + {% endfor -%} + + {% for field in index.tensor_fields -%} + field {{ field.chunk_field_name }} type array { + indexing: summary + } + + field {{ field.embeddings_field_name }} type tensor(p{}, x[{{ dimension }}]) { + indexing: attribute | index | summary + attribute { + distance-metric: {{ index.distance_metric.value }} + } + index { + hnsw { + max-links-per-node: {{ index.hnsw_config.m }} + neighbors-to-explore-at-insert: {{ index.hnsw_config.ef_construction }} + } + } + } + {% endfor -%} + + field marqo__vector_count type int { + indexing: attribute | summary + } + } + + {% if index.lexical_field_map -%} + fieldset default { + fields: {{ ", ".join(index.lexical_field_map.keys()) }} + } + {% endif -%} + + rank-profile base_rank_profile inherits default { + inputs { + {% for lexical_field in index.lexical_fields -%} + query({{ lexical_field.lexical_field_name }}): 0 + {% endfor -%} + {% for field in index.tensor_fields -%} + query({{ field.embeddings_field_name }}): 0 + {% endfor -%} + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[{{ dimension }}]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: 
reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) + } + + {% macro max_score(score_macro, fields) -%} + {%- set length = fields|length -%} + {%- if length == 0 -%} + 0 + {%- elif length == 1 -%} + {{ score_macro(fields[0]) }} + {%- else -%} + max({{ max_score(score_macro, fields[0:1]) }}, {{ max_score(score_macro, fields[1:]) }}) + {%- endif -%} + {% endmacro -%} + + {% macro lexical_score(field) -%} + if (query({{ field.lexical_field_name }}) > 0, bm25({{ field.lexical_field_name }}), 0) + {%- endmacro -%} + + {% macro embedding_score(field) -%} + if (query({{ field.embeddings_field_name }}) > 0, closeness(field, {{ field.embeddings_field_name }}), 0) + {%- endmacro -%} + + {% macro sum_score(score_macro, fields) -%} + {%- set add = joiner(" + ") -%} + {%- for field in fields -%}{{ add() }}{{ score_macro(field) }}{%- endfor -%} + {% endmacro -%} + + {% if index.lexical_fields -%} + function lexical_score_sum() { + expression: {{ sum_score(lexical_score, index.lexical_fields) }} + } + + {% macro count_lexical_fields() -%} + {%- set add = joiner(" + ") -%} + {%- for field in index.lexical_fields -%}{{ add() }}if (query({{ field.lexical_field_name }}) > 0, 1, 0){%- endfor -%} + {% endmacro -%} + function lexical_score_avg() { + expression: ({{ sum_score(lexical_score, index.lexical_fields) }}) / max(1, {{ count_lexical_fields() }}) + } + + function lexical_score_max() { + expression: {{ max_score(lexical_score, index.lexical_fields) }} + } + + function lexical_score() { + expression: if (query(marqo__bm25_aggregator) == 0, lexical_score_sum(), if (query(marqo__bm25_aggregator) == 1, lexical_score_avg(), 
lexical_score_max())) + } + {% endif -%} + + {# We provide this function even without the tensor field to support embed requests -#} + function embedding_score() { + expression: {{ max_score(embedding_score, index.tensor_fields) }} + } + + match-features: global_mult_modifier global_add_modifier + } + + {% if index.lexical_fields -%} + rank-profile bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + {% endif -%} + + {# We provide this rank profile even without the tensor field to support embed requests -#} + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + {% if index.tensor_fields -%} + match-features inherits base_rank_profile { + {%- for field in index.tensor_fields %} + closest({{ field.embeddings_field_name }}) + {%- endfor %} + {%- for field in index.tensor_fields %} + distance(field, {{ field.embeddings_field_name }}) + {%- endfor %} + } + {%- endif %} + } + + {% if index.lexical_fields and index.tensor_fields -%} + rank-profile hybrid_custom_searcher inherits default { + inputs { + query(marqo__fields_to_rank_lexical) tensor(p{}) + query(marqo__fields_to_rank_tensor) tensor(p{}) + query(marqo__query_embedding) tensor(x[{{ dimension }}]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + } + + rank-profile hybrid_bm25_then_embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + second-phase { + expression: modify(embedding_score(), 
query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + {%- for field in index.tensor_fields %} + closest({{ field.embeddings_field_name }}) + {%- endfor %} + {%- for field in index.tensor_fields %} + distance(field, {{ field.embeddings_field_name }}) + {%- endfor %} + } + } + + rank-profile hybrid_embedding_similarity_then_bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + {%- endif %} + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + {% for string_array_field in index.string_array_fields -%} + summary {{ string_array_field.string_array_field_name }} type array {source: {{ string_array_field.string_array_field_name }}} + {% endfor -%} + {%- for lexical_field in index.lexical_fields %} + summary {{ lexical_field.name }} type string {source: {{ lexical_field.lexical_field_name }}} + {%- endfor %} + {%- for field in index.tensor_fields %} + summary {{ field.chunk_field_name }} type array {} + {%- endfor %} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + {% for string_array_field in index.string_array_fields -%} + summary {{ string_array_field.string_array_field_name }} type array {source: {{ string_array_field.string_array_field_name }}} + {% endfor -%} + {%- for lexical_field in index.lexical_fields %} + summary {{ lexical_field.name }} type string {source: {{ lexical_field.lexical_field_name }}} + {%- endfor %} + {%- for field in index.tensor_fields %} + summary {{ field.chunk_field_name }} type 
array {} + summary {{ field.embeddings_field_name }} type tensor(p{}, x[{{ dimension }}]) {} + {%- endfor %} + } +} \ No newline at end of file diff --git a/src/marqo/core/structured_vespa_index/structured_vespa_index.py b/src/marqo/core/structured_vespa_index/structured_vespa_index.py index 815896002..04f5e39fb 100644 --- a/src/marqo/core/structured_vespa_index/structured_vespa_index.py +++ b/src/marqo/core/structured_vespa_index/structured_vespa_index.py @@ -62,7 +62,7 @@ class StructuredVespaIndex(VespaIndex): def get_vespa_id_field(self) -> str: return common.FIELD_ID - def to_vespa_partial_document(self, marqo_document: Dict[str, Any]) -> Dict[str, Any]: + def to_vespa_partial_document(self, marqo_document: Dict[str, Any], existing_vespa_document: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: vespa_id: Optional[str] = None vespa_fields: Dict[str, Any] = dict() score_modifiers_2_8: Dict[str, float] = dict() diff --git a/src/marqo/core/unstructured_vespa_index/common.py b/src/marqo/core/unstructured_vespa_index/common.py index a6258c45b..1554d5fce 100644 --- a/src/marqo/core/unstructured_vespa_index/common.py +++ b/src/marqo/core/unstructured_vespa_index/common.py @@ -21,6 +21,7 @@ MARQO_DOC_MULTIMODAL_PARAMS = "multimodal_params" VESPA_DOC_MULTIMODAL_PARAMS = "marqo__multimodal_params" +MARQO_DOC_MULTIMODAL_PARAMS_WEIGHTS = "weights" SUMMARY_ALL_NON_VECTOR = 'all-non-vector-summary' SUMMARY_ALL_VECTOR = 'all-vector-summary' diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_vespa_index.py b/src/marqo/core/unstructured_vespa_index/unstructured_vespa_index.py index f1dd7404a..c9b76ff27 100644 --- a/src/marqo/core/unstructured_vespa_index/unstructured_vespa_index.py +++ b/src/marqo/core/unstructured_vespa_index/unstructured_vespa_index.py @@ -25,7 +25,7 @@ class UnstructuredVespaIndex(VespaIndex): def get_vespa_id_field(self) -> str: return unstructured_common.VESPA_FIELD_ID - def to_vespa_partial_document(self, marqo_document: Dict[str, Any]) 
-> Dict[str, Any]: + def to_vespa_partial_document(self, marqo_document: Dict[str, Any], existing_vespa_document: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: raise NotImplementedError("Partial document update is not supported for unstructured indexes. This" "function should not be called.") diff --git a/src/marqo/core/vespa_index/add_documents_handler.py b/src/marqo/core/vespa_index/add_documents_handler.py index 53ab8e0e3..9b507b38f 100644 --- a/src/marqo/core/vespa_index/add_documents_handler.py +++ b/src/marqo/core/vespa_index/add_documents_handler.py @@ -158,9 +158,8 @@ def add_documents(self): # retrieve existing docs for existing tensor if self.add_docs_params.use_existing_tensors: - # TODO capture the telemetry data for retrieving exiting docs? - result = self.vespa_client.get_batch(list(self.add_docs_response_collector.valid_original_ids()), - self.marqo_index.schema_name) + result = self.vespa_client.get_batch(ids = list(self.add_docs_response_collector.valid_original_ids()), + schema = self.marqo_index.schema_name) existing_vespa_docs = [r.document for r in result.responses if r.status == 200] self._populate_existing_tensors(existing_vespa_docs) @@ -168,17 +167,15 @@ def add_documents(self): self._vectorise_tensor_fields() # FIXME this step is not timed in the original implementation - vespa_docs = self._convert_to_vespa_docs() + vespa_docs = self._convert_to_vespa_docs() # Responsible for converting the marqo docs (i.e the dictionary that we collected all documents to earlier) to vespa docs self._pre_persist_to_vespa() # persist to vespa if there are still valid docs - with RequestMetricsStore.for_request().time("add_documents.vespa._bulk"): - response = self.vespa_client.feed_batch(vespa_docs, self.marqo_index.schema_name) + response = self.vespa_client.feed_batch(vespa_docs, self.marqo_index.schema_name) - with RequestMetricsStore.for_request().time("add_documents.postprocess"): - self._handle_vespa_response(response) - return 
self.add_docs_response_collector.to_add_doc_responses(self.marqo_index.name) + self._handle_vespa_response(response) + return self.add_docs_response_collector.to_add_doc_responses(self.marqo_index.name) @abstractmethod def _create_tensor_fields_container(self) -> TensorFieldsContainer: diff --git a/src/marqo/core/vespa_index/vespa_index.py b/src/marqo/core/vespa_index/vespa_index.py index 403631e2b..dccc20ada 100644 --- a/src/marqo/core/vespa_index/vespa_index.py +++ b/src/marqo/core/vespa_index/vespa_index.py @@ -1,9 +1,4 @@ -from abc import ABC, abstractmethod -from typing import Dict, Any, Optional, List - -from marqo.core import constants -from marqo.core.models import MarqoQuery, MarqoHybridQuery, MarqoTensorQuery, MarqoLexicalQuery, MarqoIndex -from marqo.core.models.marqo_index import StructuredMarqoIndex, UnstructuredMarqoIndex +from marqo.core.models import MarqoQuery, MarqoHybridQuery, MarqoTensorQuery, MarqoLexicalQuery from marqo.core.models.score_modifier import ScoreModifier, ScoreModifierType from marqo.core.models.marqo_index import * from marqo.exceptions import InternalError @@ -82,7 +77,7 @@ def get_vector_count_query(self) -> Dict[str, Any]: pass @abstractmethod - def to_vespa_partial_document(self, marqo_partial_document: Dict[str, Any]) -> Dict[str, Any]: + def to_vespa_partial_document(self, marqo_partial_document: Dict[str, Any], existing_vespa_document: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """ Convert a marqo_partial_update_document to a Vespa partial document. @@ -90,6 +85,7 @@ def to_vespa_partial_document(self, marqo_partial_document: Dict[str, Any]) -> D the fields that are require to be updated. 
Args: + existing_vespa_document: An optional existing Vespa document to construct the partial update from marqo_partial_document: The marqo_partial_document to convert Returns: diff --git a/src/marqo/marqo_docs.py b/src/marqo/marqo_docs.py index b6e135619..4ecc41ab0 100644 --- a/src/marqo/marqo_docs.py +++ b/src/marqo/marqo_docs.py @@ -79,3 +79,6 @@ def search_api_score_modifiers_parameter(): def hugging_face_trust_remote_code(): return _build_url('models/marqo/bring-your-own-model/#bring-your-own-hugging-face-sentence-transformers-models') + +def update_documents_response(): + return _build_url('reference/api/documents/update-documents/#response') diff --git a/src/marqo/tensor_search/enums.py b/src/marqo/tensor_search/enums.py index 2d4f6ac00..b3e5e3d5e 100644 --- a/src/marqo/tensor_search/enums.py +++ b/src/marqo/tensor_search/enums.py @@ -82,6 +82,7 @@ class EnvVars: MARQO_INFERENCE_CACHE_SIZE = "MARQO_INFERENCE_CACHE_SIZE" MARQO_INFERENCE_CACHE_TYPE = "MARQO_INFERENCE_CACHE_TYPE" MARQO_MAX_TENSOR_FIELD_COUNT_UNSTRUCTURED = "MARQO_MAX_TENSOR_FIELD_COUNT_UNSTRUCTURED" + MARQO_MAX_STRING_ARRAY_FIELD_COUNT_UNSTRUCTURED = "MARQO_MAX_STRING_ARRAY_FIELD_COUNT_UNSTRUCTURED" MARQO_MAX_LEXICAL_FIELD_COUNT_UNSTRUCTURED = "MARQO_MAX_LEXICAL_FIELD_COUNT_UNSTRUCTURED" MARQO_INDEX_DEPLOYMENT_LOCK_TIMEOUT = "MARQO_INDEX_DEPLOYMENT_LOCK_TIMEOUT" ZOOKEEPER_HOSTS = "ZOOKEEPER_HOSTS" diff --git a/src/marqo/version.py b/src/marqo/version.py index 31acb3941..9042a44dc 100644 --- a/src/marqo/version.py +++ b/src/marqo/version.py @@ -1,4 +1,4 @@ -__version__ = "2.15.2" +__version__ = "2.16.0" def get_version() -> str: return f"{__version__}" diff --git a/src/marqo/vespa/models/vespa_document.py b/src/marqo/vespa/models/vespa_document.py index 8b9120cdc..4a7efed3e 100644 --- a/src/marqo/vespa/models/vespa_document.py +++ b/src/marqo/vespa/models/vespa_document.py @@ -5,4 +5,6 @@ class VespaDocument(BaseModel): id: Optional[str] + field_types: Optional[Dict[str, str]] # A metadata field 
to store the type of each field in the document fields: Dict[str, Any] + create_timestamp: Optional[float] diff --git a/src/marqo/vespa/vespa_client.py b/src/marqo/vespa/vespa_client.py index 5253e72b9..e8e34a412 100644 --- a/src/marqo/vespa/vespa_client.py +++ b/src/marqo/vespa/vespa_client.py @@ -16,6 +16,9 @@ import marqo.logging import marqo.vespa.concurrency as conc from marqo.core.models import MarqoIndex +from marqo.core.semi_structured_vespa_index.common import VESPA_DOC_FIELD_TYPES, VESPA_DOC_CREATE_TIMESTAMP +from marqo.core.semi_structured_vespa_index.marqo_field_types import MarqoFieldTypes +from marqo.marqo_docs import update_documents_response from marqo.vespa.exceptions import (VespaStatusError, VespaError, InvalidVespaApplicationError, VespaTimeoutError, VespaNotConvergedError, VespaActivationConflictError) from marqo.vespa.models import VespaDocument, QueryResult, Error, FeedBatchDocumentResponse, FeedBatchResponse, \ @@ -412,6 +415,7 @@ def get_all_documents(self, def get_batch(self, ids: List[str], schema: str, + fields: Optional[List[str]] = None, concurrency: Optional[int] = None, timeout: int = 60) -> GetBatchResponse: """ @@ -424,6 +428,7 @@ def get_batch(self, Args: ids: List of document IDs to get schema: Schema to get from + fields: An optional list of fields to fetch from the document concurrency: Number of concurrent get requests timeout: Timeout in seconds per request @@ -437,7 +442,7 @@ def get_batch(self, concurrency = self.get_pool_size batch_response = conc.run_coroutine( + self._get_batch_async(ids, fields, schema, concurrency, timeout) - self._get_batch_async(ids, schema, concurrency, timeout) ) return batch_response @@ -596,8 +601,7 @@ def translate_vespa_document_response(self, status: int, message: Optional[str]= vespa_status_code_to_marqo_doc_error_map = { 200: (200, None), 404: (404, "Document does not exist in the index"), - # Update documents get 412 from Vespa for document not found as we use condition - 412: (404, 
"Document does not exist in the index"), + 412: (400, "Marqo vector store couldn't update the document. Please see: " + update_documents_response() + " for more details"), # Update documents get 412 from Vespa for document not found as we use condition 429: (429, "Marqo vector store receives too many requests. Please try again later"), 507: (400, "Marqo vector store is out of memory or disk space"), } @@ -829,15 +833,27 @@ async def _update_document_async(self, semaphore: asyncio.Semaphore, async_clien timeout: int, vespa_id_field: str) -> UpdateDocumentResponse: doc_id = document.id data = {'fields': document.fields} + types = document.field_types + create_timestamp = document.create_timestamp # only used for documents that are not updated error_doc_path_id = f"/document/v1/{schema}/{schema}/docid/{doc_id}" - async with semaphore: end_point = f'{self.document_url}/document/v1/{schema}/{schema}/docid/{doc_id}?create=false' data["condition"] = f'{schema}.{vespa_id_field}==\"{doc_id}\"' + if types is not None: # Types will be none for structured index as we are not storing types at the time of Add docs. + for key, value in types.items(): + data["condition"] += (f' and (not {schema}.{VESPA_DOC_FIELD_TYPES}{{\"{key}\"}} or {schema}.{VESPA_DOC_FIELD_TYPES}{{\"{key}\"}}==\"{value}\")' + f' and (not ({schema}.{VESPA_DOC_FIELD_TYPES}{{\"{key}\"}}=="{MarqoFieldTypes.TENSOR.value}"))') + if create_timestamp is not None: + data["condition"] += f' and {schema}.{VESPA_DOC_CREATE_TIMESTAMP}=={create_timestamp}' try: resp = await async_client.put(end_point, json=data, timeout=timeout) + if resp.status_code == 412 and types is None and create_timestamp is None: + # If Vespa response is 412, and the request is for structured index, it means the document does not exist + # in the index, as we don't have type checks / timestamp (version) checks for structured indexes. + # We return a 404 error for this case. 
+ resp.status_code = 404 except httpx.RequestError as e: logger.error(e, exc_info=True) return UpdateDocumentResponse(status=500, message="Network Error", id=doc_id, path_id=error_doc_path_id) @@ -920,6 +936,7 @@ def _feed_document_sync(self, sync_client: httpx.Client, document: VespaDocument async def _get_batch_async(self, ids: List[str], + fields: Optional[List[str]], schema: str, connections: int, timeout: int) -> GetBatchResponse: async with httpx.AsyncClient(limits=httpx.Limits(max_keepalive_connections=connections, @@ -927,7 +944,7 @@ async def _get_batch_async(self, semaphore = asyncio.Semaphore(connections) tasks = [ asyncio.create_task( - self._get_document_async(semaphore, async_client, id, schema, timeout) + self._get_document_async(semaphore, async_client, id, fields, schema, timeout) ) for id in ids ] @@ -947,12 +964,39 @@ async def _get_document_async(self, semaphore: asyncio.Semaphore, async_client: httpx.AsyncClient, id: str, + fields: Optional[List[str]], + schema: str, + timeout: int) -> GetBatchDocumentResponse: + async with semaphore: + try: + if fields is not None: + resp = await async_client.get( + f'{self.document_url}/document/v1/{schema}/{schema}/docid/{id}?fieldSet={schema}:{",".join(fields)}', + timeout=timeout + ) + else: + resp = await async_client.get( + f'{self.document_url}/document/v1/{schema}/{schema}/docid/{id}', timeout=timeout + ) + except httpx.HTTPError as e: + raise VespaError(e) from e + + if resp.status_code in [200, 404]: + return GetBatchDocumentResponse(**resp.json(), status=resp.status_code) + + self._raise_for_status(resp) + + async def _get_document_async_with_specific_fields(self, + semaphore: asyncio.Semaphore, + async_client: httpx.AsyncClient, + id: str, + fields: List[str], schema: str, timeout: int) -> GetBatchDocumentResponse: async with semaphore: try: resp = await async_client.get( - f'{self.document_url}/document/v1/{schema}/{schema}/docid/{id}', timeout=timeout + 
f'{self.document_url}/document/v1/{schema}/{schema}/docid/{id}?fieldSet={schema}:{",".join(fields)}', timeout=timeout ) except httpx.HTTPError as e: raise VespaError(e) from e @@ -962,6 +1006,7 @@ async def _get_document_async(self, self._raise_for_status(resp) + async def _delete_batch_async(self, ids: List[str], schema: str, diff --git a/tests/api_tests/v1/tests/api_tests/structured_index/test_partial_update_document.py b/tests/api_tests/v1/tests/api_tests/structured_index/test_partial_update_document.py index 32cd638ae..97456fe0b 100644 --- a/tests/api_tests/v1/tests/api_tests/structured_index/test_partial_update_document.py +++ b/tests/api_tests/v1/tests/api_tests/structured_index/test_partial_update_document.py @@ -16,9 +16,6 @@ class TestStructuredUpdateDocuments(MarqoTestCase): large_score_modifier_index_name = ("update_doc_api_test_score_modifier_index" + str(uuid.uuid4()).replace('-', '')) - test_unstructured_index_name = ("update_doc_api_test_unstructured_index" + - str(uuid.uuid4()).replace('-', '')) - @classmethod def setUpClass(cls): super().setUpClass() @@ -61,17 +58,11 @@ def setUpClass(cls): ["score_modifier", "filter"]} for i in range(100)] + [{"name": "text_field_tensor", "type": "text"}], "tensorFields": ["text_field_tensor"], - }, - { - "indexName": cls.test_unstructured_index_name, - "type": "unstructured", - "model": "random/small", } ] ) - cls.indexes_to_delete = [cls.update_doc_index_name, cls.large_score_modifier_index_name, - cls.test_unstructured_index_name] + cls.indexes_to_delete = [cls.update_doc_index_name, cls.large_score_modifier_index_name] def tearDown(self): if self.indexes_to_delete: @@ -744,15 +735,3 @@ def test_too_many_documents_exceeds_max_batch_size(self): self.client.index(self.update_doc_index_name).update_documents(documents=documents) self.assertIn("Number of docs in update_documents request (129) exceeds", str(e.exception)) - - def test_proper_error_on_unstructured_index(self): - """Test that an error is thrown when 
attempting to update a document in an unstructured index.""" - updated_doc = { - "text_field": "updated text field", - "_id": "1" - } - - with self.assertRaises(MarqoWebError) as e: - self.client.index(self.test_unstructured_index_name).update_documents(documents=[updated_doc]) - - self.assertIn("is not supported for unstructured indexes", str(e.exception)) \ No newline at end of file diff --git a/tests/api_tests/v1/tests/api_tests/unstructured_index/test_update_documents_unstructured_index.py b/tests/api_tests/v1/tests/api_tests/unstructured_index/test_update_documents_unstructured_index.py new file mode 100644 index 000000000..7194e179d --- /dev/null +++ b/tests/api_tests/v1/tests/api_tests/unstructured_index/test_update_documents_unstructured_index.py @@ -0,0 +1,157 @@ +import uuid + +from marqo.client import Client + +from tests.marqo_test import MarqoTestCase + + +class TestUpdateDocumentsInUnstructuredIndex(MarqoTestCase): + """ + Support for partial updates for unstructured indexes was added in 2.16.0. Unstructured indexes are internally implemented as semi-structured indexes. 
+ """ + + @classmethod + def setUpClass(cls): + super().setUpClass() + + cls.client = Client(**cls.client_settings) + + cls.text_index_name = "api_test_unstructured_index" + str(uuid.uuid4()).replace('-', '') + + cls.create_indexes([ + { + "indexName": cls.text_index_name, + "type": "unstructured", + "model": "random/small", + "normalizeEmbeddings": False, + } + ]) + + cls.indexes_to_delete = [cls.text_index_name] + + def tearDown(self): + if self.indexes_to_delete: + self.clear_indexes(self.indexes_to_delete) + + def test_update_document_with_ids(self): + text_docs = [{ + '_id': '1', + 'tensor_field': 'title', + 'tensor_subfield': 'description', + "short_string_field": "shortstring", + "long_string_field": "Thisisaverylongstring" * 10, + "int_field": 123, + "float_field": 123.0, + "string_array": ["aaa", "bbb"], + "string_array2": ["123", "456"], + "int_map": {"a": 1, "b": 2}, + "float_map": {"c": 1.0, "d": 2.0}, + "bool_field": True, + "bool_field2": False, + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 32 + } + }] + + mappings = { + "custom_vector_field": {"type": "custom_vector"}, + "multimodal_combo_field": { + "type": "multimodal_combination", + "weights": {"tensor_field": 1.0, "tensor_subfield": 2.0} + } + } + + tensor_fields = ['tensor_field', 'custom_vector_field', 'multimodal_combo_field'] + + add_docs_response = self.client.index(self.text_index_name).add_documents(documents = text_docs, mappings = mappings, tensor_fields = tensor_fields) + + self.assertFalse(add_docs_response["errors"]) + + update_docs_response = self.client.index(self.text_index_name).update_documents( + [{ + '_id': '1', + 'bool_field': False, + 'update_field_that_doesnt_exist': 500, + 'int_field': 1, + 'float_field': 500.0, + 'int_map': { + 'a': 2, + }, + 'float_map': { + 'c': 3.0, + }, + 'string_array': ["ccc"] + }] + ) + + assert update_docs_response["errors"] == False + + get_docs_response = self.client.index(self.text_index_name).get_document(document_id = 
'1') + + self.assertEqual(get_docs_response['bool_field'], False) + self.assertEqual(get_docs_response['int_field'], 1) + self.assertEqual(get_docs_response['float_field'], 500.0) + self.assertEqual(get_docs_response['int_map.a'], 2) + self.assertEqual(get_docs_response['float_map.c'], 3.0) + self.assertEqual(get_docs_response['string_array'], ["ccc"]) + self.assertEqual(get_docs_response['update_field_that_doesnt_exist'], 500) + self.assertEqual(get_docs_response['string_array2'], ["123", "456"]) + + def test_update_document_with_ids_change_field_type(self): + text_docs = [{ + '_id': '1', + 'tensor_field': 'title', + 'tensor_subfield': 'description', + "short_string_field": "shortstring", + "long_string_field": "Thisisaverylongstring" * 10, + "int_field": 123, + "float_field": 123.0, + "string_array": ["aaa", "bbb"], + "string_array2": ["123", "456"], + "int_map": {"a": 1, "b": 2}, + "float_map": {"c": 1.0, "d": 2.0}, + "bool_field": True, + "bool_field2": False, + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 32 + } + }] + + mappings = { + "custom_vector_field": {"type": "custom_vector"}, + "multimodal_combo_field": { + "type": "multimodal_combination", + "weights": {"tensor_field": 1.0, "tensor_subfield": 2.0} + } + } + + tensor_fields = ['tensor_field', 'custom_vector_field', 'multimodal_combo_field'] + + add_docs_response = self.client.index(self.text_index_name).add_documents(documents = text_docs, mappings = mappings, tensor_fields = tensor_fields) + + self.assertFalse(add_docs_response["errors"]) + + update_docs_response = self.client.index(self.text_index_name).update_documents( + [{ + '_id': '1', + 'bool_field': False, + 'update_field_that_doesnt_exist': 500, + 'int_field': 1, + 'float_field': 500, # The request is same as the test case test_update_document_with_ids, except the float_field value is changed to int. This will result in a 412 condition check failed error. 
+ 'int_map': { + 'a': 2, + }, + 'float_map': { + 'c': 3.0, + }, + 'string_array': ["ccc"] + }] + ) + + self.assertTrue(update_docs_response["errors"]) + + self.assertEqual(update_docs_response['items'][0]['status'], 400) + self.assertIn("Marqo vector store couldn't update the document. Please see", update_docs_response['items'][0]['message']) + self.assertIn("reference/api/documents/update-documents/#response", update_docs_response['items'][0]['message']) \ No newline at end of file diff --git a/tests/compatibility_tests/update_documents/test_update_documents_unstructured.py b/tests/compatibility_tests/update_documents/test_update_documents_unstructured.py new file mode 100644 index 000000000..6bc17f7c8 --- /dev/null +++ b/tests/compatibility_tests/update_documents/test_update_documents_unstructured.py @@ -0,0 +1,165 @@ +import traceback + +import pytest + +from tests.compatibility_tests.base_test_case.base_compatibility_test import BaseCompatibilityTestCase + + +@pytest.mark.marqo_version('2.16.0') +class TestUpdateDocumentsUnstructured2_16(BaseCompatibilityTestCase): + """ + Partial updates for unstructured indexes was introduced in 2.16.0. This is a backwards compatibility test which runs on + from_version >= 2.16.0. It will test that any future releases post 2.16.0 don't break the partial update functionality for unstructured indexes. 
+ """ + unstructured_index_name = "test_update_documents_unstructured_index" + indexes_to_test_on = [ + { + "indexName": unstructured_index_name, + "type": "unstructured", + "model": "random/small", + "normalizeEmbeddings": True, + }] + + text_docs = [{ + '_id': '1', + 'tensor_field': 'title', + 'tensor_subfield': 'description', + "short_string_field": "shortstring", + "long_string_field": "Thisisaverylongstring" * 10, + "int_field": 123, + "float_field": 123.0, + "string_array": ["aaa", "bbb"], + "string_array2": ["123", "456"], + "int_map": {"a": 1, "b": 2}, + "float_map": {"c": 1.0, "d": 2.0}, + "bool_field": True, + "bool_field2": False, + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 32 + } + }] + + + mappings = { + "custom_vector_field": {"type": "custom_vector"}, + "multimodal_combo_field": { + "type": "multimodal_combination", + "weights": {"tensor_field": 1.0, "tensor_subfield": 2.0} + } + } + + tensor_fields = ['tensor_field', 'custom_vector_field', 'multimodal_combo_field'] + + partial_update_test_cases = [{ + '_id': '1', + 'bool_field': False, + 'update_field_that_doesnt_exist': 500, + 'int_field': 1, + 'float_field': 500.0, + 'int_map': { + 'a': 2, # update int to int + }, + 'float_map': { + 'c': 3.0, # update float to int + }, + 'string_array': ["ccc"] + }] + + @classmethod + def tearDownClass(cls) -> None: + cls.indexes_to_delete = [index['indexName'] for index in cls.indexes_to_test_on] + super().tearDownClass() + + @classmethod + def setUpClass(cls) -> None: + cls.indexes_to_delete = [index['indexName'] for index in cls.indexes_to_test_on] + super().setUpClass() + + + def prepare(self): + self.logger.debug(f"Creating indexes {self.indexes_to_test_on} in test case: {self.__class__.__name__}") + self.create_indexes(self.indexes_to_test_on) + + self.logger.debug(f'Feeding documents to {self.indexes_to_test_on}') + for index in self.indexes_to_test_on: + index_name = index['indexName'] + with self.subTest(indexName=index_name): + 
self.client.index(index_name = index['indexName']).add_documents(documents = self.text_docs, mappings = self.mappings, tensor_fields = self.tensor_fields) + + + all_results = {} + for index in self.indexes_to_test_on: + index_name = index['indexName'] + all_results[index_name] = {} + for docs in self.text_docs: + with self.subTest(indexName=index_name, doc_id = docs['_id']): + get_docs_result = self.client.index(index_name).get_document(document_id = docs['_id']) + all_results[index_name] = get_docs_result + + def test_update_doc(self): + self.logger.info(f"Running test_update_doc on {self.__class__.__name__}") + + test_failures = [] + + for index in self.indexes_to_test_on: + index_name = index['indexName'] + try: + with self.subTest(indexName = index_name): + result = self.client.index(index_name).update_documents( + self.partial_update_test_cases + ) + self.logger.debug(f"Printing result {result}") + + except Exception as e: + test_failures.append((index_name, traceback.format_exc())) + + assert result["index_name"] == self.unstructured_index_name + assert result["errors"] == False + + for test_cases in self.partial_update_test_cases: + doc_id = test_cases['_id'] + get_docs_result = self.client.index(index).get_document(document_id = doc_id) + self._assert_updates_have_happened(get_docs_result, test_cases) + + if test_failures: + failure_message = "\n".join([ + f"Failure in index {idx}, {error}" + for idx, error in test_failures + ]) + self.fail(f"Some subtests failed:\n{failure_message}") + + def _assert_updates_have_happened(self, result, partial_update_test_case): + """ + { + '_id': '1', + 'bool_field': False, + 'update_field_that_doesnt_exist': 500, + 'int_field': 1, + 'float_field': 500.0, + 'int_map': { + 'a': 2, # update int to int + }, + 'float_map': { + 'c': 3.0, # update float to int #TODO: This should work. 
+ }, + 'string_array': ["ccc"] + } + Args: + result: + partial_update_test_cases: + + Returns: + + """ + for field in partial_update_test_case: + if field == "_id": + continue + if isinstance(field, dict): + for key, value in field.items(): + key_in_result = key + '.' + value + if result.get(key_in_result) != partial_update_test_case.get(field).get(key): + self.fail(f"Field {key_in_result} does not match expected value {partial_update_test_case.get(field).get(key)}") + + if result.get(field) != partial_update_test_case.get(field): + self.fail(f"Field {field} does not match expected value {partial_update_test_case.get(field)}") diff --git a/tests/integ_tests/core/document/test_partial_document_update.py b/tests/integ_tests/core/document/test_partial_document_update.py index 1e53c1f88..ffde84e52 100644 --- a/tests/integ_tests/core/document/test_partial_document_update.py +++ b/tests/integ_tests/core/document/test_partial_document_update.py @@ -556,8 +556,8 @@ def test_update_array_text_field_filter(self): def test_update_a_document_that_does_not_exist(self): """""" updated_doc = { - "text_field": "updated text field", - "_id": "1" + "_id": "gibberish", + "text_field": "some value" } r = self.config.document.partial_update_documents_by_index_name( partial_documents=[updated_doc], @@ -566,7 +566,6 @@ def test_update_a_document_that_does_not_exist(self): self.assertEqual(True, r["errors"]) self.assertIn("Document does not exist in the index", r["items"][0]["error"]) self.assertEqual(404, r["items"][0]["status"]) - self.assertEqual(0, self.monitoring.get_index_stats_by_name(self.structured_index_name).number_of_documents) def test_update_a_document_without_id(self): updated_doc = { @@ -805,18 +804,6 @@ def test_proper_error_raised_if_received_too_many_documents(self): r = update_documents(body=UpdateDocumentsBodyParams(documents=[{"_id": "1"}] * 129), index_name=self.structured_index_name, marqo_config=self.config) - def 
test_proper_error_is_raised_for_unstructured_index(self): - updated_doc = { - "text_field_tensor": "I can't be updated", - "_id": "1" - } - with self.assertRaises(UnsupportedFeatureError) as cm: - r = self.config.document.partial_update_documents_by_index_name( - partial_documents=[updated_doc], - index_name=self.test_unstructured_index_name).dict(exclude_none=True, by_alias=True) - - self.assertIn("is not supported for unstructured indexes", str(cm.exception)) - def test_duplicate_ids_in_one_batch(self): """Test the behaviour when there are duplicate ids in a single batch. diff --git a/tests/integ_tests/core/document/test_partial_update_semi_structured.py b/tests/integ_tests/core/document/test_partial_update_semi_structured.py new file mode 100644 index 000000000..b7bafbcd5 --- /dev/null +++ b/tests/integ_tests/core/document/test_partial_update_semi_structured.py @@ -0,0 +1,784 @@ +from typing import List, Dict, Any + +import pytest + +from marqo.api.exceptions import InvalidFieldNameError +from marqo.core.models.add_docs_params import AddDocsParams +from marqo.tensor_search import tensor_search +from integ_tests.marqo_test import MarqoTestCase + +class TestPartialUpdate(MarqoTestCase): + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + + semi_structured_index_request = cls.unstructured_marqo_index_request(name='test_partial_update_semi_structured_14') + cls.create_indexes([semi_structured_index_request]) + cls.index = cls.indexes[0] + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + + def setUp(self) -> None: + super().setUp() + self.doc = { + '_id': '1', + "string_array": ["aaa", "bbb"], + "string_array2": ["123", "456"], + } + self.doc2 = { + '_id': '2', + 'tensor_field': 'title', + 'tensor_subfield': 'description', + "short_string_field": "shortstring", + "long_string_field": "Thisisaverylongstring" * 10, + "int_field": 123, + "float_field": 123.0, + "string_array": ["aaa", "bbb"], + "string_array2": ["123", "456"], + 
"int_map": {"a": 1, "b": 2}, + "float_map": {"c": 1.0, "d": 2.0}, + "bool_field": True, + "bool_field2": False, + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 32 + }, + "lexical_field": "some string that signifies lexical field" + } + self.doc3 = { + '_id': '3', + 'tensor_field': 'title', + 'tensor_subfield': 'description', + "short_string_field": "shortstring", + "long_string_field": "Thisisaverylongstring" * 10, + "int_field": 123, + "float_field": 123.0, + "int_map": {"a": 1, "b": 2}, + "float_map": {"c": 1.0, "d": 2.0}, + "bool_field": True, + "bool_field2": False, + "custom_vector_field": { + "content": "abcd", + "vector": [1.0] * 32 + } + } + self.id_to_doc = { + '1': self.doc, + '2': self.doc2, + '3': self.doc3 + } + self.add_documents(self.config, add_docs_params=AddDocsParams( + index_name=self.index.name, + docs=[self.doc, self.doc2, self.doc3], + tensor_fields=['tensor_field', 'custom_vector_field', 'multimodal_combo_field'], + mappings = { + "custom_vector_field": {"type": "custom_vector"}, + "multimodal_combo_field": { + "type": "multimodal_combination", + "weights": {"tensor_field": 1.0, "tensor_subfield": 2.0} + } + } + )) + self.index = self.config.index_management.get_index(self.index.name) + + def _assert_fields_unchanged(self, doc: Dict[str, Any], excluded_fields: List[str]): + """Verify that fields in the document remain unchanged except for the specified excluded fields. + + This helper method checks that all fields in the document match their expected values, + excluding the fields that were intentionally modified during the test. 
+ + Args: + doc: The document to check + excluded_fields: List of field names that were intentionally modified and should be excluded from verification + """ + doc_id = doc['_id'] + doc_to_compare_against = self.id_to_doc[doc_id] + for field, value in doc_to_compare_against.items(): + if field in excluded_fields: + continue + elif field == 'custom_vector_field': + self.assertEqual(value['content'], doc.get(field, None), f'{field} is changed.') + elif isinstance(value, dict): + for k, v in value.items(): + flattened_field_name = f'{field}.{k}' + if flattened_field_name in excluded_fields: + continue + self.assertEqual(v, doc.get(flattened_field_name, None), f'{flattened_field_name} is changed.') + else: + self.assertEqual(value, doc.get(field, None), f'{field} is changed.') + + # Test update single field + def test_partial_update_should_update_bool_field(self): + """Test that boolean fields can be updated correctly via partial updates. + + This test verifies that boolean fields can be updated for multiple documents + while ensuring other fields remain unchanged. + """ + test_docs = [self.doc, self.doc2] + + # First update the documents + for doc in test_docs: + with self.subTest(f"Updating document with ID {doc['_id']}"): + id = doc['_id'] + res = self.config.document.partial_update_documents([{'_id': id, 'bool_field': False}], self.index) + self.assertFalse(res.errors, f"Expected no errors when updating document {id}") + + # Then verify the updates + for doc in test_docs: + with self.subTest(f"Verifying document with ID {doc['_id']}"): + id = doc['_id'] + updated_doc = tensor_search.get_document_by_id(self.config, self.index.name, id) + self.assertFalse(updated_doc['bool_field'], f"Expected bool_field to be False for document {id}") + self._assert_fields_unchanged(updated_doc, ['bool_field']) + + def test_partial_update_should_update_int_field_to_int(self): + """Test that integer fields can be updated correctly via partial updates. 
+ + This test verifies that integer fields can be updated for multiple documents + while ensuring other fields remain unchanged. + """ + test_docs = [self.doc, self.doc2, self.doc3] + + # First update the documents + for doc in test_docs: + with self.subTest(f"Updating document with ID {doc['_id']}"): + id = doc['_id'] + res = self.config.document.partial_update_documents([{'_id': id, 'int_field': 500}], self.index) + self.assertFalse(res.errors, f"Expected no errors when updating document {id}") + + # Then verify the updates + for doc in test_docs: + with self.subTest(f"Verifying document with ID {doc['_id']}"): + id = doc['_id'] + updated_doc = tensor_search.get_document_by_id(self.config, self.index.name, id) + self.assertEqual(500, updated_doc['int_field'], f"Expected int_field to be 500 for document {id}") + self._assert_fields_unchanged(updated_doc, ['int_field']) + + def test_partial_update_to_non_existent_field(self): + """Test that partial updates to non-existent fields are successful. + + This test case basically verifies that we can add new fields via partial updates + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'update_field_that_doesnt_exist': 500}], self.index) + self.assertFalse(res.errors) + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(500, doc['update_field_that_doesnt_exist']) + self._assert_fields_unchanged(doc, ['update_field_that_doesnt_exist']) + + def test_partial_update_should_not_update_int_field_to_float(self): + """Test that partial updates to int fields are rejected when the value is a float. + + This test verifies that partial updates to int fields are rejected when the value is a float. 
+ """ + res = self.config.document.partial_update_documents([{'_id': '2', 'int_field': 1.0}], self.index) + self.assertTrue(res.errors) + self.assertIn('reference/api/documents/update-documents/#response', res.items[0].error) + self.assertIn("Marqo vector store couldn't update the document. Please see", res.items[0].error) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_should_update_float_field_to_float(self): + """Test that partial updates to float fields are successful. + + This test verifies that partial updates to float fields are successful. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'float_field': 500.0}], self.index) + self.assertFalse(res.errors) + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(500.0, doc['float_field']) + self._assert_fields_unchanged(doc, ['float_field']) + + def test_partial_update_should_update_int_map(self): + """Test that partial updates to int maps are successful. + + This test verifies that partial updates to int maps are successful. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'int_map': {'a': 2, 'b': 3}}], self.index) + self.assertFalse(res.errors) + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(doc['int_map.a'], 2) + self.assertEqual(doc['int_map.b'], 3) + self._assert_fields_unchanged(doc, ['int_map']) + + def test_partial_update_should_update_int_map_with_new_value(self): + """Test that partial updates to int maps with new values are successful. + + This test verifies that partial updates to int maps with new values are successful. 
+ """ + res = self.config.document.partial_update_documents([{'_id': '2', 'int_map': { + 'd': 2 + } + }], self.index) + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertIsNone(doc.get('int_map.a')) + self.assertIsNone(doc.get('int_map.b')) + self.assertEqual(doc['int_map.d'], 2) + + + def test_partial_update_should_update_float_map(self): + """Test that partial updates to float maps are successful. + + This test verifies that partial updates to float maps are successful. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'float_map': {'c': 2.0, 'd': 3.0}}], + self.index) + self.assertFalse(res.errors) + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(doc['float_map.c'], 2.0) + self.assertEqual(doc['float_map.d'], 3.0) + self._assert_fields_unchanged(doc, ['float_map']) + + + def test_partial_update_should_allow_changing_multiple_maps_in_same_request(self): + """Test that partial updates to multiple maps in the same request are successful. + + This test verifies that partial updates to multiple maps in the same request are successful. 
+ """ + + + res = self.config.document.partial_update_documents([{'_id': '2', 'int_field': 2, 'int_map': { + 'a': 2, # update int to int + }, 'float_map': { + 'c': 3.0, # update float to float + }, 'bool_field': False, 'float_field': 500.0}], self.index) + self.assertFalse(res.errors) + + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(2, doc['int_field']) + self.assertFalse(doc['bool_field']) + self.assertEqual(500.0, doc['float_field']) + self.assertEqual(doc['int_map.a'], 2) + self.assertIsNone(doc.get('int_map.b')) + self.assertEqual(doc['float_map.c'], 3.0) + self.assertIsNone(doc.get('float_map.d')) + self._assert_fields_unchanged(doc, ['int_map.a', 'int_map.b', 'float_map.d', 'float_map.c', 'int_field', 'bool_field', 'float_field']) + + def test_partial_update_should_update_string_array(self): + """Test that partial updates to string arrays are successful. + + This test verifies that partial updates to string arrays are successful. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'string_array': ["ccc"]}], self.index) + self.assertFalse(res.errors) + + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(["ccc"], doc['string_array']) + self._assert_fields_unchanged(doc, ['string_array']) + + def test_partial_update_should_reject_new_string_array_field(self): + """Test that partial updates to string arrays are successful. + + This test verifies that partial updates to string arrays are successful. 
+ """ + res = self.config.document.partial_update_documents([{'_id': '2', 'string_array3': ["ccc"]}], + self.index) + self.assertTrue(res.errors) + self.assertEqual(400, res.items[0].status) + self.assertIn('Unstructured index updates only support updating existing string array fields', res.items[0].error) + + def test_partial_update_should_allow_adding_new_string_string_array_field_if_present_in_other_docs_in_same_index(self): + """Tests that partial updates allow adding new string / string array fields if they are present in some other document in the same index. + + For example, doc2 contains lexical_field and string_array. Hence when we try to add lexical_field and string_array to doc1, it should be allowed. + """ + res = self.config.document.partial_update_documents([{'_id': '1', "lexical_field": "some value 2", 'string_array': ["ccc"]}], + self.config.index_management.get_index(self.index.name)) + self.assertFalse(res.errors) + doc = tensor_search.get_document_by_id(self.config, self.index.name, '1') + self.assertEqual("some value 2", doc['lexical_field']) + self.assertEqual(["ccc"], doc['string_array']) + + def test_partial_update_should_update_short_string(self): + """Test that partial updates to short strings are successful. + + This test verifies that partial updates to short strings are successful. + """ + index = self.config.index_management.get_index(self.index.name) + res = self.config.document.partial_update_documents( + [{'_id': '2', 'short_string_field': 'updated_short_string'}], index) + self.assertFalse(res.errors) + + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual('updated_short_string', doc['short_string_field']) + self._assert_fields_unchanged(doc, ['short_string_field']) + + def test_partial_update_should_update_long_string(self): + """Test that partial updates to long strings are successful. + + This test verifies that partial updates to long strings are successful. 
+ """ + index = self.config.index_management.get_index(self.index.name) + res = self.config.document.partial_update_documents( + [{'_id': '2', 'long_string_field': 'updated_long_string' * 10}], index) + self.assertFalse(res.errors) + + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual('updated_long_string' * 10, doc['long_string_field']) + self._assert_fields_unchanged(doc, ['long_string_field']) + + def test_partial_update_should_update_long_string_to_short_string(self): + """Test that partial updates to long strings are successful. + + This test verifies that partial updates to long strings are successful. + """ + res = tensor_search.search(self.config, self.index.name, text='*', + filter=f'long_string_field:{self.doc2["long_string_field"]}') + self.assertEqual(0, len(res['hits'])) + index = self.config.index_management.get_index(self.index.name) + + res = self.config.document.partial_update_documents([{'_id': '2', 'long_string_field': 'short'}], index) + self.assertFalse(res.errors) + + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual('short', doc['long_string_field']) + self._assert_fields_unchanged(doc, ['long_string_field']) + + res = tensor_search.search(self.config, self.index.name, text='*', filter=f'long_string_field:short') + self.assertEqual(1, len(res['hits'])) + + def test_partial_update_should_update_short_string_to_long_string(self): + """Test that partial updates to short strings are successful. + + This test verifies that partial updates to short strings are successful. 
+ """ + res = tensor_search.search(self.config, self.index.name, text='*', + filter=f'short_string_field:{self.doc2["short_string_field"]}') + self.assertEqual(2, len(res['hits'])) + + index = self.config.index_management.get_index(self.index.name) + + res = self.config.document.partial_update_documents([{'_id': '2', 'short_string_field': 'verylongstring'*10}], index) + self.assertFalse(res.errors) + + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual('verylongstring'*10, doc['short_string_field']) + self._assert_fields_unchanged(doc, ['short_string_field']) + + res = tensor_search.search(self.config, self.index.name, text='*', + filter=f'short_string_field:{self.doc3["short_string_field"]}') + self.assertEqual(1, len(res['hits'])) + + def test_partial_update_should_update_score_modifiers(self): + """Test that partial updates to score modifiers are successful. + + This test verifies that partial updates to score modifiers are successful. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'int_map': { + 'a': 2, # update int to int + 'd': 5, # new entry in int map + }, 'float_map': { + 'c': 3.0, # update float to float + }, 'new_int' : 1, # new int field + 'new_float': 2.0, # new float field + 'new_map': {'a': 1, 'b': 2.0}, # new map field + }], self.index) + self.assertFalse(res.errors) + res = self.config.vespa_client.get_document('2', self.config.index_management.get_index(self.index.name).schema_name) + doc = res.document.dict().get('fields') + self.assertEqual(doc['marqo__score_modifiers']['cells']['int_field'], 123.0) + self.assertEqual(doc['marqo__score_modifiers']['cells']['float_field'], 123.0) + self.assertEqual(doc['marqo__score_modifiers']['cells']['int_map.a'], 2.0) + self.assertEqual(doc['marqo__score_modifiers']['cells']['float_map.c'], 3.0) + self.assertEqual(doc['marqo__score_modifiers']['cells'].get('int_map.b', None), None) + 
self.assertEqual(doc['marqo__score_modifiers']['cells'].get('float_map.d', None), None) + self.assertEqual(doc['marqo__score_modifiers']['cells']['new_int'], 1.0) + self.assertEqual(doc['marqo__score_modifiers']['cells']['new_float'], 2.0) + self.assertEqual(doc['marqo__score_modifiers']['cells']['new_map.a'], 1.0) + self.assertEqual(doc['marqo__score_modifiers']['cells']['new_map.b'], 2.0) + self.assertEqual(doc['marqo__score_modifiers']['cells']['int_map.d'], 5.0) + + + def test_partial_update_should_add_new_fields(self): + """Test that partial updates to new fields are successful. + + This test verifies that partial updates to new fields are successful. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'new_field': 500, 'new_float': 500.0, + 'new_int_map':{'a':2}, + 'new_bool_field': True, + 'new_float_field': 10.0 + }], self.config.index_management.get_index(self.index.name)) + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self._assert_fields_unchanged(doc, []) + self.assertEqual(500.0, doc['new_float']) + self.assertEqual(2, doc['new_int_map.a']) + self.assertEqual(500, doc['new_field']) + self.assertEqual(True, doc['new_bool_field']) + self.assertEqual(10.0, doc['new_float_field']) + + # Reject any tensor field change + def test_partial_update_should_reject_tensor_field(self): + """Test that partial updates to tensor fields are rejected. + + This test verifies that partial updates to tensor fields are rejected. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'tensor_field': 'new_title'}], self.index) + self.assertTrue(res.errors) + self.assertIn('reference/api/documents/update-documents/#response', res.items[0].error) + self.assertIn("Marqo vector store couldn't update the document. 
Please see", res.items[0].error) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_should_reject_multi_modal_field_subfield(self): + """Test that partial updates to tensor subfields are rejected. + + This test verifies that partial updates to tensor subfields are rejected. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'tensor_subfield': 'new_description'}], self.index) + self.assertTrue(res.errors) + self.assertIn('reference/api/documents/update-documents/#response', res.items[0].error) + self.assertIn("Marqo vector store couldn't update the document. Please see", res.items[0].error) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_should_reject_custom_vector_field(self): + """Test that partial updates to custom vector fields are rejected. + + This test verifies that partial updates to custom vector fields are rejected. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'custom_vector_field': { + "content": "efgh", + "vector": [1.0] * 32 + }}], self.index) + self.assertTrue(res.errors) + self.assertEqual(400, res.items[0].status) + self.assertIn("Unsupported field type for field custom_vector_field in doc 2. " + "We only support int and float types for map values when updating a document", res.items[0].error) + + def test_partial_update_should_reject_multimodal_combo_field(self): + """Test that partial updates to multimodal combo fields are rejected. + + This test verifies that partial updates to multimodal combo fields are rejected. 
+ """ + res = self.config.document.partial_update_documents([{'_id': '2', 'multimodal_combo_field': { + "tensor_field": "new_title", + "tensor_subfield": "new_description" + }}], self.index) + self.assertTrue(res.errors) + self.assertIn("Unsupported field type for field multimodal_combo_field in doc 2", res.items[0].error) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_should_reject_numeric_array_field_type(self): + """Test that partial updates to numeric array fields are rejected. + + This test verifies that partial updates to numeric array fields are rejected. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'int_array': [1, 2, 3]}], self.index) + self.assertTrue(res.errors) + self.assertIn("Unstructured index updates only support updating existing string array fields", res.items[0].error) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_should_reject_new_lexical_field(self): + """Test that partial updates to new lexical fields are rejected. + + This test verifies that partial updates to new lexical fields are rejected. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'new_lexical_field': 'some string that signifies new lexical field'}], self.index) + self.assertTrue(res.errors) + self.assertIn("new_lexical_field of type str does not exist in the original document. We do not support adding new lexical fields in partial updates", res.items[0].error) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_invalid_field_name(self): + """Test that partial updates to invalid field names are rejected. + + This test verifies that partial updates to invalid field names are rejected. 
+ """ + with pytest.raises(InvalidFieldNameError): + res = self.config.document.partial_update_documents([{'_id': '2', 'marqo__': 1}], self.index) + + + def test_partial_update_should_handle_mixed_numeric_map_updates(self): + """Test updating maps with mix of additions and removals + + This test verifies that partial updates can correctly handle numeric maps + with a mixture of operations: + 1. Updating existing key-value pairs + 2. Adding new key-value pairs + + The test performs updates on both integer maps and float maps, then + verifies that all changes were applied correctly by retrieving the + document and checking each individual key-value pair. + """ + res = self.config.document.partial_update_documents([{ + '_id': '2', + 'int_map': { + 'a': 10, # Update existing + 'c': 3, # Add new + 'b': 20 # Update existing + }, + 'float_map': { + 'c': 10.5, # Update existing + 'e': 5.5 # Add new + } + }], self.index) + self.assertFalse(res.errors) + + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(10, doc['int_map.a']) + self.assertEqual(20, doc['int_map.b']) + self.assertEqual(3, doc['int_map.c']) + self.assertEqual(10.5, doc['float_map.c']) + self.assertEqual(5.5, doc['float_map.e']) + self.assertEqual(None, doc.get('float_map.d', None)) + self._assert_fields_unchanged(doc, ['int_map.a', 'int_map.b', 'int_map.c', 'float_map.c', 'float_map.e', 'float_map.d']) + + def test_partial_update_should_reject_invalid_map_values(self): + """Test rejection of invalid value types in numeric maps + + This test verifies that partial updates reject invalid value types in numeric maps. 
+ """ + res = self.config.document.partial_update_documents([{ + '_id': '2', + 'int_map': { + 'a': 'string', # Invalid type + 'b': 2.5, # Invalid type + 'c': True # Invalid type + } + }], self.index) + self.assertTrue(res.errors) + self.assertIn("Unsupported field type for field int_map in doc 2", res.items[0].error) + self.assertEqual(400, res.items[0].status) + + # Verify original values unchanged + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(1, doc['int_map.a']) + self.assertEqual(2, doc['int_map.b']) + self._assert_fields_unchanged(doc, ['int_map.a', 'int_map.b']) + + def test_partial_update_should_handle_multiple_docs(self): + """Test updating multiple documents in one request + + This test verifies that partial updates can correctly handle multiple documents + in a single request. + """ + updates = [ + { + '_id': '2', + 'int_field': 1000, + 'float_map': {'c': 99.9} + }, + { + '_id': '3', + 'bool_field': False, + 'int_map': {'a': 777} + } + ] + res = self.config.document.partial_update_documents(updates, self.index) + self.assertFalse(res.errors) + + # Verify updates + doc2 = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(1000, doc2['int_field']) + self.assertEqual(99.9, doc2['float_map.c']) + self._assert_fields_unchanged(doc2, ['int_field', 'float_map.c', 'float_map.d']) + + doc3 = tensor_search.get_document_by_id(self.config, self.index.name, '3') + self.assertFalse(doc3['bool_field']) + self.assertEqual(777, doc3['int_map.a']) + self._assert_fields_unchanged(doc3, ['bool_field', 'int_map.a', 'int_map.b']) + + def test_partial_update_should_handle_duplicate_doc_ids(self): + """Test handling of duplicate document IDs in update request + + This test verifies that partial updates can correctly handle duplicate document IDs + in an update request. 
+ """ + updates = [ + { + '_id': '2', + 'int_field': 100 + }, + { + '_id': '2', + 'int_field': 200 + } + ] + res = self.config.document.partial_update_documents(updates, self.index) + self.assertFalse(res.errors) + + # Verify last update wins + doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(200, doc['int_field']) + self._assert_fields_unchanged(doc, ['int_field']) + + def test_partial_update_should_handle_non_existent_doc_id(self): + """Test updating non-existent document + + This test verifies that partial updates can correctly handle non-existent document IDs. + """ + res = self.config.document.partial_update_documents([{ + '_id': 'non_existent', + 'int_field': 100 + }], self.index) + self.assertTrue(res.errors) + self.assertIn('reference/api/documents/update-documents/#response', res.items[0].error) + self.assertIn("Marqo vector store couldn't update the document. Please see", res.items[0].error) + + def test_partial_update_should_handle_none_id(self): + """Test handling of None _id field + + This test verifies that partial updates can correctly handle None document IDs. + """ + res = self.config.document.partial_update_documents([{ + '_id': None, + 'int_field': 100 + }], self.index) + self.assertTrue(res.errors) + self.assertIn('document _id must be a string type! received _id none of type `nonetype`', res.items[0].error.lower()) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_should_handle_missing_id(self): + """Test handling of document without _id field + + This test verifies that partial updates can correctly handle documents + without an _id field. 
+ """ + res = self.config.document.partial_update_documents([{ + 'int_field': 100 + }], self.index) + self.assertTrue(res.errors) + self.assertIn("'_id' is a required field", res.items[0].error.lower()) + self.assertEqual(400, res.items[0].status) + + def test_partial_update_should_handle_empty_update_list(self): + """Test handling of empty document list + + This test verifies that partial updates can correctly handle empty document lists. + """ + res = self.config.document.partial_update_documents([], self.index) + self.assertFalse(res.errors) + self.assertEqual(0, len(res.items)) + + def test_partial_update_should_handle_mixed_valid_invalid_docs(self): + """Test batch with mix of valid and invalid documents""" + updates = [ + { + '_id': '2', + 'int_field': 100 + }, + { + '_id': '3', + 'bool_field': True + }, + { + 'missing_id': True + } + ] + res = self.config.document.partial_update_documents(updates, self.index) + self.assertTrue(res.errors) + + # Verify valid updates succeeded + doc2 = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertEqual(100, doc2['int_field']) + self._assert_fields_unchanged(doc2, ['int_field']) + + doc3 = tensor_search.get_document_by_id(self.config, self.index.name, '3') + self.assertTrue(doc3['bool_field']) + self._assert_fields_unchanged(doc2, ['bool_field','int_field']) + + self.assertEqual(3, len(res.items)) + self.assertFalse(res.items[0].error) # Valid doc + self.assertFalse(res.items[1].error) # Valid doc + + # Verify error responses for invalid docs + self.assertIn("'_id' is a required field", res.items[2].error) # Missing ID + + def test_partial_update_should_handle_nested_maps(self): + """Test handling of nested maps in updates + + This test verifies that partial updates can correctly handle nested maps. 
+ """ + res = self.config.document.partial_update_documents([{ + '_id': '2', + 'int_map': { + 'nested': { + 'too': 'deep' + } + } + }], self.index) + self.assertTrue(res.errors) + self.assertEqual(400, res.items[0].status) + self.assertIn('unsupported field type', res.items[0].error.lower()) + + def test_partial_update_should_handle_empty_string_id(self): + """Test handling of empty string as document ID + + This test verifies that partial updates can correctly handle empty string document IDs. + """ + res = self.config.document.partial_update_documents([{ + '_id': '', + 'int_field': 100 + }], self.index) + self.assertTrue(res.errors) + self.assertIn("document id can't be empty", res.items[0].error.lower()) + + def test_partial_update_should_handle_random_dict_field(self): + """Test handling of random dictionary fields + + This test verifies that partial updates can correctly handle random dictionary fields. + """ + res = self.config.document.partial_update_documents( + [{ + '_id': '2', + "random_field": { + "content1": "abcd", + "content2": "efgh" + } + }], self.index) + self.assertTrue(res.errors) + self.assertIn('Unsupported field type', res.items[0].error) + + def test_partial_update_should_handle_random_field_type(self): + """Test handling of random field types + + This test verifies that partial updates can correctly handle random field types. + """ + res = self.config.document.partial_update_documents( + [{ + '_id': '2', + "random_field": None + }], self.index) + self.assertTrue(res.errors) + self.assertIn('Unsupported field type', res.items[0].error) + + def test_partial_update_should_handle_empty_dict_field(self): + """Test handling of empty dictionary fields + + This test verifies that partial updates can correctly handle empty dictionary fields. 
+ """ + res = self.config.document.partial_update_documents( + [{ + '_id': '2', + "float_map": {} + }], self.index + ) + self.assertFalse(res.errors) + updated_doc = tensor_search.get_document_by_id(self.config, self.index.name, '2') + self.assertIsNone(updated_doc.get('float_map.c', None)) + self.assertIsNone(updated_doc.get('float_map.d', None)) + self._assert_fields_unchanged(updated_doc, ['float_map.c', 'float_map.d']) + + def test_partial_update_should_reject_updating_dict_to_int_field(self): + """Test that partial updates to dictionary fields are rejected when the value is an integer. + + This test verifies that partial updates to dictionary fields are rejected when the value is an integer. + """ + res = self.config.document.partial_update_documents([ + { + '_id': '2', + "float_map": 100 + } + ], self.index) + self.assertTrue(res.errors) + self.assertIn("Marqo vector store couldn't update the document. Please see", res.items[0].error) + self.assertIn('reference/api/documents/update-documents/#response', res.items[0].error) + + def test_updating_int_map_to_int(self): + """Test that partial updates to int maps are successful. + + This test verifies that partial updates to int maps are rejected. + """ + res = self.config.document.partial_update_documents([{'_id': '2', 'int_map': 100}], self.index) + self.assertIn('reference/api/documents/update-documents/#response', res.items[0].error) + self.assertIn("Marqo vector store couldn't update the document. 
Please see", res.items[0].error) + self.assertTrue(res.errors) + self.assertEqual(400, res.items[0].status) diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_mutliple_lexical_tensor_fields.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_multiple_lexical_tensor_fields.sd similarity index 85% rename from tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_mutliple_lexical_tensor_fields.sd rename to tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_multiple_lexical_tensor_fields.sd index dbbac95f5..0f5b73cd6 100644 --- a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_mutliple_lexical_tensor_fields.sd +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_multiple_lexical_tensor_fields.sd @@ -110,7 +110,6 @@ schema marqo__test_00semi_00structured_00schema { fieldset default { fields: marqo__lexical_text_field1, marqo__lexical_text_field2 } - rank-profile base_rank_profile inherits default { inputs { query(marqo__lexical_text_field1): 0 @@ -123,10 +122,24 @@ schema marqo__test_00semi_00structured_00schema { query(marqo__add_weights_lexical) tensor(p{}) query(marqo__mult_weights_tensor) tensor(p{}) query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) } + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } function modify(score, mult_weights, add_weights) { - expression: if (count(mult_weights * 
attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) * score + reduce(add_weights * attribute(marqo__score_modifiers), sum) + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) } function lexical_score_sum() { @@ -147,6 +160,8 @@ schema marqo__test_00semi_00structured_00schema { function embedding_score() { expression: max(if (query(marqo__embeddings_tensor_field1) > 0, closeness(field, marqo__embeddings_tensor_field1), 0), if (query(marqo__embeddings_tensor_field2) > 0, closeness(field, marqo__embeddings_tensor_field2), 0)) } + + match-features: global_mult_modifier global_add_modifier } rank-profile bm25 inherits base_rank_profile { @@ -158,8 +173,14 @@ schema marqo__test_00semi_00structured_00schema { first-phase { expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) } - match-features: closest(marqo__embeddings_tensor_field1) closest(marqo__embeddings_tensor_field2) distance(field, marqo__embeddings_tensor_field1) distance(field, marqo__embeddings_tensor_field2) + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field1) + closest(marqo__embeddings_tensor_field2) + distance(field, marqo__embeddings_tensor_field1) + distance(field, marqo__embeddings_tensor_field2) + } } + rank-profile hybrid_custom_searcher inherits default { inputs { query(marqo__fields_to_rank_lexical) tensor(p{}) @@ -169,6 +190,8 @@ schema marqo__test_00semi_00structured_00schema { query(marqo__add_weights_lexical) tensor(p{}) query(marqo__mult_weights_tensor) tensor(p{}) query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) } } @@ -179,7 +202,13 @@ 
schema marqo__test_00semi_00structured_00schema { second-phase { expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) } - match-features: closest(marqo__embeddings_tensor_field1) closest(marqo__embeddings_tensor_field2) distance(field, marqo__embeddings_tensor_field1) distance(field, marqo__embeddings_tensor_field2)} + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field1) + closest(marqo__embeddings_tensor_field2) + distance(field, marqo__embeddings_tensor_field1) + distance(field, marqo__embeddings_tensor_field2) + } + } rank-profile hybrid_embedding_similarity_then_bm25 inherits base_rank_profile { first-phase { diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_no_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_no_field.sd new file mode 100644 index 000000000..e614ebe15 --- /dev/null +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_no_field.sd @@ -0,0 +1,127 @@ +schema marqo__test_00semi_00structured_00schema { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute 
+ attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__vector_count type int { + indexing: attribute | summary + } + } + + rank-profile base_rank_profile inherits default { + inputs { + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) + } + + function embedding_score() { + expression: 0 + } + + match-features: global_mult_modifier global_add_modifier + } + + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), 
query(marqo__add_weights_tensor)) + } + + } + + + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + } +} \ No newline at end of file diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_lexical_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_lexical_field.sd new file mode 100644 index 000000000..d24e0bbad --- /dev/null +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_lexical_field.sd @@ -0,0 +1,157 @@ +schema marqo__test_00semi_00structured_00schema { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + 
struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__lexical_text_field type string { + indexing: index | summary + index: enable-bm25 + } + field marqo__vector_count type int { + indexing: attribute | summary + } + } + + fieldset default { + fields: marqo__lexical_text_field + } + rank-profile base_rank_profile inherits default { + inputs { + query(marqo__lexical_text_field): 0 + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) + } + + function lexical_score_sum() { + expression: if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0) + } + + function lexical_score_avg() { + expression: 
(if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0)) / max(1, if (query(marqo__lexical_text_field) > 0, 1, 0)) + } + + function lexical_score_max() { + expression: if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0) + } + + function lexical_score() { + expression: if (query(marqo__bm25_aggregator) == 0, lexical_score_sum(), if (query(marqo__bm25_aggregator) == 1, lexical_score_avg(), lexical_score_max())) + } + function embedding_score() { + expression: 0 + } + + match-features: global_mult_modifier global_add_modifier + } + + rank-profile bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + + } + + + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary text_field type string {source: marqo__lexical_text_field} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary text_field type string {source: marqo__lexical_text_field} + } +} \ No newline at end of file diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd new file mode 100644 index 000000000..8080fe570 --- /dev/null +++ 
b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd @@ -0,0 +1,211 @@ +schema marqo__test_00semi_00structured_00schema { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__lexical_text_field type string { + indexing: index | summary + index: enable-bm25 + } + field marqo__chunks_tensor_field type array { + indexing: summary + } + + field marqo__embeddings_tensor_field type tensor(p{}, x[32]) { + indexing: attribute | index | summary + attribute { + distance-metric: prenormalized-angular + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + field marqo__vector_count 
type int { + indexing: attribute | summary + } + } + + fieldset default { + fields: marqo__lexical_text_field + } + rank-profile base_rank_profile inherits default { + inputs { + query(marqo__lexical_text_field): 0 + query(marqo__embeddings_tensor_field): 0 + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) + } + + function lexical_score_sum() { + expression: if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0) + } + + function lexical_score_avg() { + expression: (if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0)) / max(1, if (query(marqo__lexical_text_field) > 0, 1, 0)) + } + + function lexical_score_max() { + expression: if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0) + } + + function lexical_score() { + expression: if (query(marqo__bm25_aggregator) == 0, lexical_score_sum(), if (query(marqo__bm25_aggregator) == 1, lexical_score_avg(), lexical_score_max())) + } + function embedding_score() { + expression: if 
(query(marqo__embeddings_tensor_field) > 0, closeness(field, marqo__embeddings_tensor_field), 0) + } + + match-features: global_mult_modifier global_add_modifier + } + + rank-profile bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field) + distance(field, marqo__embeddings_tensor_field) + } + } + + rank-profile hybrid_custom_searcher inherits default { + inputs { + query(marqo__fields_to_rank_lexical) tensor(p{}) + query(marqo__fields_to_rank_tensor) tensor(p{}) + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + } + + rank-profile hybrid_bm25_then_embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + second-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field) + distance(field, marqo__embeddings_tensor_field) + } + } + + rank-profile hybrid_embedding_similarity_then_bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary 
marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary text_field type string {source: marqo__lexical_text_field} + summary marqo__chunks_tensor_field type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary text_field type string {source: marqo__lexical_text_field} + summary marqo__chunks_tensor_field type array {} + summary marqo__embeddings_tensor_field type tensor(p{}, x[32]) {} + } +} \ No newline at end of file diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_tensor_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_tensor_field.sd new file mode 100644 index 000000000..873e15d2e --- /dev/null +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/pre_2_16/semi_structured_vespa_index_schema_one_tensor_field.sd @@ -0,0 +1,150 @@ +schema marqo__test_00semi_00structured_00schema { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search 
+ rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__string_array type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__chunks_tensor_field type array { + indexing: summary + } + + field marqo__embeddings_tensor_field type tensor(p{}, x[32]) { + indexing: attribute | index | summary + attribute { + distance-metric: prenormalized-angular + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + field marqo__vector_count type int { + indexing: attribute | summary + } + } + + rank-profile base_rank_profile inherits default { + inputs { + query(marqo__embeddings_tensor_field): 0 + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: 
mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) + } + + function embedding_score() { + expression: if (query(marqo__embeddings_tensor_field) > 0, closeness(field, marqo__embeddings_tensor_field), 0) + } + + match-features: global_mult_modifier global_add_modifier + } + + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field) + distance(field, marqo__embeddings_tensor_field) + } + } + + + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks_tensor_field type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__string_array type array {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__chunks_tensor_field type array {} + summary marqo__embeddings_tensor_field type tensor(p{}, x[32]) {} + } +} \ No newline at end of file diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_multiple_lexical_tensor_and_string_array_fields.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_multiple_lexical_tensor_and_string_array_fields.sd new file mode 100644 index 000000000..9f8554551 --- /dev/null +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_multiple_lexical_tensor_and_string_array_fields.sd @@ -0,0 +1,267 @@ +schema 
marqo__test_00semi_00structured_00schema { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__lexical_text_field1 type string { + indexing: index | summary + index: enable-bm25 + } + field marqo__lexical_text_field2 type string { + indexing: index | summary + index: enable-bm25 + } + field marqo__string_array_string_array_1 type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + field marqo__string_array_string_array_2 type array { + indexing: 
attribute | summary + attribute: fast-search + rank: filter + } + field marqo__chunks_tensor_field1 type array { + indexing: summary + } + + field marqo__embeddings_tensor_field1 type tensor(p{}, x[32]) { + indexing: attribute | index | summary + attribute { + distance-metric: prenormalized-angular + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + field marqo__chunks_tensor_field2 type array { + indexing: summary + } + + field marqo__embeddings_tensor_field2 type tensor(p{}, x[32]) { + indexing: attribute | index | summary + attribute { + distance-metric: prenormalized-angular + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + field marqo__vector_count type int { + indexing: attribute | summary + } + } + + fieldset default { + fields: marqo__lexical_text_field1, marqo__lexical_text_field2 + } + rank-profile base_rank_profile inherits default { + inputs { + query(marqo__lexical_text_field1): 0 + query(marqo__lexical_text_field2): 0 + query(marqo__embeddings_tensor_field1): 0 + query(marqo__embeddings_tensor_field2): 0 + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + 
expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) + } + + function lexical_score_sum() { + expression: if (query(marqo__lexical_text_field1) > 0, bm25(marqo__lexical_text_field1), 0) + if (query(marqo__lexical_text_field2) > 0, bm25(marqo__lexical_text_field2), 0) + } + + function lexical_score_avg() { + expression: (if (query(marqo__lexical_text_field1) > 0, bm25(marqo__lexical_text_field1), 0) + if (query(marqo__lexical_text_field2) > 0, bm25(marqo__lexical_text_field2), 0)) / max(1, if (query(marqo__lexical_text_field1) > 0, 1, 0) + if (query(marqo__lexical_text_field2) > 0, 1, 0)) + } + + function lexical_score_max() { + expression: max(if (query(marqo__lexical_text_field1) > 0, bm25(marqo__lexical_text_field1), 0), if (query(marqo__lexical_text_field2) > 0, bm25(marqo__lexical_text_field2), 0)) + } + + function lexical_score() { + expression: if (query(marqo__bm25_aggregator) == 0, lexical_score_sum(), if (query(marqo__bm25_aggregator) == 1, lexical_score_avg(), lexical_score_max())) + } + function embedding_score() { + expression: max(if (query(marqo__embeddings_tensor_field1) > 0, closeness(field, marqo__embeddings_tensor_field1), 0), if (query(marqo__embeddings_tensor_field2) > 0, closeness(field, marqo__embeddings_tensor_field2), 0)) + } + + match-features: global_mult_modifier global_add_modifier + } + + rank-profile bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field1) + closest(marqo__embeddings_tensor_field2) + distance(field, 
marqo__embeddings_tensor_field1) + distance(field, marqo__embeddings_tensor_field2) + } + } + + rank-profile hybrid_custom_searcher inherits default { + inputs { + query(marqo__fields_to_rank_lexical) tensor(p{}) + query(marqo__fields_to_rank_tensor) tensor(p{}) + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + } + + rank-profile hybrid_bm25_then_embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + second-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field1) + closest(marqo__embeddings_tensor_field2) + distance(field, marqo__embeddings_tensor_field1) + distance(field, marqo__embeddings_tensor_field2) + } + } + + rank-profile hybrid_embedding_similarity_then_bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__string_array_string_array_1 type array {source: marqo__string_array_string_array_1} + summary marqo__string_array_string_array_2 type array {source: marqo__string_array_string_array_2} + + summary text_field1 type string {source: marqo__lexical_text_field1} + summary text_field2 type string {source: marqo__lexical_text_field2} + 
summary marqo__chunks_tensor_field1 type array {} + summary marqo__chunks_tensor_field2 type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__string_array_string_array_1 type array {source: marqo__string_array_string_array_1} + summary marqo__string_array_string_array_2 type array {source: marqo__string_array_string_array_2} + + summary text_field1 type string {source: marqo__lexical_text_field1} + summary text_field2 type string {source: marqo__lexical_text_field2} + summary marqo__chunks_tensor_field1 type array {} + summary marqo__embeddings_tensor_field1 type tensor(p{}, x[32]) {} + summary marqo__chunks_tensor_field2 type array {} + summary marqo__embeddings_tensor_field2 type tensor(p{}, x[32]) {} + } +} \ No newline at end of file diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_multiple_lexical_tensor_fields.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_multiple_lexical_tensor_fields.sd index d377dda82..ed436cdd7 100644 --- a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_multiple_lexical_tensor_fields.sd +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_multiple_lexical_tensor_fields.sd @@ -6,6 +6,21 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + field 
marqo__int_fields type map { indexing: summary struct-field key { indexing : attribute @@ -46,12 +61,6 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } } - field marqo__string_array type array { - indexing: attribute | summary - attribute: fast-search - rank: filter - } - field marqo__score_modifiers type tensor(p{}) { indexing: attribute | summary } @@ -216,10 +225,11 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-non-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} + summary text_field1 type string {source: marqo__lexical_text_field1} summary text_field2 type string {source: marqo__lexical_text_field2} summary marqo__chunks_tensor_field1 type array {} @@ -228,10 +238,11 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} + summary text_field1 type string {source: marqo__lexical_text_field1} summary text_field2 type string {source: marqo__lexical_text_field2} summary marqo__chunks_tensor_field1 type array {} diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_no_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_no_field.sd index e614ebe15..96601ef5c 100644 --- a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_no_field.sd +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_no_field.sd @@ -6,6 +6,21 @@ schema 
marqo__test_00semi_00structured_00schema { rank: filter } + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + field marqo__int_fields type map { indexing: summary struct-field key { indexing : attribute @@ -46,12 +61,6 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } } - field marqo__string_array type array { - indexing: attribute | summary - attribute: fast-search - rank: filter - } - field marqo__score_modifiers type tensor(p{}) { indexing: attribute | summary } @@ -111,7 +120,7 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-non-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} @@ -119,7 +128,7 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_field.sd index d24e0bbad..6d3e5989e 100644 --- a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_field.sd +++ 
b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_field.sd @@ -6,6 +6,21 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + field marqo__int_fields type map { indexing: summary struct-field key { indexing : attribute @@ -46,12 +61,6 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } } - field marqo__string_array type array { - indexing: attribute | summary - attribute: fast-search - rank: filter - } - field marqo__score_modifiers type tensor(p{}) { indexing: attribute | summary } @@ -139,7 +148,7 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-non-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} @@ -148,7 +157,7 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd index 8080fe570..e1215898a 100644 --- 
a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd @@ -6,6 +6,21 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + field marqo__int_fields type map { indexing: summary struct-field key { indexing : attribute @@ -46,12 +61,6 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } } - field marqo__string_array type array { - indexing: attribute | summary - attribute: fast-search - rank: filter - } - field marqo__score_modifiers type tensor(p{}) { indexing: attribute | summary } @@ -190,20 +199,22 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-non-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} + summary text_field type string {source: marqo__lexical_text_field} summary marqo__chunks_tensor_field type array {} } document-summary all-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} + summary text_field type string {source: marqo__lexical_text_field} summary marqo__chunks_tensor_field type array {} summary marqo__embeddings_tensor_field type tensor(p{}, x[32]) {} diff 
--git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_one_tensor_one_string_array_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_one_tensor_one_string_array_field.sd new file mode 100644 index 000000000..f89a06d5c --- /dev/null +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_lexical_one_tensor_one_string_array_field.sd @@ -0,0 +1,229 @@ +schema marqo__test_00semi_00structured_00schema { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field 
marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__lexical_text_field type string { + indexing: index | summary + index: enable-bm25 + } + field marqo__string_array_string_array_1 type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + field marqo__chunks_tensor_field type array { + indexing: summary + } + + field marqo__embeddings_tensor_field type tensor(p{}, x[32]) { + indexing: attribute | index | summary + attribute { + distance-metric: prenormalized-angular + } + index { + hnsw { + max-links-per-node: 16 + neighbors-to-explore-at-insert: 512 + } + } + } + field marqo__vector_count type int { + indexing: attribute | summary + } + } + + fieldset default { + fields: marqo__lexical_text_field + } + rank-profile base_rank_profile inherits default { + inputs { + query(marqo__lexical_text_field): 0 + query(marqo__embeddings_tensor_field): 0 + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: 
add_modifier(query(marqo__add_weights_global)) + } + + function lexical_score_sum() { + expression: if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0) + } + + function lexical_score_avg() { + expression: (if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0)) / max(1, if (query(marqo__lexical_text_field) > 0, 1, 0)) + } + + function lexical_score_max() { + expression: if (query(marqo__lexical_text_field) > 0, bm25(marqo__lexical_text_field), 0) + } + + function lexical_score() { + expression: if (query(marqo__bm25_aggregator) == 0, lexical_score_sum(), if (query(marqo__bm25_aggregator) == 1, lexical_score_avg(), lexical_score_max())) + } + function embedding_score() { + expression: if (query(marqo__embeddings_tensor_field) > 0, closeness(field, marqo__embeddings_tensor_field), 0) + } + + match-features: global_mult_modifier global_add_modifier + } + + rank-profile bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field) + distance(field, marqo__embeddings_tensor_field) + } + } + + rank-profile hybrid_custom_searcher inherits default { + inputs { + query(marqo__fields_to_rank_lexical) tensor(p{}) + query(marqo__fields_to_rank_tensor) tensor(p{}) + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + } + + rank-profile 
hybrid_bm25_then_embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + second-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + match-features inherits base_rank_profile { + closest(marqo__embeddings_tensor_field) + distance(field, marqo__embeddings_tensor_field) + } + } + + rank-profile hybrid_embedding_similarity_then_bm25 inherits base_rank_profile { + first-phase { + expression: modify(lexical_score(), query(marqo__mult_weights_lexical), query(marqo__add_weights_lexical)) + } + } + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__string_array_string_array_1 type array {source: marqo__string_array_string_array_1} + + summary text_field type string {source: marqo__lexical_text_field} + summary marqo__chunks_tensor_field type array {} + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__string_array_string_array_1 type array {source: marqo__string_array_string_array_1} + + summary text_field type string {source: marqo__lexical_text_field} + summary marqo__chunks_tensor_field type array {} + summary marqo__embeddings_tensor_field type tensor(p{}, x[32]) {} + } +} \ No newline at end of file diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_string_array_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_string_array_field.sd new 
file mode 100644 index 000000000..4c852b3f1 --- /dev/null +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_string_array_field.sd @@ -0,0 +1,145 @@ +schema marqo__test_00semi_00structured_00schema { + document { + field marqo__id type string { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__int_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__bool_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__float_fields type map { + indexing: summary + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__short_string_fields type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + + field marqo__score_modifiers type tensor(p{}) { + indexing: attribute | summary + } + + field marqo__multimodal_params type map { + indexing: summary + } + + field marqo__string_array_string_array_1 type array { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + field marqo__vector_count type int { + indexing: attribute | 
summary + } + } + + rank-profile base_rank_profile inherits default { + inputs { + query(marqo__bm25_aggregator): 0 + query(marqo__query_embedding) tensor(x[32]) + query(marqo__mult_weights_lexical) tensor(p{}) + query(marqo__add_weights_lexical) tensor(p{}) + query(marqo__mult_weights_tensor) tensor(p{}) + query(marqo__add_weights_tensor) tensor(p{}) + query(marqo__mult_weights_global) tensor(p{}) + query(marqo__add_weights_global) tensor(p{}) + } + + function mult_modifier(mult_weights) { + expression: if (count(mult_weights * attribute(marqo__score_modifiers)) == 0, 1, reduce(mult_weights * attribute(marqo__score_modifiers), prod)) + } + function add_modifier(add_weights) { + expression: reduce(add_weights * attribute(marqo__score_modifiers), sum) + } + function modify(score, mult_weights, add_weights) { + expression: mult_modifier(mult_weights) * score + add_modifier(add_weights) + } + function global_mult_modifier() { + expression: mult_modifier(query(marqo__mult_weights_global)) + } + function global_add_modifier() { + expression: add_modifier(query(marqo__add_weights_global)) + } + + function embedding_score() { + expression: 0 + } + + match-features: global_mult_modifier global_add_modifier + } + + rank-profile embedding_similarity inherits base_rank_profile { + first-phase { + expression: modify(embedding_score(), query(marqo__mult_weights_tensor), query(marqo__add_weights_tensor)) + } + + } + + + + document-summary all-non-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary marqo__float_fields type map {} + summary marqo__string_array_string_array_1 type array {source: marqo__string_array_string_array_1} + + } + + document-summary all-vector-summary { + summary marqo__id type string {} + summary marqo__field_types type map {} + summary marqo__bool_fields type map {} + summary marqo__int_fields type map {} + summary 
marqo__float_fields type map {} + summary marqo__string_array_string_array_1 type array {source: marqo__string_array_string_array_1} + + } +} diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_tensor_field.sd b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_tensor_field.sd index 873e15d2e..ec9685946 100644 --- a/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_tensor_field.sd +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_schemas/semi_structured_vespa_index_schema_one_tensor_field.sd @@ -6,6 +6,21 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } + field marqo__create_timestamp type double { + indexing: attribute | summary + attribute: fast-search + rank: filter + } + + field marqo__field_types type map { + struct-field key { indexing : attribute + attribute: fast-search + rank: filter } + struct-field value { indexing : attribute + attribute: fast-search + rank: filter } + } + field marqo__int_fields type map { indexing: summary struct-field key { indexing : attribute @@ -46,12 +61,6 @@ schema marqo__test_00semi_00structured_00schema { rank: filter } } - field marqo__string_array type array { - indexing: attribute | summary - attribute: fast-search - rank: filter - } - field marqo__score_modifiers type tensor(p{}) { indexing: attribute | summary } @@ -131,7 +140,7 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-non-vector-summary { summary marqo__id type string {} - summary marqo__string_array type array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} @@ -140,7 +149,7 @@ schema marqo__test_00semi_00structured_00schema { document-summary all-vector-summary { summary marqo__id type string {} - summary marqo__string_array type 
array {} + summary marqo__field_types type map {} summary marqo__bool_fields type map {} summary marqo__int_fields type map {} summary marqo__float_fields type map {} diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py b/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py index 0198e3770..90174cfe0 100644 --- a/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_index.py @@ -1,12 +1,13 @@ import re import time import unittest -from typing import List, Set +from typing import List, Set, Optional from marqo import version from marqo.core.models import MarqoTensorQuery, MarqoLexicalQuery from marqo.core.models.marqo_index import SemiStructuredMarqoIndex, Model, TextPreProcessing, TextSplitMethod, \ - ImagePreProcessing, HnswConfig, VectorNumericType, DistanceMetric, Field, FieldType, FieldFeature, TensorField + ImagePreProcessing, HnswConfig, VectorNumericType, DistanceMetric, Field, FieldType, FieldFeature, TensorField, \ + StringArrayField from marqo.core.semi_structured_vespa_index.common import STRING_ARRAY, BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS, \ VESPA_FIELD_ID from marqo.core.semi_structured_vespa_index.semi_structured_vespa_index import SemiStructuredVespaIndex @@ -20,6 +21,59 @@ def test_to_vespa_query_should_include_static_fields_when_attributes_to_retrieve tensor_field_names=['title']) vespa_index = SemiStructuredVespaIndex(marqo_index) + for marqo_query in [ + MarqoTensorQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=['title'], vector_query=[1.0] * 10), + MarqoLexicalQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=['title'], and_phrases=['hello'], or_phrases=['world']), + # MarqoHybridSearch yql is just a placeholder and is generated in customer search component. 
+ ]: + with self.subTest(test_query=marqo_query): + query = vespa_index.to_vespa_query(marqo_query) + fields = self._extract_fields_from_yql(query['yql']) + + self.assertSetEqual({VESPA_FIELD_ID, 'title', 'marqo__chunks_title', + BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS}, fields) + + def test_to_vespa_query_should_include_string_array_fields_when_string_array_attributes_to_retrieve_is_not_empty(self): + """Tests that string array fields are correctly included in the Vespa query when requested. + + This test verifies that when string array fields are included in the attributes_to_retrieve + parameter of a query, both the original field names and their corresponding Vespa internal + field names (with the marqo__string_array_ prefix) are included in the generated YQL query. + + The test checks this behavior for both tensor queries and lexical queries to ensure + consistent handling across query types. + """ + marqo_index = self._semi_structured_marqo_index(name='index1', lexical_field_names=['title'], + tensor_field_names=['title'], string_array_fields = ['string_array_field1', 'string_array_field2']) + vespa_index = SemiStructuredVespaIndex(marqo_index) + + for marqo_query in [ + MarqoTensorQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=['title', 'string_array_field1', 'string_array_field2'], vector_query=[1.0] * 10), + MarqoLexicalQuery(index_name=marqo_index.name, limit=10, offset=0, + attributes_to_retrieve=['title', 'string_array_field1', 'string_array_field2'], and_phrases=['hello'], or_phrases=['world']), + # MarqoHybridSearch yql is just a placeholder and is generated in customer search component. 
+ ]: + with self.subTest(test_query=marqo_query): + query = vespa_index.to_vespa_query(marqo_query) + fields = self._extract_fields_from_yql(query['yql']) + + self.assertSetEqual({VESPA_FIELD_ID, 'title', 'marqo__chunks_title', + BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS, + 'string_array_field1', 'string_array_field2', 'marqo__string_array_string_array_field1', 'marqo__string_array_string_array_field2'}, fields) + + def test_to_vespa_query_should_include_static_fields_when_attributes_to_retrieve_is_not_empty_pre_2_16_indexes(self): + """ + Test that a vespa query is correctly generated for a schema with marqo version < 2.16.0 + Returns: + + """ + marqo_index = self._semi_structured_marqo_index(name='index1', lexical_field_names=['title'], + tensor_field_names=['title'], marqo_version = '2.15.0') + vespa_index = SemiStructuredVespaIndex(marqo_index) + for marqo_query in [ MarqoTensorQuery(index_name=marqo_index.name, limit=10, offset=0, attributes_to_retrieve=['title'], vector_query=[1.0] * 10), @@ -34,6 +88,7 @@ def test_to_vespa_query_should_include_static_fields_when_attributes_to_retrieve self.assertSetEqual({VESPA_FIELD_ID, 'title', 'marqo__chunks_title', STRING_ARRAY, BOOL_FIELDS, INT_FIELDS, FLOAT_FIELDS}, fields) + def test_to_vespa_query_should_not_include_static_fields_when_attributes_to_retrieve_is_empty(self): marqo_index = self._semi_structured_marqo_index(name='index1', lexical_field_names=['title'], tensor_field_names=['title']) @@ -68,7 +123,9 @@ def _extract_fields_from_yql(self, yql: str) -> Set[str]: def _semi_structured_marqo_index(self, name='index_name', lexical_field_names: List[str] = [], - tensor_field_names: List[str] = []): + tensor_field_names: List[str] = [], + marqo_version: Optional[str] = None, + string_array_fields: List[str] = []) -> SemiStructuredMarqoIndex: return SemiStructuredMarqoIndex( name=name, schema_name=name, @@ -88,7 +145,7 @@ def _semi_structured_marqo_index(self, name='index_name', ef_construction=128, m=16 ), - 
marqo_version=version.get_version(), + marqo_version=version.get_version() if marqo_version is None else marqo_version, created_at=time.time(), updated_at=time.time(), treat_urls_and_pointers_as_images=True, @@ -107,5 +164,14 @@ def _semi_structured_marqo_index(self, name='index_name', embeddings_field_name=f'{SemiStructuredVespaSchema.FIELD_EMBEDDING_PREFIX}{field_name}', ) for field_name in tensor_field_names + ], + string_array_fields=[ + StringArrayField( + name=field_name, + type=FieldType.ArrayText, + string_array_field_name=f'{SemiStructuredVespaSchema.FIELD_STRING_ARRAY_PREFIX}{field_name}', + features=[] + ) + for field_name in string_array_fields ] ) \ No newline at end of file diff --git a/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_schema.py b/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_schema.py index 256d45695..6da722a73 100644 --- a/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_schema.py +++ b/tests/integ_tests/core/semi_structured_vespa_index/test_semi_structured_vespa_schema.py @@ -21,6 +21,69 @@ def _remove_empty_lines_in_schema(self, schema: str) -> str: def test_semi_structured_index_schema_random_model(self): + test_cases = [ + # test_case_name, lexical_fields, tensor_fields, expected schema file, string_array_fields + ('no_field', [], [], 'semi_structured_vespa_index_schema_no_field.sd', []), + ('one_lexical_field', ['text_field'], [], 'semi_structured_vespa_index_schema_one_lexical_field.sd', []), + ('one_tensor_field', [], ['tensor_field'], 'semi_structured_vespa_index_schema_one_tensor_field.sd', []), + ('one_string_array_field', [], [], 'semi_structured_vespa_index_schema_one_string_array_field.sd', ['string_array_1']), + ('one_lexical_one_tensor_field', ['text_field'], ['tensor_field'], 'semi_structured_vespa_index_schema_one_lexical_one_tensor_field.sd', []), + ('one_lexical_one_tensor_field_one_string_array_field', ['text_field'], 
['tensor_field'], 'semi_structured_vespa_index_schema_one_lexical_one_tensor_one_string_array_field.sd', ['string_array_1']), + ('multiple_lexical_tensor_fields', ['text_field1', 'text_field2'], ['tensor_field1', 'tensor_field2'], 'semi_structured_vespa_index_schema_multiple_lexical_tensor_and_string_array_fields.sd', ['string_array_1', 'string_array_2']), + ] + + for test_case in test_cases: + with (self.subTest(msg=test_case[0])): + lexical_fields = test_case[1] + tensor_fields = test_case[2] + string_array_fields = test_case[4] + expected_schema = self._read_schema_from_file(f'test_schemas/{test_case[3]}') + + test_marqo_index_request = self.unstructured_marqo_index_request( + name="test_semi_structured_schema", + hnsw_config=HnswConfig(ef_construction=512, m=16), + distance_metric=DistanceMetric.PrenormalizedAngular + ) + + _, index = SemiStructuredVespaSchema(test_marqo_index_request).generate_schema() + marqo_index = cast(SemiStructuredMarqoIndex, index) + + for lexical_field in lexical_fields: + marqo_index.lexical_fields.append( + Field(name=lexical_field, type=FieldType.Text, + features=[FieldFeature.LexicalSearch], + lexical_field_name=f'{SemiStructuredVespaSchema.FIELD_INDEX_PREFIX}{lexical_field}')) + for tensor_field in tensor_fields: + marqo_index.tensor_fields.append(TensorField( + name=tensor_field, + chunk_field_name=f'{SemiStructuredVespaSchema.FIELD_CHUNKS_PREFIX}{tensor_field}', + embeddings_field_name=f'{SemiStructuredVespaSchema.FIELD_EMBEDDING_PREFIX}{tensor_field}', + )) + for string_array_field in string_array_fields: + marqo_index.string_array_fields.append(StringArrayField( + name=string_array_field, + type=FieldType.ArrayText, + string_array_field_name=f'{SemiStructuredVespaSchema.FIELD_STRING_ARRAY_PREFIX}{string_array_field}', + features=[] + )) + marqo_index.clear_cache() + generated_schema = SemiStructuredVespaSchema.generate_vespa_schema(marqo_index) + + self.maxDiff = None + self.assertEqual( + 
self._remove_empty_lines_in_schema(expected_schema), + self._remove_empty_lines_in_schema(generated_schema) + ) + + def test_semi_structured_index_schema_with_pre_2_16(self): + """ + Test that the schema is generated correctly when the marqo version is older than 2.16.0. + 2.16.0 is the version where partial update support was added to the semi-structured index, to do this we + had to change what the schema looks like. This is why we have a different test for this case. + Returns: + + """ + test_cases = [ # test_case_name, lexical_fields, tensor_fields, expected schema file ('no_field', [], [], 'semi_structured_vespa_index_schema_no_field.sd'), @@ -31,19 +94,24 @@ def test_semi_structured_index_schema_random_model(self): ] for test_case in test_cases: - with (self.subTest(msg=test_case[0])): + with (self.subTest(msg=f"mocked_version_{test_case[0]}")): lexical_fields = test_case[1] tensor_fields = test_case[2] - expected_schema = self._read_schema_from_file(f'test_schemas/{test_case[3]}') + expected_schema = self._read_schema_from_file(f'test_schemas/pre_2_16/{test_case[3]}') test_marqo_index_request = self.unstructured_marqo_index_request( name="test_semi_structured_schema", hnsw_config=HnswConfig(ef_construction=512, m=16), - distance_metric=DistanceMetric.PrenormalizedAngular + distance_metric=DistanceMetric.PrenormalizedAngular, + marqo_version = "2.15.0" ) + self.assertEqual("2.15.0", test_marqo_index_request.marqo_version) + _, index = SemiStructuredVespaSchema(test_marqo_index_request).generate_schema() marqo_index = cast(SemiStructuredMarqoIndex, index) + + # Set the marqo_version explicitly to ensure it uses our mocked version for lexical_field in lexical_fields: marqo_index.lexical_fields.append( @@ -64,4 +132,6 @@ def test_semi_structured_index_schema_random_model(self): self._remove_empty_lines_in_schema(expected_schema), self._remove_empty_lines_in_schema(generated_schema) ) - + + # Verify the version was used in the index + self.assertEqual("2.15.0", 
marqo_index.marqo_version) \ No newline at end of file diff --git a/tests/integ_tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py b/tests/integ_tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py index 025599c52..a97aa4773 100644 --- a/tests/integ_tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py +++ b/tests/integ_tests/tensor_search/integ_tests/test_add_documents_semi_structured_add_fields.py @@ -309,7 +309,7 @@ def test_add_documents_should_allow_the_same_field_to_have_different_types_in_di self.assertEqual({'2', '3'}, {hit['_id'] for hit in res['hits']}) def test_add_documents_should_raise_error_when_field_count_exceeds_limit(self): - field_count_config = SemiStructuredFieldCountConfig(max_lexical_field_count=6, max_tensor_field_count=5) + field_count_config = SemiStructuredFieldCountConfig(max_lexical_field_count=6, max_tensor_field_count=5, max_string_array_field_count=5) self.config.document.add_documents( AddDocsParams( index_name=self.text_index_6, @@ -320,6 +320,11 @@ def test_add_documents_should_raise_error_when_field_count_exceeds_limit(self): "tensor_field3": "content 3", "tensor_field4": "content 4", "tensor_field5": "content 5", + "string_array_field1": ["content 1"], + "string_array_field2": ["content 2"], + "string_array_field3": ["content 3"], + "string_array_field4": ["content 4"], + "string_array_field5": ["content 5"], }], device="cpu", tensor_fields=[ "tensor_field1", @@ -361,3 +366,18 @@ def test_add_documents_should_raise_error_when_field_count_exceeds_limit(self): 'rejected since it exceeds the limit of 6. 
Please set a larger limit in ' 'MARQO_MAX_LEXICAL_FIELD_COUNT_UNSTRUCTURED environment variable.', str(err2.exception)) + with self.assertRaises(TooManyFieldsError) as err3: + self.config.document.add_documents(AddDocsParams( + index_name=self.text_index_6, + docs=[{ + "_id": "4", + "string_array_field6": ["content 1"], + "string_array_field7": ["content 2"], + }], + tensor_fields=[]), + field_count_config=field_count_config + ) + self.assertIn('has 5 string array fields. Your request to add string_array_field7 as a string array field is ' + 'rejected since it exceeds the limit of 5. Please set a larger limit in ' + 'MARQO_MAX_STRING_ARRAY_FIELD_COUNT_UNSTRUCTURED environment variable.', str(err3.exception)) + diff --git a/tests/integ_tests/tensor_search/integ_tests/test_search_semi_structured.py b/tests/integ_tests/tensor_search/integ_tests/test_search_semi_structured.py index 8b522f6ce..a98277964 100644 --- a/tests/integ_tests/tensor_search/integ_tests/test_search_semi_structured.py +++ b/tests/integ_tests/tensor_search/integ_tests/test_search_semi_structured.py @@ -890,6 +890,7 @@ def test_attributes_to_retrieve(self): (["int_field"], {"int_field"}), (["float_field"], {"float_field"}), (["string_array"], {"string_array"}), + (["non_existent_string_array"], set()), # non_existent field is provided (["int_map"], {"int_map.a", "int_map.b"}), (["float_map"], {"float_map.c", "float_map.d"}), (["bool_field"], {"bool_field"}), diff --git a/tests/integ_tests/vespa/test_update_documents_batch.py b/tests/integ_tests/vespa/test_update_documents_batch.py index acfec4164..f8b3b17c3 100644 --- a/tests/integ_tests/vespa/test_update_documents_batch.py +++ b/tests/integ_tests/vespa/test_update_documents_batch.py @@ -69,7 +69,7 @@ def test_feed_batch_documents_do_not_exists(self): ids = [response.id.split("::")[-1] for response in batch_response.responses if response.status == 200] messages = [response.message for response in batch_response.responses] - self.assertEqual([412, 
412], statuses) + self.assertEqual([404, 404], statuses) self.assertEqual(["doc1", "doc2"], path_ids) self.assertEqual([], ids) self.assertIn("not exist", messages[0]) diff --git a/tests/integ_tests/vespa/test_vespa_client.py b/tests/integ_tests/vespa/test_vespa_client.py index 9dc35748a..00e6db6ba 100644 --- a/tests/integ_tests/vespa/test_vespa_client.py +++ b/tests/integ_tests/vespa/test_vespa_client.py @@ -436,7 +436,7 @@ def test_translate_vespa_document_response_status(self): test_cases = [ (200, 200, None), (404, 404, "Document does not exist in the index"), - (412, 404, "Document does not exist in the index"), + (412, 400, "Marqo vector store couldn't update the document"), (429, 429, "Marqo vector store receives too many requests. Please try again later"), (507, 400, "Marqo vector store is out of memory or disk space"), (123, 500, "Marqo vector store returns an unexpected error with this document"),