Skip to content

Commit

Permalink
semantic_text add index_options
Browse files Browse the repository at this point in the history
  • Loading branch information
weizijun committed Feb 20, 2025
1 parent 5023cdd commit d421439
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,7 @@ public abstract static class IndexOptions implements ToXContent {

abstract KnnVectorsFormat getVectorsFormat(ElementType elementType);

final void validateElementType(ElementType elementType) {
public final void validateElementType(ElementType elementType) {
if (type.supportsElementType(elementType) == false) {
throw new IllegalArgumentException(
"[element_type] cannot be [" + elementType.toString() + "] when using index type [" + type + "]"
Expand Down Expand Up @@ -2324,6 +2324,10 @@ public FieldMapper.Builder getMergeBuilder() {
return new Builder(leafName(), indexCreatedVersion).init(this);
}

/**
 * Returns the value of this mapper's {@code indexOptions} field.
 *
 * @return the {@link IndexOptions} currently held by this mapper
 */
public IndexOptions indexOptions() {
    return this.indexOptions;
}

public static IndexOptions parseIndexOptions(String fieldName, Object propNode) {
@SuppressWarnings("unchecked")
Map<String, ?> indexOptionsMap = (Map<String, ?>) propNode;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -997,7 +997,11 @@ private static Mapper.Builder createEmbeddingsField(
}
denseVectorMapperBuilder.dimensions(modelSettings.dimensions());
denseVectorMapperBuilder.elementType(modelSettings.elementType());
denseVectorMapperBuilder.indexOptions(indexOptions);
if (indexOptions != null) {
indexOptions.validateDimension(modelSettings.dimensions());
indexOptions.validateElementType(modelSettings.elementType());
denseVectorMapperBuilder.indexOptions(indexOptions);
}

yield denseVectorMapperBuilder;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import org.apache.lucene.search.join.BitSetProducer;
import org.apache.lucene.search.join.QueryBitSetProducer;
import org.apache.lucene.search.join.ScoreMode;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.CheckedBiConsumer;
import org.elasticsearch.common.CheckedBiFunction;
Expand Down Expand Up @@ -73,6 +72,7 @@

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
Expand Down Expand Up @@ -879,17 +879,29 @@ private MapperService mapperServiceForFieldWithModelSettings(
String searchInferenceId,
MinimalServiceSettings modelSettings
) throws IOException {
String mappingParams = "type=semantic_text,inference_id=" + inferenceId;
return mapperServiceForFieldWithModelSettingsAndIndexOptions(fieldName, inferenceId, searchInferenceId, modelSettings, null);
}

private MapperService mapperServiceForFieldWithModelSettingsAndIndexOptions(
String fieldName,
String inferenceId,
String searchInferenceId,
MinimalServiceSettings modelSettings,
DenseVectorFieldMapper.IndexOptions indexOptions
) throws IOException {
XContentBuilder mappingBuilder = JsonXContent.contentBuilder().startObject();
mappingBuilder.startObject("properties").startObject(fieldName).field("type", "semantic_text").field("inference_id", inferenceId);
if (searchInferenceId != null) {
mappingParams += ",search_inference_id=" + searchInferenceId;
mappingBuilder.field("search_inference_id", searchInferenceId);
}
if (indexOptions != null) {
mappingBuilder.field("index_options", indexOptions);
}

mappingBuilder.endObject().endObject().endObject();

MapperService mapperService = createMapperService(mapping(b -> {}), useLegacyFormat);
mapperService.merge(
"_doc",
new CompressedXContent(Strings.toString(PutMappingRequest.simpleMapping(fieldName, mappingParams))),
MapperService.MergeReason.MAPPING_UPDATE
);
mapperService.merge("_doc", new CompressedXContent(Strings.toString(mappingBuilder)), MapperService.MergeReason.MAPPING_UPDATE);

SemanticTextField semanticTextField = new SemanticTextField(
useLegacyFormat,
Expand Down Expand Up @@ -951,6 +963,105 @@ public void testExistsQueryDenseVector() throws IOException {
assertThat(existsQuery, instanceOf(ESToParentBlockJoinQuery.class));
}

/**
 * Verifies that each dense-vector {@code index_options} variant configured on a
 * {@code semantic_text} field is the one reported by the field's embeddings
 * {@link DenseVectorFieldMapper}.
 */
public void testDenseVectorIndexOptions() throws IOException {
    final String fieldName = "field";
    final String inferenceId = "test_service";

    // The check is identical for every variant, so it is declared once, ahead of the loop.
    final BiConsumer<MapperService, DenseVectorFieldMapper.IndexOptions> verifyEmbeddingsOptions = (service, expected) -> {
        Mapper resolved = service.mappingLookup().getMapper(fieldName);
        assertThat(resolved, instanceOf(SemanticTextFieldMapper.class));

        FieldMapper embeddings = ((SemanticTextFieldMapper) resolved).fieldType().getEmbeddingsField();
        assertThat(embeddings, instanceOf(DenseVectorFieldMapper.class));

        assertThat(((DenseVectorFieldMapper) embeddings).indexOptions(), equalTo(expected));
    };

    // Every index type by itself, plus one hnsw variant with explicit tuning parameters.
    final List<DenseVectorFieldMapper.IndexOptions> candidates = List.of(
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "hnsw"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "int8_hnsw"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "int4_hnsw"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "bbq_hnsw"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "flat"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "int8_flat"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "int4_flat"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "bbq_flat"))),
        DenseVectorFieldMapper.parseIndexOptions(fieldName, new HashMap<>(Map.of("type", "hnsw", "m", 32, "ef_construction", 200)))
    );

    for (DenseVectorFieldMapper.IndexOptions candidate : candidates) {
        // Float embeddings with 1024 dimensions are used for every variant.
        MinimalServiceSettings floatSettings = new MinimalServiceSettings(
            TaskType.TEXT_EMBEDDING,
            1024,
            SimilarityMeasure.COSINE,
            DenseVectorFieldMapper.ElementType.FLOAT
        );
        MapperService service = mapperServiceForFieldWithModelSettingsAndIndexOptions(
            fieldName,
            inferenceId,
            inferenceId,
            floatSettings,
            candidate
        );
        verifyEmbeddingsOptions.accept(service, candidate);
    }
}

/**
 * Verifies that {@code index_options} which conflict with the model settings are rejected
 * with a {@link DocumentParsingException} whose cause explains the conflict. Two conflicts
 * are covered: an element type mismatch and a dimension count below the reported minimum.
 */
public void testDenseVectorIndexOptionsVaild() {
    final String fieldName = "field";
    final String inferenceId = "test_service";

    // Case 1: int8_hnsw paired with byte vectors is reported as an element type conflict.
    final DenseVectorFieldMapper.IndexOptions int8HnswOptions = DenseVectorFieldMapper.parseIndexOptions(
        fieldName,
        new HashMap<>(Map.of("type", "int8_hnsw"))
    );
    final MinimalServiceSettings byteL2Settings = new MinimalServiceSettings(
        TaskType.TEXT_EMBEDDING,
        1024,
        SimilarityMeasure.L2_NORM,
        DenseVectorFieldMapper.ElementType.BYTE
    );
    Exception elementTypeFailure = expectThrows(
        DocumentParsingException.class,
        () -> mapperServiceForFieldWithModelSettingsAndIndexOptions(fieldName, inferenceId, inferenceId, byteL2Settings, int8HnswOptions)
    );
    assertThat(
        elementTypeFailure.getCause().getMessage(),
        containsString("cannot be [byte] when using index type [int8_hnsw]")
    );

    // Case 2: bbq_hnsw paired with only 10 dimensions is reported as a dimension conflict.
    final DenseVectorFieldMapper.IndexOptions bbqHnswOptions = DenseVectorFieldMapper.parseIndexOptions(
        fieldName,
        new HashMap<>(Map.of("type", "bbq_hnsw"))
    );
    final MinimalServiceSettings tenDimensionSettings = new MinimalServiceSettings(
        TaskType.TEXT_EMBEDDING,
        10,
        SimilarityMeasure.COSINE,
        DenseVectorFieldMapper.ElementType.BYTE
    );
    Exception dimensionFailure = expectThrows(
        DocumentParsingException.class,
        () -> mapperServiceForFieldWithModelSettingsAndIndexOptions(
            fieldName,
            inferenceId,
            inferenceId,
            tenDimensionSettings,
            bbqHnswOptions
        )
    );
    assertThat(dimensionFailure.getCause().getMessage(), containsString("bbq_hnsw does not support dimensions fewer than 64"));
}

@Override
protected void assertExistsQuery(MappedFieldType fieldType, Query query, LuceneDocument fields) {
// Until a doc is indexed, the query is rewritten as match no docs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,61 @@ setup:
- match: { "test-index.mappings.properties.dense_field.model_settings.task_type": text_embedding }
- length: { "test-index.mappings.properties.dense_field": 3 }

---
"Indexes dense vector document with index_options":

- do:
indices.create:
index: test-index-options
body:
mappings:
properties:
dense_field:
type: semantic_text
inference_id: dense-inference-id
index_options:
type: "hnsw"
m: 24
ef_construction: 200

- do:
index:
index: test-index-options
id: doc_2
body:
dense_field:
text: "these are not the droids you're looking for. He's free to go around"
inference:
inference_id: "dense-inference-id"
model_settings:
task_type: "text_embedding"
dimensions: 4
similarity: "cosine"
element_type: "float"
index_options:
type: "int8_hnsw"
m: 24
ef_construction: 100
confidence_interval: 0.9
chunks:
- text: "these are not the droids you're looking for"
embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
- text: "He's free to go around"
embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896]

# Checks mapping is updated when first doc arrives
- do:
indices.get_mapping:
index: test-index-options

# All assertions below target the index created by this test (test-index-options).
- match: { "test-index-options.mappings.properties.dense_field.type": "semantic_text" }
- match: { "test-index-options.mappings.properties.dense_field.inference_id": "dense-inference-id" }
- match: { "test-index-options.mappings.properties.dense_field.model_settings.task_type": "text_embedding" }
- match: { "test-index-options.mappings.properties.dense_field.index_options.type": "hnsw" }
- match: { "test-index-options.mappings.properties.dense_field.index_options.m": 24 }
- match: { "test-index-options.mappings.properties.dense_field.index_options.ef_construction": 200 }
# Expect four entries on the field: type, inference_id, model_settings and index_options.
- length: { "test-index-options.mappings.properties.dense_field": 4 }

---
"Field caps with text embedding":
- requires:
Expand Down

0 comments on commit d421439

Please sign in to comment.