From b721e831dbd1b6e476c1bfe82bbefa657c364cbb Mon Sep 17 00:00:00 2001 From: John Mazanec Date: Thu, 6 Mar 2025 10:21:12 -0800 Subject: [PATCH] Further refactor for readability Signed-off-by: John Mazanec --- .../DerivedSourceStoredFieldsWriter.java | 8 +- .../DerivedSourceLuceneHelper.java | 18 +++ .../derivedsource/DerivedSourceMapHelper.java | 68 ++++++++ .../NestedPerFieldDerivedVectorInjector.java | 55 +++---- .../DerivedSourceMapHelperTests.java | 153 ++++++++++++++++++ 5 files changed, 264 insertions(+), 38 deletions(-) create mode 100644 src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelper.java create mode 100644 src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelperTests.java diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriter.java b/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriter.java index 0c43f6a493..8cdba93fa7 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriter.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriter.java @@ -14,12 +14,12 @@ import org.opensearch.common.collect.Tuple; import org.opensearch.common.io.stream.BytesStreamOutput; import org.opensearch.common.xcontent.XContentHelper; -import org.opensearch.common.xcontent.support.XContentMapValues; import org.opensearch.core.common.bytes.BytesReference; import org.opensearch.core.xcontent.MediaType; import org.opensearch.core.xcontent.MediaTypeRegistry; import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.index.mapper.SourceFieldMapper; +import org.opensearch.knn.index.codec.derivedsource.DerivedSourceMapHelper; import java.io.IOException; import java.nio.ByteBuffer; @@ -83,8 +83,10 @@ public void writeField(FieldInfo fieldInfo, BytesRef bytesRef) throws IOExceptio true, MediaTypeRegistry.JSON ); - Map filteredSource = XContentMapValues.filter(null, vectorFieldTypes.toArray(new String[0])) - .apply(mapTuple.v2()); + Map filteredSource = DerivedSourceMapHelper.filterFields( + vectorFieldTypes.toArray(new String[0]), + mapTuple.v2() + ); BytesStreamOutput bStream = new BytesStreamOutput(); MediaType actualContentType = mapTuple.v1(); XContentBuilder builder = MediaTypeRegistry.contentBuilder(actualContentType, bStream).map(filteredSource); diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java index 692ec9b1cb..65824cdc07 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java @@ -88,6 +88,24 @@ public boolean isNestedParent(int offset, int parentDocId, String parentFieldNam return termMatchesInRange(offset, parentDocId - 1, "_nested_path", parentFieldName).isEmpty() == false; } + /** + * Given a document id, get the next matching doc for the given parent. + * + * @param offset Place to start searching (inclusive) + * @param endDocId Last eligible doc (inclusive) + * @param parentFieldName Field to check + * @return Doc Id of first matching doc on or after offset that contains the parent field name. {@link DocIdSetIterator#NO_MORE_DOCS} if not found + * @throws IOException if unable to read segment + */ + public int getNextDocIdWithParent(int offset, int endDocId, String parentFieldName) throws IOException { + List matches = termMatchesInRange(offset, endDocId, "_nested_path", parentFieldName); + if (matches.isEmpty()) { + return NO_MORE_DOCS; + } + + return matches.getFirst(); + } + /** * Check if the field exists for the given document. * diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelper.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelper.java new file mode 100644 index 0000000000..8c4ec194e5 --- /dev/null +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelper.java @@ -0,0 +1,68 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.derivedsource; + +import lombok.extern.log4j.Log4j2; +import org.opensearch.common.xcontent.support.XContentMapValues; + +import java.util.HashMap; +import java.util.Map; + +/** + * Utility class for manipulating the source map + */ +@Log4j2 +public class DerivedSourceMapHelper { + + /** + * Removes all fields in the array from the source + * + * @param fields Fields to remove + * @param source Source map to remove from + * @return Map with filtered fields + */ + public static Map filterFields(String[] fields, Map source) { + return XContentMapValues.filter(null, fields).apply(source); + } + + /** + * Check whether field exists in the document + * + * @param source source document + * @param fieldName field to check. Field should be flattened. i.e. my.path.field + * @return whether or not the field exists in the object + */ + public static boolean fieldExists(Map source, String fieldName) { + return XContentMapValues.extractValue(fieldName, source, NullValue.INSTANCE) != null; + } + + /** + * Injects vector into source, handling object field path if necessary + * + * @param sourceAsMap source to be injected into + * @param vector vector to inject + * @param fieldName field name injecting at + */ + public static void injectVector(Map sourceAsMap, Object vector, String fieldName) { + // If a field contains ".", we need to ensure that we properly nest it. + String[] fields = ParentChildHelper.splitPath(fieldName); + if (fields.length < 2) { + sourceAsMap.put(fieldName, vector); + } + + Map currentMap = sourceAsMap; + for (int i = 0; i < fields.length - 1; i++) { + String field = fields[i]; + currentMap = (Map) currentMap.computeIfAbsent(field, k -> new HashMap<>()); + } + currentMap.put(fields[fields.length - 1], vector); + + } + + private static class NullValue { + private static final NullValue INSTANCE = new NullValue(); + } +} diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java index bfec33ad74..c882d31268 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java @@ -8,7 +8,6 @@ import lombok.extern.log4j.Log4j2; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.SegmentReadState; -import org.opensearch.common.xcontent.support.XContentMapValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValuesFactory; @@ -76,17 +75,10 @@ private void injectObject(int docId, Map sourceAsMap, KNNVectorV if (vectorValues.docId() != docId && vectorValues.advance(docId) != docId) { return; } - - // Translate the flat path to a nested map of maps - String[] fields = ParentChildHelper.splitPath(childFieldInfo.name); - Map currentMap = sourceAsMap; - for (int i = 0; i < fields.length - 1; i++) { - String field = fields[i]; - currentMap = (Map) currentMap.computeIfAbsent(field, k -> new HashMap<>()); - } - currentMap.put( - fields[fields.length - 1], - formatVector(childFieldInfo, vectorValues::getVector, vectorValues::conditionalCloneVector) + DerivedSourceMapHelper.injectVector( + sourceAsMap, + formatVector(childFieldInfo, vectorValues::getVector, vectorValues::conditionalCloneVector), + childFieldInfo.name ); } @@ -115,6 +107,7 @@ private void injectNested(int parentDocId, Map sourceAsMap, KNNV reconstructedSource = (List>) originalParentValue; } + // TODO: Update this to call out that empty objects are removed from arrays. // In order to inject vectors into source for nested documents, we need to be able to map the existing // maps to document positions. This lets us know what place to put the vector back into. For example: // Assume we have the following document from the user and we are deriving the value for nested.vector @@ -181,15 +174,15 @@ private void injectNested(int parentDocId, Map sourceAsMap, KNNV reconstructedSource.add(position, new HashMap<>()); docIdsInNestedList.add(position, docId); } - reconstructedSource.get(position) - .put( - childFieldName, - formatVector( - childFieldInfo, - nestedPerFieldParentToChildDocIdIterator::getVector, - nestedPerFieldParentToChildDocIdIterator::getVectorClone - ) - ); + DerivedSourceMapHelper.injectVector( + reconstructedSource.get(position), + formatVector( + childFieldInfo, + nestedPerFieldParentToChildDocIdIterator::getVector, + nestedPerFieldParentToChildDocIdIterator::getVectorClone + ), + childFieldName + ); offsetPositionsIndex = position + 1; } sourceAsMap.put(parentFieldName, reconstructedSource); @@ -212,8 +205,7 @@ private List mapObjectsInNestedListToDocIds(List> o List positions = new ArrayList<>(); int currentOffset = offset; for (Map docWithFields : originals) { - int fieldMapping = mapToDocId(docWithFields, currentOffset, parent); - assert fieldMapping != NO_MORE_DOCS; + int fieldMapping = getDocIdOfMap(docWithFields, currentOffset, parent); positions.add(fieldMapping); currentOffset = fieldMapping + 1; } @@ -225,10 +217,11 @@ private List mapObjectsInNestedListToDocIds(List> o * * @param doc doc to find the docId for * @param offset offset to start searching from + * @param parent parent docId * @return doc id the map must map to * @throws IOException if there is an issue reading from the formats */ - private int mapToDocId(Map doc, int offset, int parent) throws IOException { + private int getDocIdOfMap(Map doc, int offset, int parent) throws IOException { // First, we identify a field that the doc in question must have FieldInfo fieldInfoOfDoc = getAnyMatchingFieldInfoForDoc(doc); assert fieldInfoOfDoc != null; @@ -240,15 +233,12 @@ private int mapToDocId(Map doc, int offset, int parent) throws I // The field in question may be a nested field. In this case, we need to find the next parent on the same level // as the child to figure out where to put back the vector. - List matches = derivedSourceLuceneHelper.termMatchesInRange( + int position = derivedSourceLuceneHelper.getNextDocIdWithParent( firstMatchingDocWithField, parent - 1, - "_nested_path", ParentChildHelper.getParentField(childFieldInfo.name) ); - assert matches != null; - assert matches.isEmpty() == false; - int position = matches.getFirst(); + // We should not advance past the parent assert position < parent; return position; } @@ -267,15 +257,10 @@ private FieldInfo getAnyMatchingFieldInfoForDoc(Map doc) { continue; } - Object object = XContentMapValues.extractValue(extractedFieldName, doc, NullValue.INSTANCE); - if (object != null) { + if (DerivedSourceMapHelper.fieldExists(doc, extractedFieldName)) { return fieldInfo; } } return null; } - - private static class NullValue { - private static final NullValue INSTANCE = new NullValue(); - } } diff --git a/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelperTests.java b/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelperTests.java new file mode 100644 index 0000000000..3095a04d35 --- /dev/null +++ b/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceMapHelperTests.java @@ -0,0 +1,153 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.derivedsource; + +import lombok.SneakyThrows; +import org.opensearch.common.xcontent.XContentFactory; +import org.opensearch.common.xcontent.XContentHelper; +import org.opensearch.core.common.bytes.BytesReference; +import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.knn.KNNTestCase; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class DerivedSourceMapHelperTests extends KNNTestCase { + + @SneakyThrows + public void testFilterFields() { + // Basic filter + String[] filterFields = new String[] { "field1", "field2", "field3" }; + XContentBuilder builder = XContentFactory.jsonBuilder() + .startObject() + .field("field1", "value") + .field("field2", "value") + .field("field3", "value") + .endObject(); + Map source = DerivedSourceMapHelper.filterFields(filterFields, xContentToMap(builder)); + assertEquals(Collections.emptyMap(), source); + source = DerivedSourceMapHelper.filterFields(new String[] { "field1" }, xContentToMap(builder)); + assertTrue(DerivedSourceMapHelper.fieldExists(source, "field2")); + assertTrue(DerivedSourceMapHelper.fieldExists(source, "field3")); + assertFalse(DerivedSourceMapHelper.fieldExists(source, "field1")); + + // Object case + filterFields = new String[] { "level1.level2.test", "field.nonexist" }; + builder = XContentFactory.jsonBuilder() + .startObject() + .startObject("level1") + .startObject("level2") + .field("test", "test") + .endObject() + .endObject() + .endObject(); + source = DerivedSourceMapHelper.filterFields(filterFields, xContentToMap(builder)); + assertEquals(Map.of("level1", Map.of("level2", Collections.emptyMap())), source); + + // Nested case. In this case, if filtering out a value leads it to be an empty map, then it will be removed + // from the array + filterFields = new String[] { "nested.deep.value", "nested.vector" }; + builder = XContentFactory.jsonBuilder() + .startObject() + .startArray("nested") + .startObject() + .field("text", "text1") + .startArray("deep") + .startObject() + .field("value", "text2") + .endObject() + .endArray() + .endObject() + .startObject() + .field("vector", "text1") + .endObject() + .startObject() + .field("text", "text1") + .field("vector", "text1") + .endObject() + .endArray() + .endObject(); + source = DerivedSourceMapHelper.filterFields(filterFields, xContentToMap(builder)); + assertTrue(source.containsKey("nested")); + Object nested = source.get("nested"); + assertTrue(nested instanceof List); + List nestedList = (List) nested; + assertEquals(2, nestedList.size()); + assertTrue(nestedList.get(0) instanceof Map); + Map nestedMap = (Map) nestedList.get(0); + assertTrue(nestedMap.containsKey("deep")); + Object deep = nestedMap.get("deep"); + assertTrue(deep instanceof List); + assertEquals(0, ((List) deep).size()); + } + + @SneakyThrows + public void testFieldExists() { + // Basic, flat validation + XContentBuilder builder = XContentFactory.jsonBuilder().startObject().field("field", "value").endObject(); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field")); + assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "non-existent-field")); + + // Object type + builder = XContentFactory.jsonBuilder().startObject().startObject("field").field("test", "test").endObject().endObject(); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field.test")); + assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "test")); + + // Complex nested object + builder = XContentFactory.jsonBuilder() + .startObject() + .startArray("nested1") + .startObject() + .field("field1", "test") + .endObject() + .startObject() + .field("field1", "test") + .field("field2", "test") + .endObject() + .startObject() + .field("field3", "test") + .endObject() + .endArray() + .startArray("nested2") + .startObject() + .field("field1", "test") + .endObject() + .startObject() + .field("field1", "test") + .field("field2", "test") + .endObject() + .startObject() + .field("field3", "test") + .endObject() + .endArray() + .field("notnested", "test") + .endObject(); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "notnested")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1.field1")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1.field2")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1.field3")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2.field1")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2.field2")); + assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2.field3")); + assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field1")); + assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field2")); + assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field3")); + + // Null test + Map map = new HashMap<>(); + map.put("test", null); + assertTrue(DerivedSourceMapHelper.fieldExists(map, "test")); + } + + private Map xContentToMap(XContentBuilder xContentBuilder) { + return XContentHelper.convertToMap(BytesReference.bytes(xContentBuilder), true, xContentBuilder.contentType()).v2(); + } +}