Skip to content

Commit

Permalink
Further refactor for readability
Browse files Browse the repository at this point in the history
Signed-off-by: John Mazanec <jmazane@amazon.com>
  • Loading branch information
jmazanec15 committed Mar 6, 2025
1 parent f006b33 commit b721e83
Show file tree
Hide file tree
Showing 5 changed files with 264 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
import org.opensearch.common.collect.Tuple;
import org.opensearch.common.io.stream.BytesStreamOutput;
import org.opensearch.common.xcontent.XContentHelper;
import org.opensearch.common.xcontent.support.XContentMapValues;
import org.opensearch.core.common.bytes.BytesReference;
import org.opensearch.core.xcontent.MediaType;
import org.opensearch.core.xcontent.MediaTypeRegistry;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.mapper.SourceFieldMapper;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceMapHelper;

import java.io.IOException;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -83,8 +83,10 @@ public void writeField(FieldInfo fieldInfo, BytesRef bytesRef) throws IOExceptio
true,
MediaTypeRegistry.JSON
);
Map<String, Object> filteredSource = XContentMapValues.filter(null, vectorFieldTypes.toArray(new String[0]))
.apply(mapTuple.v2());
Map<String, Object> filteredSource = DerivedSourceMapHelper.filterFields(
vectorFieldTypes.toArray(new String[0]),
mapTuple.v2()
);
BytesStreamOutput bStream = new BytesStreamOutput();
MediaType actualContentType = mapTuple.v1();
XContentBuilder builder = MediaTypeRegistry.contentBuilder(actualContentType, bStream).map(filteredSource);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,24 @@ public boolean isNestedParent(int offset, int parentDocId, String parentFieldNam
return termMatchesInRange(offset, parentDocId - 1, "_nested_path", parentFieldName).isEmpty() == false;
}

/**
* Given a document id, get the next matching doc for the given parent.
*
* @param offset Place to start searching (inclusive)
* @param endDocId Last eligible doc (inclusive)
* @param parentFieldName Field to check
* @return Doc Id of first matching doc on or after offset that contains the parent field name. {@link DocIdSetIterator#NO_MORE_DOCS} if not found
* @throws IOException if unable to read segment
*/
public int getNextDocIdWithParent(int offset, int endDocId, String parentFieldName) throws IOException {
List<Integer> matches = termMatchesInRange(offset, endDocId, "_nested_path", parentFieldName);
if (matches.isEmpty()) {
return NO_MORE_DOCS;
}

return matches.getFirst();
}

/**
* Check if the field exists for the given document.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.codec.derivedsource;

import lombok.extern.log4j.Log4j2;
import org.opensearch.common.xcontent.support.XContentMapValues;

import java.util.HashMap;
import java.util.Map;

/**
* Utility class for manipulating the source map
*/
@Log4j2
public class DerivedSourceMapHelper {

/**
* Removes all fields in the array from the source
*
* @param fields Fields to remove
* @param source Source map to remove from
* @return Map with filtered fields
*/
public static Map<String, Object> filterFields(String[] fields, Map<String, Object> source) {
return XContentMapValues.filter(null, fields).apply(source);
}

/**
* Check whether field exists in the document
*
* @param source source document
* @param fieldName field to check. Field should be flattened. i.e. my.path.field
* @return whether or not the field exists in the object
*/
public static boolean fieldExists(Map<String, Object> source, String fieldName) {
return XContentMapValues.extractValue(fieldName, source, NullValue.INSTANCE) != null;
}

/**
* Injects vector into source, handling object field path if necessary
*
* @param sourceAsMap source to be injected into
* @param vector vector to inject
* @param fieldName field name injecting at
*/
public static void injectVector(Map<String, Object> sourceAsMap, Object vector, String fieldName) {
// If a field contains ".", we need to ensure that we properly nest it.
String[] fields = ParentChildHelper.splitPath(fieldName);
if (fields.length < 2) {
sourceAsMap.put(fieldName, vector);
}

Map<String, Object> currentMap = sourceAsMap;
for (int i = 0; i < fields.length - 1; i++) {
String field = fields[i];
currentMap = (Map<String, Object>) currentMap.computeIfAbsent(field, k -> new HashMap<>());
}
currentMap.put(fields[fields.length - 1], vector);

}

private static class NullValue {
private static final NullValue INSTANCE = new NullValue();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import lombok.extern.log4j.Log4j2;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.opensearch.common.xcontent.support.XContentMapValues;
import org.opensearch.knn.index.vectorvalues.KNNVectorValues;
import org.opensearch.knn.index.vectorvalues.KNNVectorValuesFactory;

Expand Down Expand Up @@ -76,17 +75,10 @@ private void injectObject(int docId, Map<String, Object> sourceAsMap, KNNVectorV
if (vectorValues.docId() != docId && vectorValues.advance(docId) != docId) {
return;
}

// Translate the flat path to a nested map of maps
String[] fields = ParentChildHelper.splitPath(childFieldInfo.name);
Map<String, Object> currentMap = sourceAsMap;
for (int i = 0; i < fields.length - 1; i++) {
String field = fields[i];
currentMap = (Map<String, Object>) currentMap.computeIfAbsent(field, k -> new HashMap<>());
}
currentMap.put(
fields[fields.length - 1],
formatVector(childFieldInfo, vectorValues::getVector, vectorValues::conditionalCloneVector)
DerivedSourceMapHelper.injectVector(
sourceAsMap,
formatVector(childFieldInfo, vectorValues::getVector, vectorValues::conditionalCloneVector),
childFieldInfo.name
);
}

Expand Down Expand Up @@ -115,6 +107,7 @@ private void injectNested(int parentDocId, Map<String, Object> sourceAsMap, KNNV
reconstructedSource = (List<Map<String, Object>>) originalParentValue;
}

// TODO: Update this to call out that empty objects are removed from arrays.
// In order to inject vectors into source for nested documents, we need to be able to map the existing
// maps to document positions. This lets us know what place to put the vector back into. For example:
// Assume we have the following document from the user and we are deriving the value for nested.vector
Expand Down Expand Up @@ -181,15 +174,15 @@ private void injectNested(int parentDocId, Map<String, Object> sourceAsMap, KNNV
reconstructedSource.add(position, new HashMap<>());
docIdsInNestedList.add(position, docId);
}
reconstructedSource.get(position)
.put(
childFieldName,
formatVector(
childFieldInfo,
nestedPerFieldParentToChildDocIdIterator::getVector,
nestedPerFieldParentToChildDocIdIterator::getVectorClone
)
);
DerivedSourceMapHelper.injectVector(
reconstructedSource.get(position),
formatVector(
childFieldInfo,
nestedPerFieldParentToChildDocIdIterator::getVector,
nestedPerFieldParentToChildDocIdIterator::getVectorClone
),
childFieldName
);
offsetPositionsIndex = position + 1;
}
sourceAsMap.put(parentFieldName, reconstructedSource);
Expand All @@ -212,8 +205,7 @@ private List<Integer> mapObjectsInNestedListToDocIds(List<Map<String, Object>> o
List<Integer> positions = new ArrayList<>();
int currentOffset = offset;
for (Map<String, Object> docWithFields : originals) {
int fieldMapping = mapToDocId(docWithFields, currentOffset, parent);
assert fieldMapping != NO_MORE_DOCS;
int fieldMapping = getDocIdOfMap(docWithFields, currentOffset, parent);
positions.add(fieldMapping);
currentOffset = fieldMapping + 1;
}
Expand All @@ -225,10 +217,11 @@ private List<Integer> mapObjectsInNestedListToDocIds(List<Map<String, Object>> o
*
* @param doc doc to find the docId for
* @param offset offset to start searching from
* @param parent parent docId
* @return doc id the map must map to
* @throws IOException if there is an issue reading from the formats
*/
private int mapToDocId(Map<String, Object> doc, int offset, int parent) throws IOException {
private int getDocIdOfMap(Map<String, Object> doc, int offset, int parent) throws IOException {
// First, we identify a field that the doc in question must have
FieldInfo fieldInfoOfDoc = getAnyMatchingFieldInfoForDoc(doc);
assert fieldInfoOfDoc != null;
Expand All @@ -240,15 +233,12 @@ private int mapToDocId(Map<String, Object> doc, int offset, int parent) throws I

// The field in question may be a nested field. In this case, we need to find the next parent on the same level
// as the child to figure out where to put back the vector.
List<Integer> matches = derivedSourceLuceneHelper.termMatchesInRange(
int position = derivedSourceLuceneHelper.getNextDocIdWithParent(
firstMatchingDocWithField,
parent - 1,
"_nested_path",
ParentChildHelper.getParentField(childFieldInfo.name)
);
assert matches != null;
assert matches.isEmpty() == false;
int position = matches.getFirst();
// We should not advance past the parent
assert position < parent;
return position;
}
Expand All @@ -267,15 +257,10 @@ private FieldInfo getAnyMatchingFieldInfoForDoc(Map<String, Object> doc) {
continue;
}

Object object = XContentMapValues.extractValue(extractedFieldName, doc, NullValue.INSTANCE);
if (object != null) {
if (DerivedSourceMapHelper.fieldExists(doc, extractedFieldName)) {
return fieldInfo;
}
}
return null;
}

private static class NullValue {
private static final NullValue INSTANCE = new NullValue();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.codec.derivedsource;

import lombok.SneakyThrows;
import org.opensearch.common.xcontent.XContentFactory;
import org.opensearch.common.xcontent.XContentHelper;
import org.opensearch.core.common.bytes.BytesReference;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.knn.KNNTestCase;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DerivedSourceMapHelperTests extends KNNTestCase {

@SneakyThrows
public void testFilterFields() {
// Basic filter
String[] filterFields = new String[] { "field1", "field2", "field3" };
XContentBuilder builder = XContentFactory.jsonBuilder()
.startObject()
.field("field1", "value")
.field("field2", "value")
.field("field3", "value")
.endObject();
Map<String, Object> source = DerivedSourceMapHelper.filterFields(filterFields, xContentToMap(builder));
assertEquals(Collections.emptyMap(), source);
source = DerivedSourceMapHelper.filterFields(new String[] { "field1" }, xContentToMap(builder));
assertTrue(DerivedSourceMapHelper.fieldExists(source, "field2"));
assertTrue(DerivedSourceMapHelper.fieldExists(source, "field3"));
assertFalse(DerivedSourceMapHelper.fieldExists(source, "field1"));

// Object case
filterFields = new String[] { "level1.level2.test", "field.nonexist" };
builder = XContentFactory.jsonBuilder()
.startObject()
.startObject("level1")
.startObject("level2")
.field("test", "test")
.endObject()
.endObject()
.endObject();
source = DerivedSourceMapHelper.filterFields(filterFields, xContentToMap(builder));
assertEquals(Map.of("level1", Map.of("level2", Collections.emptyMap())), source);

// Nested case. In this case, if filtering out a value leads it to be an empty map, then it will be removed
// from the array
filterFields = new String[] { "nested.deep.value", "nested.vector" };
builder = XContentFactory.jsonBuilder()
.startObject()
.startArray("nested")
.startObject()
.field("text", "text1")
.startArray("deep")
.startObject()
.field("value", "text2")
.endObject()
.endArray()
.endObject()
.startObject()
.field("vector", "text1")
.endObject()
.startObject()
.field("text", "text1")
.field("vector", "text1")
.endObject()
.endArray()
.endObject();
source = DerivedSourceMapHelper.filterFields(filterFields, xContentToMap(builder));
assertTrue(source.containsKey("nested"));
Object nested = source.get("nested");
assertTrue(nested instanceof List<?>);
List<?> nestedList = (List<?>) nested;
assertEquals(2, nestedList.size());
assertTrue(nestedList.get(0) instanceof Map);
Map<String, Object> nestedMap = (Map<String, Object>) nestedList.get(0);
assertTrue(nestedMap.containsKey("deep"));
Object deep = nestedMap.get("deep");
assertTrue(deep instanceof List<?>);
assertEquals(0, ((List<?>) deep).size());
}

@SneakyThrows
public void testFieldExists() {
// Basic, flat validation
XContentBuilder builder = XContentFactory.jsonBuilder().startObject().field("field", "value").endObject();
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field"));
assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "non-existent-field"));

// Object type
builder = XContentFactory.jsonBuilder().startObject().startObject("field").field("test", "test").endObject().endObject();
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field.test"));
assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "test"));

// Complex nested object
builder = XContentFactory.jsonBuilder()
.startObject()
.startArray("nested1")
.startObject()
.field("field1", "test")
.endObject()
.startObject()
.field("field1", "test")
.field("field2", "test")
.endObject()
.startObject()
.field("field3", "test")
.endObject()
.endArray()
.startArray("nested2")
.startObject()
.field("field1", "test")
.endObject()
.startObject()
.field("field1", "test")
.field("field2", "test")
.endObject()
.startObject()
.field("field3", "test")
.endObject()
.endArray()
.field("notnested", "test")
.endObject();
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "notnested"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1.field1"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1.field2"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested1.field3"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2.field1"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2.field2"));
assertTrue(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "nested2.field3"));
assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field1"));
assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field2"));
assertFalse(DerivedSourceMapHelper.fieldExists(xContentToMap(builder), "field3"));

// Null test
Map<String, Object> map = new HashMap<>();
map.put("test", null);
assertTrue(DerivedSourceMapHelper.fieldExists(map, "test"));
}

private Map<String, Object> xContentToMap(XContentBuilder xContentBuilder) {
return XContentHelper.convertToMap(BytesReference.bytes(xContentBuilder), true, xContentBuilder.contentType()).v2();
}
}

0 comments on commit b721e83

Please sign in to comment.