Skip to content

Commit 0d7b898

Browse files
yizheliu-amazonheemin32
authored andcommitted
Support empty string for fields in text embedding processor (opensearch-project#1041) (opensearch-project#1046)
* Allow empty string for field in field map * Allow empty string when validation * Add to change log * Update CHANGELOG to: Support empty string for fields in text embedding processor --------- Signed-off-by: Yizhe Liu <yizheliu@amazon.com>
1 parent 10f3cb3 commit 0d7b898

File tree

4 files changed

+94
-86
lines changed

4 files changed

+94
-86
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1919
- Explainability in hybrid query ([#970](https://github.com/opensearch-project/neural-search/pull/970))
2020
- Implement pruning for neural sparse ingestion pipeline and two phase search processor ([#988](https://github.com/opensearch-project/neural-search/pull/988))
2121
- Support new knn query parameter expand_nested ([#1013](https://github.com/opensearch-project/neural-search/pull/1013))
22+
- Support empty string for fields in text embedding processor ([#1041](https://github.com/opensearch-project/neural-search/pull/1041))
2223
### Bug Fixes
2324
- Address inconsistent scoring in hybrid query results ([#998](https://github.com/opensearch-project/neural-search/pull/998))
2425
### Infrastructure

src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java

+19-4
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@ Map<String, Object> buildMapWithTargetKeys(IngestDocument ingestDocument) {
307307
buildNestedMap(originalKey, targetKey, sourceAndMetadataMap, treeRes);
308308
mapWithProcessorKeys.put(originalKey, treeRes.get(originalKey));
309309
} else {
310-
mapWithProcessorKeys.put(String.valueOf(targetKey), sourceAndMetadataMap.get(originalKey));
310+
mapWithProcessorKeys.put(String.valueOf(targetKey), normalizeSourceValue(sourceAndMetadataMap.get(originalKey)));
311311
}
312312
}
313313
return mapWithProcessorKeys;
@@ -333,19 +333,34 @@ void buildNestedMap(String parentKey, Object processorKey, Map<String, Object> s
333333
} else if (sourceAndMetadataMap.get(parentKey) instanceof List) {
334334
for (Map.Entry<String, Object> nestedFieldMapEntry : ((Map<String, Object>) processorKey).entrySet()) {
335335
List<Map<String, Object>> list = (List<Map<String, Object>>) sourceAndMetadataMap.get(parentKey);
336-
List<Object> listOfStrings = list.stream().map(x -> x.get(nestedFieldMapEntry.getKey())).collect(Collectors.toList());
336+
List<Object> listOfStrings = list.stream().map(x -> {
337+
Object nestedSourceValue = x.get(nestedFieldMapEntry.getKey());
338+
return normalizeSourceValue(nestedSourceValue);
339+
}).collect(Collectors.toList());
337340
Map<String, Object> map = new LinkedHashMap<>();
338341
map.put(nestedFieldMapEntry.getKey(), listOfStrings);
339342
buildNestedMap(nestedFieldMapEntry.getKey(), nestedFieldMapEntry.getValue(), map, next);
340343
}
341344
}
342345
treeRes.merge(parentKey, next, REMAPPING_FUNCTION);
343346
} else {
347+
Object parentValue = sourceAndMetadataMap.get(parentKey);
344348
String key = String.valueOf(processorKey);
345-
treeRes.put(key, sourceAndMetadataMap.get(parentKey));
349+
treeRes.put(key, normalizeSourceValue(parentValue));
346350
}
347351
}
348352

353+
private boolean isBlankString(Object object) {
354+
return object instanceof String && StringUtils.isBlank((String) object);
355+
}
356+
357+
private Object normalizeSourceValue(Object value) {
358+
if (isBlankString(value)) {
359+
return null;
360+
}
361+
return value;
362+
}
363+
349364
/**
350365
* Process the nested key, such as "a.b.c" to "a", "b.c"
351366
* @param nestedFieldMapEntry
@@ -376,7 +391,7 @@ private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) {
376391
indexName,
377392
clusterService,
378393
environment,
379-
false
394+
true
380395
);
381396
}
382397

src/test/java/org/opensearch/neuralsearch/processor/InferenceProcessorTests.java

-43
Original file line numberDiff line numberDiff line change
@@ -66,28 +66,6 @@ public void test_batchExecute_emptyInput() {
6666
verify(clientAccessor, never()).inferenceSentences(anyString(), anyList(), any());
6767
}
6868

69-
public void test_batchExecuteWithEmpty_allFailedValidation() {
70-
final int docCount = 2;
71-
TestInferenceProcessor processor = new TestInferenceProcessor(createMockVectorResult(), BATCH_SIZE, null);
72-
List<IngestDocumentWrapper> wrapperList = createIngestDocumentWrappers(docCount);
73-
wrapperList.get(0).getIngestDocument().setFieldValue("key1", Arrays.asList("", "value1"));
74-
wrapperList.get(1).getIngestDocument().setFieldValue("key1", Arrays.asList("", "value1"));
75-
Consumer resultHandler = mock(Consumer.class);
76-
processor.batchExecute(wrapperList, resultHandler);
77-
ArgumentCaptor<List<IngestDocumentWrapper>> captor = ArgumentCaptor.forClass(List.class);
78-
verify(resultHandler).accept(captor.capture());
79-
assertEquals(docCount, captor.getValue().size());
80-
for (int i = 0; i < docCount; ++i) {
81-
assertNotNull(captor.getValue().get(i).getException());
82-
assertEquals(
83-
"list type field [key1] has empty string, cannot process it",
84-
captor.getValue().get(i).getException().getMessage()
85-
);
86-
assertEquals(wrapperList.get(i).getIngestDocument(), captor.getValue().get(i).getIngestDocument());
87-
}
88-
verify(clientAccessor, never()).inferenceSentences(anyString(), anyList(), any());
89-
}
90-
9169
public void test_batchExecuteWithNull_allFailedValidation() {
9270
final int docCount = 2;
9371
TestInferenceProcessor processor = new TestInferenceProcessor(createMockVectorResult(), BATCH_SIZE, null);
@@ -107,27 +85,6 @@ public void test_batchExecuteWithNull_allFailedValidation() {
10785
verify(clientAccessor, never()).inferenceSentences(anyString(), anyList(), any());
10886
}
10987

110-
public void test_batchExecute_partialFailedValidation() {
111-
final int docCount = 2;
112-
TestInferenceProcessor processor = new TestInferenceProcessor(createMockVectorResult(), BATCH_SIZE, null);
113-
List<IngestDocumentWrapper> wrapperList = createIngestDocumentWrappers(docCount);
114-
wrapperList.get(0).getIngestDocument().setFieldValue("key1", Arrays.asList("", "value1"));
115-
wrapperList.get(1).getIngestDocument().setFieldValue("key1", Arrays.asList("value3", "value4"));
116-
Consumer resultHandler = mock(Consumer.class);
117-
processor.batchExecute(wrapperList, resultHandler);
118-
ArgumentCaptor<List<IngestDocumentWrapper>> captor = ArgumentCaptor.forClass(List.class);
119-
verify(resultHandler).accept(captor.capture());
120-
assertEquals(docCount, captor.getValue().size());
121-
assertNotNull(captor.getValue().get(0).getException());
122-
assertNull(captor.getValue().get(1).getException());
123-
for (int i = 0; i < docCount; ++i) {
124-
assertEquals(wrapperList.get(i).getIngestDocument(), captor.getValue().get(i).getIngestDocument());
125-
}
126-
ArgumentCaptor<List<String>> inferenceTextCaptor = ArgumentCaptor.forClass(List.class);
127-
verify(clientAccessor).inferenceSentences(anyString(), inferenceTextCaptor.capture(), any());
128-
assertEquals(2, inferenceTextCaptor.getValue().size());
129-
}
130-
13188
public void test_batchExecute_happyCase() {
13289
final int docCount = 2;
13390
List<List<Float>> inferenceResults = createMockVectorWithLength(6);

src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java

+74-39
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.function.Consumer;
2929
import java.util.function.Supplier;
3030

31+
import org.apache.commons.lang3.StringUtils;
3132
import org.apache.commons.lang3.tuple.Pair;
3233
import org.junit.Before;
3334
import org.mockito.ArgumentCaptor;
@@ -240,31 +241,6 @@ public void testExecute_withListTypeInput_successful() {
240241
verify(handler).accept(any(IngestDocument.class), isNull());
241242
}
242243

243-
public void testExecute_SimpleTypeWithEmptyStringValue_throwIllegalArgumentException() {
244-
Map<String, Object> sourceAndMetadata = new HashMap<>();
245-
sourceAndMetadata.put(IndexFieldMapper.NAME, "my_index");
246-
sourceAndMetadata.put("key1", " ");
247-
IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
248-
TextEmbeddingProcessor processor = createInstanceWithLevel1MapConfig();
249-
250-
BiConsumer handler = mock(BiConsumer.class);
251-
processor.execute(ingestDocument, handler);
252-
verify(handler).accept(isNull(), any(IllegalArgumentException.class));
253-
}
254-
255-
public void testExecute_listHasEmptyStringValue_throwIllegalArgumentException() {
256-
List<String> list1 = ImmutableList.of("", "test2", "test3");
257-
Map<String, Object> sourceAndMetadata = new HashMap<>();
258-
sourceAndMetadata.put(IndexFieldMapper.NAME, "my_index");
259-
sourceAndMetadata.put("key1", list1);
260-
IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
261-
TextEmbeddingProcessor processor = createInstanceWithLevel1MapConfig();
262-
263-
BiConsumer handler = mock(BiConsumer.class);
264-
processor.execute(ingestDocument, handler);
265-
verify(handler).accept(isNull(), any(IllegalArgumentException.class));
266-
}
267-
268244
public void testExecute_listHasNonStringValue_throwIllegalArgumentException() {
269245
List<Integer> list2 = ImmutableList.of(1, 2, 3);
270246
Map<String, Object> sourceAndMetadata = new HashMap<>();
@@ -549,20 +525,6 @@ public void testExecute_mapHasNonStringValue_throwIllegalArgumentException() {
549525
verify(handler).accept(isNull(), any(IllegalArgumentException.class));
550526
}
551527

552-
public void testExecute_mapHasEmptyStringValue_throwIllegalArgumentException() {
553-
Map<String, String> map1 = ImmutableMap.of("test1", "test2");
554-
Map<String, String> map2 = ImmutableMap.of("test3", " ");
555-
Map<String, Object> sourceAndMetadata = new HashMap<>();
556-
sourceAndMetadata.put(IndexFieldMapper.NAME, "my_index");
557-
sourceAndMetadata.put("key1", map1);
558-
sourceAndMetadata.put("key2", map2);
559-
IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
560-
TextEmbeddingProcessor processor = createInstanceWithLevel2MapConfig();
561-
BiConsumer handler = mock(BiConsumer.class);
562-
processor.execute(ingestDocument, handler);
563-
verify(handler).accept(isNull(), any(IllegalArgumentException.class));
564-
}
565-
566528
public void testExecute_mapDepthReachLimit_throwIllegalArgumentException() {
567529
Map<String, Object> ret = createMaxDepthLimitExceedMap(() -> 1);
568530
Map<String, Object> sourceAndMetadata = new HashMap<>();
@@ -785,6 +747,79 @@ public void testBuildVectorOutput_withNestedListHasNotForEmbeddingField_Level2_s
785747
assertNotNull(nestedObj.get(1).get("vectorField"));
786748
}
787749

750+
@SuppressWarnings("unchecked")
751+
public void testBuildVectorOutput_withPlainString_EmptyString_skipped() {
752+
Map<String, Object> config = createPlainStringConfiguration();
753+
IngestDocument ingestDocument = createPlainIngestDocument();
754+
Map<String, Object> sourceAndMetadata = ingestDocument.getSourceAndMetadata();
755+
sourceAndMetadata.put("oriKey1", StringUtils.EMPTY);
756+
757+
TextEmbeddingProcessor processor = createInstanceWithNestedMapConfiguration(config);
758+
Map<String, Object> knnMap = processor.buildMapWithTargetKeys(ingestDocument);
759+
List<List<Float>> modelTensorList = createRandomOneDimensionalMockVector(6, 100, 0.0f, 1.0f);
760+
processor.setVectorFieldsToDocument(ingestDocument, knnMap, modelTensorList);
761+
762+
/** IngestDocument
763+
* "oriKey1": "",
764+
* "oriKey2": "oriValue2",
765+
* "oriKey3": "oriValue3",
766+
* "oriKey4": "oriValue4",
767+
* "oriKey5": "oriValue5",
768+
* "oriKey6": [
769+
* "oriValue6",
770+
* "oriValue7"
771+
* ]
772+
*
773+
*/
774+
assertEquals(11, sourceAndMetadata.size());
775+
assertFalse(sourceAndMetadata.containsKey("oriKey1_knn"));
776+
}
777+
778+
@SuppressWarnings("unchecked")
779+
public void testBuildVectorOutput_withNestedField_EmptyString_skipped() {
780+
Map<String, Object> config = createNestedMapConfiguration();
781+
IngestDocument ingestDocument = createNestedMapIngestDocument();
782+
Map<String, Object> favorites = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("favorites");
783+
Map<String, Object> favorite = (Map<String, Object>) favorites.get("favorite");
784+
favorite.put("movie", StringUtils.EMPTY);
785+
786+
TextEmbeddingProcessor processor = createInstanceWithNestedMapConfiguration(config);
787+
Map<String, Object> knnMap = processor.buildMapWithTargetKeys(ingestDocument);
788+
List<List<Float>> modelTensorList = createRandomOneDimensionalMockVector(1, 100, 0.0f, 1.0f);
789+
processor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
790+
791+
/**
792+
* "favorites": {
793+
* "favorite": {
794+
* "movie": "",
795+
* "actor": "Charlie Chaplin",
796+
* "games" : {
797+
* "adventure": {
798+
* "action": "overwatch",
799+
* "rpg": "elden ring"
800+
* }
801+
* }
802+
* }
803+
* }
804+
*/
805+
Map<String, Object> favoritesMap = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("favorites");
806+
assertNotNull(favoritesMap);
807+
Map<String, Object> favoriteMap = (Map<String, Object>) favoritesMap.get("favorite");
808+
assertNotNull(favoriteMap);
809+
810+
Map<String, Object> favoriteGames = (Map<String, Object>) favoriteMap.get("games");
811+
assertNotNull(favoriteGames);
812+
Map<String, Object> adventure = (Map<String, Object>) favoriteGames.get("adventure");
813+
List<Float> adventureKnnVector = (List<Float>) adventure.get("with_action_knn");
814+
assertNotNull(adventureKnnVector);
815+
assertEquals(100, adventureKnnVector.size());
816+
for (float vector : adventureKnnVector) {
817+
assertTrue(vector >= 0.0f && vector <= 1.0f);
818+
}
819+
820+
assertFalse(favoriteMap.containsKey("favorite_movie_knn"));
821+
}
822+
788823
public void test_updateDocument_appendVectorFieldsToDocument_successful() {
789824
Map<String, Object> config = createPlainStringConfiguration();
790825
IngestDocument ingestDocument = createPlainIngestDocument();

0 commit comments

Comments
 (0)