properties) {
+ this.properties = properties;
+ }
+
+ public void setSourceIndexedField(String sourceIndexedField) {
+ this.sourceIndexedField = sourceIndexedField; // this field is included in hashCode() and equals()
+ }
+
+ public void setFormat(String format) {
+ this.format = format; // this field is included in hashCode() and equals()
+ }
+
+ public void setIgnoreMalformed(boolean ignoreMalformed) {
+ this.ignoreMalformed = ignoreMalformed; // this field is included in hashCode() and equals()
+ }
+
@Override
public int hashCode() {
- return Objects.hash(name, type, script);
+ return Objects.hash(name, type, script, sourceIndexedField, properties, ignoreMalformed, format); // same seven fields that equals() compares
}
@Override
@@ -84,7 +152,12 @@ public boolean equals(Object obj) {
return false;
}
DerivedField other = (DerivedField) obj;
- return Objects.equals(name, other.name) && Objects.equals(type, other.type) && Objects.equals(script, other.script);
+ return Objects.equals(name, other.name)
+ && Objects.equals(type, other.type)
+ && Objects.equals(script, other.script)
+ && Objects.equals(sourceIndexedField, other.sourceIndexedField)
+ && Objects.equals(properties, other.properties)
+ && Objects.equals(ignoreMalformed, other.ignoreMalformed)
+ && Objects.equals(format, other.format);
}
-
}
diff --git a/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java
new file mode 100644
index 0000000000000..713bdc4e691cd
--- /dev/null
+++ b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java
@@ -0,0 +1,181 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.mapper;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.ReaderUtil;
+import org.opensearch.common.Randomness;
+import org.opensearch.common.xcontent.XContentFactory;
+import org.opensearch.common.xcontent.json.JsonXContent;
+import org.opensearch.core.common.bytes.BytesReference;
+import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.search.lookup.SourceLookup;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeSet;
+
+/**
+ * This class performs type inference by analyzing the _source documents. It uses a random sample of documents to infer the field type, similar to dynamic mapping type guessing logic.
+ * Unlike guessing based on the first document, where field could be missing, this method generates a random sample to make a more accurate inference.
+ * This approach is especially useful for handling missing fields, which is common in nested fields within derived fields of object types.
+ *
+ * The sample size should be chosen carefully to ensure a high probability of selecting at least one document where the field is present.
+ * However, it's essential to strike a balance because a large sample size can lead to performance issues since each sample document's _source field is loaded and examined until the field is found.
+ *
+ * <p>Determining the sample size ({@code S}) is akin to deciding how many balls to draw from a bin, ensuring a high probability ({@code >=P}) of drawing at least one green ball (documents with the field) from a mixture of {@code R } red balls (documents without the field) and {@code G } green balls:
+ * <pre>{@code
+ * P >= 1 - C(R, S) / C(R + G, S)
+ * }</pre>
+ * Here, {@code C()} represents the binomial coefficient.
+ * For a high confidence level, we aim for {@code P >= 0.95 }. For example, with {@code 10^7 } documents where the field is present in {@code 2% } of them, the sample size {@code S } should be around 149 to achieve a probability of {@code 0.95}.
+ */
+public class FieldTypeInference {
+ private final IndexReader indexReader;
+ private final String indexName;
+ private final MapperService mapperService;
+ // TODO expose using an index setting
+ private int sampleSize;
+ private static final int DEFAULT_SAMPLE_SIZE = 150;
+ private static final int MAX_SAMPLE_SIZE_ALLOWED = 1000;
+
+ public FieldTypeInference(String indexName, MapperService mapperService, IndexReader indexReader) {
+ this.indexName = indexName;
+ this.mapperService = mapperService;
+ this.indexReader = indexReader;
+ this.sampleSize = DEFAULT_SAMPLE_SIZE; // initial value only; setSampleSize can replace it
+ }
+
+ public void setSampleSize(int sampleSize) {
+ if (sampleSize > MAX_SAMPLE_SIZE_ALLOWED) { // the maximum itself is accepted, so the message says "or equal to"
+ throw new IllegalArgumentException("sample_size should be less than or equal to " + MAX_SAMPLE_SIZE_ALLOWED);
+ }
+ this.sampleSize = sampleSize; // only reached for values up to and including MAX_SAMPLE_SIZE_ALLOWED
+ }
+
+ public int getSampleSize() {
+ return sampleSize; // DEFAULT_SAMPLE_SIZE until setSampleSize stores another value
+ }
+
+ public Mapper infer(ValueFetcher valueFetcher) throws IOException {
+ RandomSourceValuesGenerator valuesGenerator = new RandomSourceValuesGenerator(sampleSize, indexReader, valueFetcher);
+ Mapper inferredMapper = null;
+ while (inferredMapper == null && valuesGenerator.hasNext()) {
+ List