diff --git a/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java b/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java
index d2d009f1b28b8..17a1abc64e8cd 100644
--- a/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java
+++ b/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java
@@ -41,7 +41,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
@@ -56,6 +55,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
@@ -136,7 +136,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
- Integer.MAX_VALUE
+ Integer.MAX_VALUE,
+ null
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml
index 462f4f5d25e0b..c38afc96590e9 100644
--- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml
+++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml
@@ -36,6 +36,17 @@ setup:
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }
+---
+"Unified highlighter on a field WITHOUT OFFSETS using max_analyzer_offset should SUCCEED":
+ - skip:
+ version: " - 2.99.99"
+ reason: only starting supporting the parameter max_analyzer_offset on version 3.0
+ - do:
+ search:
+ rest_total_hits_as_int: true
+ index: test1
+ body: {"query" : {"match" : {"field1" : "quick"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {"max_analyzer_offset": 10}}}}
+ - match: {hits.hits.0.highlight.field1.0: "The quick brown fox went to the forest and saw another fox."}
---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
index fb22eb583d9e1..cd4ee121f0f29 100644
--- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
+++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
@@ -43,7 +43,6 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.Nullable;
@@ -79,6 +78,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final int noMatchSize;
private final FieldHighlighter fieldHighlighter;
private final int maxAnalyzedOffset;
+ private final Integer fieldMaxAnalyzedOffset;
/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -99,6 +99,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
* @param fieldMatcher decides which terms should be highlighted
* @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED
* offset source for it because it'd be super slow
+ * @param fieldMaxAnalyzedOffset limits the length of input that will be ANALYZED; this allows bigger fields to be partially highlighted
*/
public CustomUnifiedHighlighter(
IndexSearcher searcher,
@@ -113,7 +114,8 @@ public CustomUnifiedHighlighter(
int noMatchSize,
int maxPassages,
Predicate fieldMatcher,
- int maxAnalyzedOffset
+ int maxAnalyzedOffset,
+ Integer fieldMaxAnalyzedOffset
) throws IOException {
super(searcher, analyzer);
this.offsetSource = offsetSource;
@@ -126,6 +128,7 @@ public CustomUnifiedHighlighter(
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
+ this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
}
/**
@@ -141,7 +144,21 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier maxAnalyzedOffset)) {
+
+ if (fieldMaxAnalyzedOffset != null && fieldMaxAnalyzedOffset > maxAnalyzedOffset) {
+ throw new IllegalArgumentException(
+ "max_analyzer_offset has exceeded ["
+ + maxAnalyzedOffset
+ + "] - maximum allowed to be analyzed for highlighting. "
+ + "This maximum can be set by changing the ["
+ + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey()
+ + "] index level setting. "
+ + "For large texts, indexing with offsets or term vectors is recommended!"
+ );
+ }
+ // if fieldMaxAnalyzedOffset is not defined we fall back to the previous behavior:
+ // an ANALYSIS-sourced field value longer than maxAnalyzedOffset is rejected
+ if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedOffset == null)) {
throw new IllegalArgumentException(
"The length of ["
+ field
diff --git a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java
index e935d46ea1fb0..db0089fd5f180 100644
--- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java
+++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java
@@ -34,6 +34,7 @@
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
+import org.opensearch.Version;
import org.opensearch.common.ParseField;
import org.opensearch.common.ParsingException;
import org.opensearch.common.Strings;
@@ -92,6 +93,7 @@ public abstract class AbstractHighlighterBuilder template, QueryBuilder queryBuilder) {
@@ -150,6 +154,7 @@ protected AbstractHighlighterBuilder(AbstractHighlighterBuilder> template, Que
phraseLimit = template.phraseLimit;
options = template.options;
requireFieldMatch = template.requireFieldMatch;
+ maxAnalyzerOffset = template.maxAnalyzerOffset;
}
/**
@@ -181,7 +186,13 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException {
if (in.readBoolean()) {
options(in.readMap());
}
+
requireFieldMatch(in.readOptionalBoolean());
+
+ if (in.getVersion().onOrAfter(Version.V_3_0_0)) {
+ maxAnalyzerOffset(in.readOptionalVInt());
+ }
+
}
/**
@@ -223,6 +234,9 @@ public final void writeTo(StreamOutput out) throws IOException {
out.writeMap(options);
}
out.writeOptionalBoolean(requireFieldMatch);
+ if (out.getVersion().onOrAfter(Version.V_3_0_0)) {
+ out.writeOptionalVInt(maxAnalyzerOffset);
+ }
doWriteTo(out);
}
@@ -542,6 +556,21 @@ public Integer phraseLimit() {
return this.phraseLimit;
}
+ /**
+ * Sets the limit on how much of a field's text is analyzed for highlighting ({@code null} leaves it unset)
+ * @param maxAnalyzerOffset the limit to apply, or {@code null} for none
+ * @return this for chaining
+ */
+ @SuppressWarnings("unchecked")
+ public HB maxAnalyzerOffset(Integer maxAnalyzerOffset) {
+ this.maxAnalyzerOffset = maxAnalyzerOffset;
+ return (HB) this;
+ }
+
+ public Integer maxAnalyzerOffset() {
+ return this.maxAnalyzerOffset;
+ }
+
/**
* Forces the highlighting to highlight fields based on the source even if fields are stored separately.
*/
@@ -623,6 +652,10 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException {
if (phraseLimit != null) {
builder.field(PHRASE_LIMIT_FIELD.getPreferredName(), phraseLimit);
}
+ if (maxAnalyzerOffset != null) {
+ builder.field(MAX_ANALYZER_OFFSET_FIELD.getPreferredName(), maxAnalyzerOffset);
+ }
+
}
static > BiFunction setupParser(ObjectParser parser) {
@@ -642,6 +675,7 @@ static > BiFunction {
try {
return p.map();
diff --git a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java
index a8a7d290c827e..7e7e240362a1a 100644
--- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java
+++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java
@@ -399,6 +399,10 @@ private static void transferOptions(
if (highlighterBuilder.highlightQuery != null) {
targetOptionsBuilder.highlightQuery(highlighterBuilder.highlightQuery.toQuery(context));
}
+ if (highlighterBuilder.maxAnalyzerOffset != null) {
+ targetOptionsBuilder.maxAnalyzerOffset(highlighterBuilder.maxAnalyzerOffset);
+ }
+
}
static Character[] convertCharArray(char[] array) {
diff --git a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java
index 33a5397ae964b..7464ba094b97e 100644
--- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java
+++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java
@@ -154,6 +154,12 @@ public static class FieldOptions {
private int phraseLimit = -1;
+ private Integer maxAnalyzerOffset = null;
+
+ public Integer maxAnalyzerOffset() {
+ return maxAnalyzerOffset;
+ }
+
public int fragmentCharSize() {
return fragmentCharSize;
}
@@ -333,6 +339,15 @@ Builder phraseLimit(int phraseLimit) {
return this;
}
+ Builder maxAnalyzerOffset(Integer maxAnalyzerOffset) {
+ // null means "not set" and is accepted; any other value must be a positive integer or an exception is thrown
+ if (maxAnalyzerOffset != null && maxAnalyzerOffset <= 0) {
+ throw new IllegalArgumentException("the value [" + maxAnalyzerOffset + "] of max_analyzer_offset is invalid");
+ }
+ fieldOptions.maxAnalyzerOffset = maxAnalyzerOffset;
+ return this;
+ }
+
Builder matchedFields(Set matchedFields) {
fieldOptions.matchedFields = matchedFields;
return this;
@@ -405,6 +420,9 @@ Builder merge(FieldOptions globalOptions) {
if (fieldOptions.phraseLimit == -1) {
fieldOptions.phraseLimit = globalOptions.phraseLimit;
}
+ if (fieldOptions.maxAnalyzerOffset == null) {
+ fieldOptions.maxAnalyzerOffset = globalOptions.maxAnalyzerOffset;
+ }
return this;
}
}
diff --git a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
index ddf32064d7f59..5efaa7c9f766b 100644
--- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
+++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
@@ -32,6 +32,8 @@
package org.opensearch.search.fetch.subphase.highlight;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
@@ -133,13 +135,33 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
}
+ public AnalyzerWrapper getLimitedOffsetAnalyzer(Analyzer analyzer, int limit) {
+ return new AnalyzerWrapper(analyzer.getReuseStrategy()) {
+ @Override
+ protected Analyzer getWrappedAnalyzer(String fieldName) {
+ return analyzer;
+ }
+
+ @Override
+ protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+ return new TokenStreamComponents(components.getSource(), new LimitTokenOffsetFilter(components.getTokenStream(), limit));
+ }
+
+ };
+
+ }
+
CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html")
? HighlightUtils.Encoders.HTML
: HighlightUtils.Encoders.DEFAULT;
int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();
+ Integer fieldMaxAnalyzedOffset = fieldContext.field.fieldOptions().maxAnalyzerOffset();
int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments();
Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
+ if (fieldMaxAnalyzedOffset != null) {
+ analyzer = getLimitedOffsetAnalyzer(analyzer, fieldMaxAnalyzedOffset);
+ }
PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
IndexSearcher searcher = fieldContext.context.searcher();
OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);
@@ -174,7 +196,8 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
fieldContext.field.fieldOptions().noMatchSize(),
higlighterNumberOfFragments,
fieldMatcher(fieldContext),
- maxAnalyzedOffset
+ maxAnalyzedOffset,
+ fieldMaxAnalyzedOffset
);
}
diff --git a/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
index 5383a153034e9..ba4d16c87bed1 100644
--- a/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
+++ b/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
@@ -43,7 +43,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
@@ -63,6 +62,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.opensearch.test.OpenSearchTestCase;
@@ -117,7 +117,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
- Integer.MAX_VALUE
+ Integer.MAX_VALUE,
+ null
);
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(snippets.length, expectedPassages.length);