diff --git a/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java b/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java index d2d009f1b28b8..17a1abc64e8cd 100644 --- a/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java +++ b/plugins/mapper-annotated-text/src/test/java/org/opensearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java @@ -41,7 +41,6 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; @@ -56,6 +55,7 @@ import org.apache.lucene.search.uhighlight.Snippet; import org.apache.lucene.search.uhighlight.SplittingBreakIterator; import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; import org.opensearch.common.Strings; import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer; import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText; @@ -136,7 +136,8 @@ private void assertHighlightOneDoc( noMatchSize, expectedPassages.length, name -> "text".equals(name), - Integer.MAX_VALUE + Integer.MAX_VALUE, + null ); highlighter.setFieldMatcher((name) -> "text".equals(name)); final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue); diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml index 462f4f5d25e0b..c38afc96590e9 100644 
--- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml @@ -36,6 +36,17 @@ setup: body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}} - match: { error.root_cause.0.type: "illegal_argument_exception" } +--- +"Unified highlighter on a field WITHOUT OFFSETS using max_analyzer_offset should SUCCEED": + - skip: + version: " - 2.99.99" + reason: the max_analyzer_offset parameter is only supported starting with version 3.0 + - do: + search: + rest_total_hits_as_int: true + index: test1 + body: {"query" : {"match" : {"field1" : "quick"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {"max_analyzer_offset": 10}}}} + - match: {hits.hits.0.highlight.field1.0: "The quick brown fox went to the forest and saw another fox."} --- "Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL": diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java index fb22eb583d9e1..cd4ee121f0f29 100644 --- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java +++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java @@ -43,7 +43,6 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag; import org.apache.lucene.util.BytesRef; import org.opensearch.common.CheckedSupplier; import org.opensearch.common.Nullable; @@ -79,6 +78,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { private final int noMatchSize; private final FieldHighlighter fieldHighlighter; private final int 
maxAnalyzedOffset; + private final Integer fieldMaxAnalyzedOffset; /** * Creates a new instance of {@link CustomUnifiedHighlighter} @@ -99,6 +99,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { * @param fieldMatcher decides which terms should be highlighted * @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED * offset source for it because it'd be super slow + * @param fieldMaxAnalyzedOffset limits the length of input that will be ANALYZED; this allows bigger fields to be partially highlighted */ public CustomUnifiedHighlighter( IndexSearcher searcher, @@ -113,7 +114,8 @@ public CustomUnifiedHighlighter( int noMatchSize, int maxPassages, Predicate fieldMatcher, - int maxAnalyzedOffset + int maxAnalyzedOffset, + Integer fieldMaxAnalyzedOffset ) throws IOException { super(searcher, analyzer); this.offsetSource = offsetSource; @@ -126,6 +128,7 @@ public CustomUnifiedHighlighter( this.setFieldMatcher(fieldMatcher); this.maxAnalyzedOffset = maxAnalyzedOffset; fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages); + this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset; } /** @@ -141,7 +144,21 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier maxAnalyzedOffset)) { + + if (fieldMaxAnalyzedOffset != null && fieldMaxAnalyzedOffset > maxAnalyzedOffset) { + throw new IllegalArgumentException( + "max_analyzer_offset has exceeded [" + + maxAnalyzedOffset + + "] - maximum allowed to be analyzed for highlighting. " + + "This maximum can be set by changing the [" + + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + + "] index level setting. " + + "For large texts, indexing with offsets or term vectors is recommended!" 
+ ); + } + // if fieldMaxAnalyzedOffset is not defined + // and if this happens we should fallback to the previous behavior + if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedOffset == null)) { throw new IllegalArgumentException( "The length of [" + field diff --git a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java index e935d46ea1fb0..db0089fd5f180 100644 --- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java +++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java @@ -34,6 +34,7 @@ import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; +import org.opensearch.Version; import org.opensearch.common.ParseField; import org.opensearch.common.ParsingException; import org.opensearch.common.Strings; @@ -92,6 +93,7 @@ public abstract class AbstractHighlighterBuilder template, QueryBuilder queryBuilder) { @@ -150,6 +154,7 @@ protected AbstractHighlighterBuilder(AbstractHighlighterBuilder template, Que phraseLimit = template.phraseLimit; options = template.options; requireFieldMatch = template.requireFieldMatch; + maxAnalyzerOffset = template.maxAnalyzerOffset; } /** @@ -181,7 +186,13 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException { if (in.readBoolean()) { options(in.readMap()); } + requireFieldMatch(in.readOptionalBoolean()); + + if (in.getVersion().onOrAfter(Version.V_3_0_0)) { + maxAnalyzerOffset(in.readOptionalVInt()); + } + } /** @@ -223,6 +234,9 @@ public final void writeTo(StreamOutput out) throws IOException { out.writeMap(options); } out.writeOptionalBoolean(requireFieldMatch); + if (out.getVersion().onOrAfter(Version.V_3_0_0)) { + 
out.writeOptionalVInt(maxAnalyzerOffset); + } doWriteTo(out); } @@ -542,6 +556,21 @@ public Integer phraseLimit() { return this.phraseLimit; } + /** + * Sets the maximum offset for the highlighter + * @param maxAnalyzerOffset the maximum offset that the highlighter will consider + * @return this for chaining + */ + @SuppressWarnings("unchecked") + public HB maxAnalyzerOffset(Integer maxAnalyzerOffset) { + this.maxAnalyzerOffset = maxAnalyzerOffset; + return (HB) this; + } + + public Integer maxAnalyzerOffset() { + return this.maxAnalyzerOffset; + } + /** * Forces the highlighting to highlight fields based on the source even if fields are stored separately. */ @@ -623,6 +652,10 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException { if (phraseLimit != null) { builder.field(PHRASE_LIMIT_FIELD.getPreferredName(), phraseLimit); } + if (maxAnalyzerOffset != null) { + builder.field(MAX_ANALYZER_OFFSET_FIELD.getPreferredName(), maxAnalyzerOffset); + } + } static > BiFunction setupParser(ObjectParser parser) { @@ -642,6 +675,7 @@ static > BiFunction { try { return p.map(); diff --git a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java index a8a7d290c827e..7e7e240362a1a 100644 --- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -399,6 +399,10 @@ private static void transferOptions( if (highlighterBuilder.highlightQuery != null) { targetOptionsBuilder.highlightQuery(highlighterBuilder.highlightQuery.toQuery(context)); } + if (highlighterBuilder.maxAnalyzerOffset != null) { + targetOptionsBuilder.maxAnalyzerOffset(highlighterBuilder.maxAnalyzerOffset); + } + } static Character[] convertCharArray(char[] array) { diff --git 
a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java index 33a5397ae964b..7464ba094b97e 100644 --- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java +++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/SearchHighlightContext.java @@ -154,6 +154,12 @@ public static class FieldOptions { private int phraseLimit = -1; + private Integer maxAnalyzerOffset = null; + + public Integer maxAnalyzerOffset() { + return maxAnalyzerOffset; + } + public int fragmentCharSize() { return fragmentCharSize; } @@ -333,6 +339,15 @@ Builder phraseLimit(int phraseLimit) { return this; } + Builder maxAnalyzerOffset(Integer maxAnalyzerOffset) { + // throws an exception if the value is not a positive integer + if (maxAnalyzerOffset != null && maxAnalyzerOffset <= 0) { + throw new IllegalArgumentException("the value [" + maxAnalyzerOffset + "] of max_analyzer_offset is invalid"); + } + fieldOptions.maxAnalyzerOffset = maxAnalyzerOffset; + return this; + } + Builder matchedFields(Set matchedFields) { fieldOptions.matchedFields = matchedFields; return this; @@ -405,6 +420,9 @@ Builder merge(FieldOptions globalOptions) { if (fieldOptions.phraseLimit == -1) { fieldOptions.phraseLimit = globalOptions.phraseLimit; } + if (fieldOptions.maxAnalyzerOffset == null) { + fieldOptions.maxAnalyzerOffset = globalOptions.maxAnalyzerOffset; + } return this; } } diff --git a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java index ddf32064d7f59..5efaa7c9f766b 100644 --- a/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java +++ b/server/src/main/java/org/opensearch/search/fetch/subphase/highlight/UnifiedHighlighter.java @@ -32,6 
+32,8 @@ package org.opensearch.search.fetch.subphase.highlight; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AnalyzerWrapper; +import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.highlight.Encoder; import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner; @@ -133,13 +135,33 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments)); } + public AnalyzerWrapper getLimitedOffsetAnalyzer(Analyzer analyzer, int limit) { + return new AnalyzerWrapper(analyzer.getReuseStrategy()) { + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return analyzer; + } + + @Override + protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { + return new TokenStreamComponents(components.getSource(), new LimitTokenOffsetFilter(components.getTokenStream(), limit)); + } + + }; + + } + CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException { Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html") ? 
HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset(); + Integer fieldMaxAnalyzedOffset = fieldContext.field.fieldOptions().maxAnalyzerOffset(); int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments(); Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper()); + if (fieldMaxAnalyzedOffset != null) { + analyzer = getLimitedOffsetAnalyzer(analyzer, fieldMaxAnalyzedOffset); + } PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder); IndexSearcher searcher = fieldContext.context.searcher(); OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType); @@ -174,7 +196,8 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th fieldContext.field.fieldOptions().noMatchSize(), higlighterNumberOfFragments, fieldMatcher(fieldContext), - maxAnalyzedOffset + maxAnalyzedOffset, + fieldMaxAnalyzedOffset ); } diff --git a/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 5383a153034e9..ba4d16c87bed1 100644 --- a/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/server/src/test/java/org/opensearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -43,7 +43,6 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; @@ -63,6 +62,7 @@ import org.apache.lucene.search.uhighlight.Snippet; import org.apache.lucene.search.uhighlight.UnifiedHighlighter; import 
org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; import org.opensearch.common.Strings; import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery; import org.opensearch.test.OpenSearchTestCase; @@ -117,7 +117,8 @@ private void assertHighlightOneDoc( noMatchSize, expectedPassages.length, name -> "text".equals(name), - Integer.MAX_VALUE + Integer.MAX_VALUE, + null ); final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue); assertEquals(snippets.length, expectedPassages.length);