
Adds a new parameter, max_analyzer_offset, for the highlighter #3893

Merged Jul 22, 2022

Changes from 8 commits (23 commits total):
f0d9805
#3842 adds a new parameter to the highlighter, the max_analyzer_offse…
hauck-jvsh Jul 13, 2022
6be297d
Adds a test for the new parameter
hauck-jvsh Jul 13, 2022
4c6ff70
Fix the test added in the previous commit
hauck-jvsh Jul 13, 2022
595bb04
This was checking against the wrong field
hauck-jvsh Jul 13, 2022
1b63fad
Only runs the test for the correct version
hauck-jvsh Jul 15, 2022
ba381be
Skips the test in Elasticsearch as well;
hauck-jvsh Jul 15, 2022
c63d527
Remove elastic 3.0 to test
hauck-jvsh Jul 15, 2022
ed80f5f
Skips all versions
hauck-jvsh Jul 15, 2022
5a0e1dc
Remove unnecessary fields as pointed by @reta
hauck-jvsh Jul 15, 2022
6b5c529
Compute if fieldMaxAnalyzedIsNotValid in the constructor as suggest b…
hauck-jvsh Jul 15, 2022
f95599a
As discussed, it is better to throws different exceptions for when th…
hauck-jvsh Jul 15, 2022
bf612f0
hint what to do to allow highlight of bigger documents
hauck-jvsh Jul 15, 2022
f8e3f2a
Let the user define the new parameter globally for all fields highlig…
hauck-jvsh Jul 15, 2022
53b3b87
Change the fieldMaxAnalyzedOffset Integer in order to use null when …
hauck-jvsh Jul 16, 2022
70cf4bb
Update javadocs and implements the stream methods for the new fields;
hauck-jvsh Jul 16, 2022
f21a416
builder.field does not accept null, so check before calling the method …
hauck-jvsh Jul 16, 2022
257cb18
Only send and read the new fields if the version supports it
hauck-jvsh Jul 16, 2022
7a03514
Merge branch 'opensearch-project:main' into main
hauck-jvsh Jul 16, 2022
cff3e7a
the previous commit was checking the wrong field
hauck-jvsh Jul 16, 2022
e33b80e
Check for version 3.0.0 instead of current version
hauck-jvsh Jul 16, 2022
24dec06
Update server/src/main/java/org/apache/lucene/search/uhighlight/Custo…
hauck-jvsh Jul 17, 2022
f2ef792
Merge branch 'opensearch-project:main' into main
hauck-jvsh Jul 20, 2022
4b25f5c
Execute the test after version 3.0.0
hauck-jvsh Jul 21, 2022
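Commit 257cb18 ("Only send and read the new fields if the version supports it") gates the new wire-format field on node version. A minimal standalone sketch of that pattern, using plain java.io streams rather than OpenSearch's StreamInput/StreamOutput; the class and the numeric version constant are invented for illustration:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;

public class VersionGatedField {
    // Made-up numeric id standing in for a "version 3.0.0" wire-format check.
    static final int V_3_0_0 = 3_000_099;

    // Writes the stream version, then the optional field only when the target
    // version understands it, so older readers never see unknown bytes.
    static byte[] write(int version, Integer maxAnalyzerOffset) {
        try {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            DataOutputStream out = new DataOutputStream(bytes);
            out.writeInt(version);
            if (version >= V_3_0_0) {
                out.writeBoolean(maxAnalyzerOffset != null); // presence flag
                if (maxAnalyzerOffset != null) {
                    out.writeInt(maxAnalyzerOffset);
                }
            }
            return bytes.toByteArray();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    // Mirrors write(): only reads the optional field for new-enough versions.
    static Integer read(byte[] data) {
        try {
            DataInputStream in = new DataInputStream(new ByteArrayInputStream(data));
            int version = in.readInt();
            if (version >= V_3_0_0 && in.readBoolean()) {
                return in.readInt();
            }
            return null; // old stream: fall back to the unset default
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    public static void main(String[] args) {
        System.out.println(read(write(V_3_0_0, 10)));     // prints 10
        System.out.println(read(write(V_3_0_0 - 1, 10))); // prints null
    }
}
```

Both sides must agree on the gate: writing the field to an old-version stream, or reading it from one, would desynchronize the wire format, which is why the PR checks the version in both the write and the read path.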
@@ -41,7 +41,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
@@ -56,6 +55,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
import org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
@@ -136,7 +136,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
-1
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
@@ -36,6 +36,17 @@ setup:
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Unified highlighter on a field WITHOUT OFFSETS using max_analyzer_offset should SUCCEED":
- skip:
version: "all"
reason: the parameter max_analyzer_offset is only supported starting with version 3.0
- do:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field1" : "quick"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {"max_analyzer_offset": 10}}}}
- match: {hits.hits.0.highlight.field1.0: "The <em>quick</em> brown fox went to the forest and saw another fox."}
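The test above caps analysis at offset 10 and still expects "quick" (which starts at offset 4) to be wrapped in <em> tags, while text past the cap is returned unhighlighted. A toy standalone sketch of that behavior, using plain substring search instead of the unified highlighter; the class and method are invented for illustration:

```java
// Toy model of max_analyzer_offset at the level of one stored field value:
// a term is only considered for highlighting if it STARTS before the limit.
public class MaxOffsetHighlight {

    /** Wraps every match of term that starts before maxOffset in <em> tags. */
    static String highlight(String text, String term, int maxOffset) {
        StringBuilder out = new StringBuilder();
        int from = 0;
        while (true) {
            int hit = text.indexOf(term, from);
            if (hit < 0 || hit >= maxOffset) {
                out.append(text.substring(from)); // rest of the text, untouched
                return out.toString();
            }
            out.append(text, from, hit).append("<em>").append(term).append("</em>");
            from = hit + term.length();
        }
    }

    public static void main(String[] args) {
        String doc = "The quick brown fox went to the forest and saw another fox.";
        // "quick" starts at offset 4, inside the limit of 10: highlighted.
        System.out.println(highlight(doc, "quick", 10));
        // "fox" first starts at offset 16, past the limit: left untouched.
        System.out.println(highlight(doc, "fox", 10));
    }
}
```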

---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
@@ -43,7 +43,6 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;
import org.opensearch.common.Nullable;
@@ -79,6 +78,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final int noMatchSize;
private final FieldHighlighter fieldHighlighter;
private final int maxAnalyzedOffset;
private final int fieldMaxAnalyzedOffset;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -113,7 +113,8 @@ public CustomUnifiedHighlighter(
int noMatchSize,
int maxPassages,
Predicate<String> fieldMatcher,
int maxAnalyzedOffset
int maxAnalyzedOffset,
Review thread:

@reta (Collaborator), Jul 15, 2022:
Question: do we need to keep track of both maxAnalyzedOffset and fieldMaxAnalyzedOffset? I think we could use Math.max(maxAnalyzedOffset, fieldMaxAnalyzedOffset), no? I believe the purpose of fieldMaxAnalyzedOffset is to override maxAnalyzedOffset on a per-query basis: it could be a lower or a higher value, right?

@hauck-jvsh (Contributor, author):
Actually, with this implementation it can only be lower. I don't know whether maxAnalyzedOffset has an impact when indexing, as it is an index option. Because of this, if fieldMaxAnalyzedOffset is higher than maxAnalyzedOffset and the field length is greater than maxAnalyzedOffset, an exception still occurs.

@reta (Collaborator):
Then we could go with min(fieldMaxAnalyzedOffset, maxAnalyzedOffset), right?

@hauck-jvsh (Contributor, author), Jul 15, 2022:
Right, but that would change the current behavior: today, if the text is bigger than maxAnalyzedOffset, an exception is thrown; with the suggested approach the request would pass, but nothing after maxAnalyzedOffset would be highlighted. To me that behavior is better, but it would break some tests. It was my original idea; I changed it so as not to break the tests.

@reta (Collaborator), Jul 15, 2022:
To be fair, it looks confusing (to me at least) when I look at how it is being used:

    if ((fieldMaxAnalyzedOffset < 0 || fieldMaxAnalyzedOffset > maxAnalyzedOffset)
            && (offsetSource == OffsetSource.ANALYSIS)
            && (fieldValueLength > maxAnalyzedOffset)) {

(fieldMaxAnalyzedOffset < 0 || fieldMaxAnalyzedOffset > maxAnalyzedOffset) is static information and could be precomputed to true or false in the constructor ahead of time. Now, if fieldMaxAnalyzedOffset < maxAnalyzedOffset, this compound condition evaluates to false no matter what. It looks to me like the intent should be:

    final boolean fieldMaxAnalyzedIsNotValid = (fieldMaxAnalyzedOffset < 0 || fieldMaxAnalyzedOffset > maxAnalyzedOffset);

    if ((offsetSource == OffsetSource.ANALYSIS)
            && (fieldValueLength > maxAnalyzedOffset || fieldMaxAnalyzedIsNotValid)) {

But in this case the exception is hiding two problems, fieldValueLength > maxAnalyzedOffset and fieldMaxAnalyzedIsNotValid, so we have to split these failures into two, with all contextual information provided.

@hauck-jvsh (Contributor, author):
Yes, this can be done in the constructor, and I think your if statement is exactly what I intended. I will update this.

@hauck-jvsh (Contributor, author):
Actually it should be

    if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedIsNotValid)) {

as it must only throw the exception when fieldMaxAnalyzedIsNotValid; when the per-field limit is valid, the text is cut to fit maxAnalyzedOffset.

@reta (Collaborator):
But what about the fieldValueLength > maxAnalyzedOffset check?

@hauck-jvsh (Contributor, author), Jul 15, 2022:
We can throw another exception for when fieldMaxAnalyzedOffset > maxAnalyzedOffset; I think that would indeed be better.

@hauck-jvsh (Contributor, author):
Please see if it is clearer now what is going on.

int fieldMaxAnalyzedOffset
) throws IOException {
super(searcher, analyzer);
this.offsetSource = offsetSource;
@@ -126,6 +127,7 @@ public CustomUnifiedHighlighter(
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
}

/**
@@ -141,7 +143,10 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<St
return null;
}
int fieldValueLength = fieldValue.length();
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {

if ((fieldMaxAnalyzedOffset < 0 || fieldMaxAnalyzedOffset > maxAnalyzedOffset)
&& (offsetSource == OffsetSource.ANALYSIS)
&& (fieldValueLength > maxAnalyzedOffset)) {
throw new IllegalArgumentException(
"The length of ["
+ field
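The review thread above converged on precomputing the validity of the per-field limit and reporting the two failure modes separately. A standalone sketch of that outcome, detached from Lucene; the class is invented for illustration, while the field names mirror the PR (maxAnalyzedOffset is the index setting, fieldMaxAnalyzedOffset the per-request max_analyzer_offset):

```java
public class OffsetValidation {
    final int maxAnalyzedOffset;      // index setting index.highlight.max_analyzed_offset
    final int fieldMaxAnalyzedOffset; // per-request max_analyzer_offset, -1 when unset
    final boolean fieldMaxAnalyzedIsNotValid; // precomputed once, per the review thread

    OffsetValidation(int maxAnalyzedOffset, int fieldMaxAnalyzedOffset) {
        this.maxAnalyzedOffset = maxAnalyzedOffset;
        this.fieldMaxAnalyzedOffset = fieldMaxAnalyzedOffset;
        this.fieldMaxAnalyzedIsNotValid =
            fieldMaxAnalyzedOffset < 0 || fieldMaxAnalyzedOffset > maxAnalyzedOffset;
    }

    // Returns normally when highlighting may proceed; otherwise reports one of
    // the two distinct failures the reviewers asked to keep separate.
    void check(String field, int fieldValueLength) {
        if (fieldMaxAnalyzedOffset > maxAnalyzedOffset) {
            // failure 1: the override itself is out of range
            throw new IllegalArgumentException("max_analyzer_offset for [" + field
                + "] must not exceed " + maxAnalyzedOffset);
        }
        if (fieldValueLength > maxAnalyzedOffset && fieldMaxAnalyzedIsNotValid) {
            // failure 2: text too long and no valid per-field cap was supplied
            throw new IllegalArgumentException("The length of [" + field + "] ("
                + fieldValueLength + ") exceeded the allowed maximum of "
                + maxAnalyzedOffset + "; set max_analyzer_offset to highlight anyway");
        }
        // otherwise: the text fits, or a valid max_analyzer_offset caps the analysis
    }

    public static void main(String[] args) {
        new OffsetValidation(1_000_000, 10).check("field1", 5_000_000); // ok: valid cap
        try {
            new OffsetValidation(1_000_000, -1).check("field1", 5_000_000);
        } catch (IllegalArgumentException e) {
            System.out.println("rejected: " + e.getMessage());
        }
    }
}
```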
@@ -92,6 +92,7 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField OPTIONS_FIELD = new ParseField("options");
public static final ParseField HIGHLIGHT_QUERY_FIELD = new ParseField("highlight_query");
public static final ParseField MATCHED_FIELDS_FIELD = new ParseField("matched_fields");
public static final ParseField MAX_ANALYZER_OFFSET_FIELD = new ParseField("max_analyzer_offset");

protected String[] preTags;

@@ -129,6 +130,8 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB

protected Boolean requireFieldMatch;

protected int maxAnalyzerOffset;

public AbstractHighlighterBuilder() {}

protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, QueryBuilder queryBuilder) {
@@ -150,6 +153,7 @@ protected AbstractHighlighterBuilder(AbstractHighlighterBuilder<?> template, Que
phraseLimit = template.phraseLimit;
options = template.options;
requireFieldMatch = template.requireFieldMatch;
maxAnalyzerOffset = template.maxAnalyzerOffset;
}

/**
@@ -542,6 +546,21 @@ public Integer phraseLimit() {
return this.phraseLimit;
}

/**
* Sets the maximum offset for the highlighter
* @param maxAnalyzerOffset the maximum offset that the highlighter will consider
* @return this for chaining
*/
@SuppressWarnings("unchecked")
public HB maxAnalyzerOffset(int maxAnalyzerOffset) {
this.maxAnalyzerOffset = maxAnalyzerOffset;
return (HB) this;
}

public int maxAnalyzerOffset() {
return this.maxAnalyzerOffset;
}
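The setter above casts this to the generic parameter HB so that chained calls keep the concrete builder type. A minimal standalone sketch of this self-typed ("CRTP-style") builder pattern; the class names are invented for illustration:

```java
public class SelfTypedBuilder {

    static abstract class AbstractBuilder<HB extends AbstractBuilder<HB>> {
        int maxAnalyzerOffset = -1; // -1 means "unset", matching the PR's default

        @SuppressWarnings("unchecked")
        public HB maxAnalyzerOffset(int maxAnalyzerOffset) {
            this.maxAnalyzerOffset = maxAnalyzerOffset;
            return (HB) this; // safe as long as each subclass binds HB to itself
        }

        public int maxAnalyzerOffset() {
            return maxAnalyzerOffset;
        }
    }

    static class FieldBuilder extends AbstractBuilder<FieldBuilder> {
        public FieldBuilder boldTags() { return this; } // subtype-specific option
    }

    public static void main(String[] args) {
        // Without the HB trick, maxAnalyzerOffset(...) would return the abstract
        // type and the chained .boldTags() call would not compile.
        FieldBuilder b = new FieldBuilder().maxAnalyzerOffset(10).boldTags();
        System.out.println(b.maxAnalyzerOffset()); // prints 10
    }
}
```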

/**
* Forces the highlighting to highlight fields based on the source even if fields are stored separately.
*/
@@ -623,6 +642,9 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException {
if (phraseLimit != null) {
builder.field(PHRASE_LIMIT_FIELD.getPreferredName(), phraseLimit);
}
if (maxAnalyzerOffset > 0) {
builder.field(MAX_ANALYZER_OFFSET_FIELD.getPreferredName(), maxAnalyzerOffset);
}
}

static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB, HB> setupParser(ObjectParser<HB, Void> parser) {
Expand All @@ -642,6 +664,7 @@ static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<XContentParser, HB
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
parser.declareBoolean(HB::forceSource, FORCE_SOURCE_FIELD);
parser.declareInt(HB::phraseLimit, PHRASE_LIMIT_FIELD);
parser.declareInt(HB::maxAnalyzerOffset, MAX_ANALYZER_OFFSET_FIELD);
parser.declareObject(HB::options, (XContentParser p, Void c) -> {
try {
return p.map();
@@ -93,6 +93,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
static final String[] DEFAULT_PRE_TAGS = new String[] { "<em>" };
/** the default closing tag */
static final String[] DEFAULT_POST_TAGS = new String[] { "</em>" };
static final int DEFAULT_MAX_ANALYZER_OFFSET = -1;

/** the default opening tags when {@code tag_schema = "styled"} */
public static final String[] DEFAULT_STYLED_PRE_TAG = {
@@ -126,6 +127,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilde
.boundaryScannerLocale(Locale.ROOT)
.noMatchSize(DEFAULT_NO_MATCH_SIZE)
.phraseLimit(DEFAULT_PHRASE_LIMIT)
.maxAnalyzerOffset(DEFAULT_MAX_ANALYZER_OFFSET)
.build();

private final List<Field> fields;
@@ -399,6 +401,9 @@ private static void transferOptions(
if (highlighterBuilder.highlightQuery != null) {
targetOptionsBuilder.highlightQuery(highlighterBuilder.highlightQuery.toQuery(context));
}
if (highlighterBuilder.maxAnalyzerOffset > 0) {
targetOptionsBuilder.maxAnalyzerOffset(highlighterBuilder.maxAnalyzerOffset);
}
}

static Character[] convertCharArray(char[] array) {
@@ -154,6 +154,12 @@ public static class FieldOptions {

private int phraseLimit = -1;

private int maxAnalyzerOffset = -1;

public int maxAnalyzerOffset() {
return maxAnalyzerOffset;
}

public int fragmentCharSize() {
return fragmentCharSize;
}
@@ -333,6 +339,11 @@ Builder phraseLimit(int phraseLimit) {
return this;
}

Builder maxAnalyzerOffset(int maxAnalyzerOffset) {
fieldOptions.maxAnalyzerOffset = maxAnalyzerOffset;
return this;
}

Builder matchedFields(Set<String> matchedFields) {
fieldOptions.matchedFields = matchedFields;
return this;
@@ -32,6 +32,8 @@
package org.opensearch.search.fetch.subphase.highlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
@@ -133,13 +135,40 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
}

public AnalyzerWrapper getLimitedOffsetAnalyzer(Analyzer a, int limit) {
return new AnalyzerWrapper(a.getReuseStrategy()) {

private Analyzer old = a;
private int maxOffset = limit;

@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return old;
}

@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
return new TokenStreamComponents(
components.getSource(),
new LimitTokenOffsetFilter(components.getTokenStream(), maxOffset)
);
}

};

}

CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html")
? HighlightUtils.Encoders.HTML
: HighlightUtils.Encoders.DEFAULT;
int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();
int fieldMaxAnalyzedOffset = fieldContext.field.fieldOptions().maxAnalyzerOffset();
int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments();
Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
if (fieldMaxAnalyzedOffset > 0) {
analyzer = getLimitedOffsetAnalyzer(analyzer, fieldMaxAnalyzedOffset);
}
PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
IndexSearcher searcher = fieldContext.context.searcher();
OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);
@@ -174,7 +203,8 @@ CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) th
fieldContext.field.fieldOptions().noMatchSize(),
higlighterNumberOfFragments,
fieldMatcher(fieldContext),
maxAnalyzedOffset
maxAnalyzedOffset,
fieldMaxAnalyzedOffset
);
}
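getLimitedOffsetAnalyzer above wraps the field analyzer so that Lucene's LimitTokenOffsetFilter drops tokens past max_analyzer_offset before they ever reach the highlighter. A plain-Java toy sketch of the effect, using a whitespace splitter in place of a real analyzer; the class and method are invented for illustration:

```java
import java.util.ArrayList;
import java.util.List;

// Toy model of an offset-limited token stream: tokens whose start offset has
// passed the limit are cut off, so nothing beyond max_analyzer_offset is analyzed.
public class LimitedOffsetTokens {

    /** Returns the whitespace tokens of text whose start offset is <= maxOffset. */
    static List<String> tokenize(String text, int maxOffset) {
        List<String> tokens = new ArrayList<>();
        int offset = 0;
        for (String token : text.split(" ")) {
            if (offset > maxOffset) break; // the filter cuts the stream here
            tokens.add(token);
            offset += token.length() + 1;  // +1 for the consumed space
        }
        return tokens;
    }

    public static void main(String[] args) {
        // With a limit of 10, tokenization stops once offsets pass 10.
        System.out.println(tokenize("The quick brown fox jumps", 10));
    }
}
```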

@@ -43,7 +43,6 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
@@ -63,6 +62,7 @@
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.common.Strings;
import org.opensearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.opensearch.test.OpenSearchTestCase;
@@ -117,7 +117,8 @@ private void assertHighlightOneDoc(
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
Integer.MAX_VALUE
Integer.MAX_VALUE,
-1
);
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(snippets.length, expectedPassages.length);