Fix UnifiedHighlighter DefaultPassageFormatter for non-offset order passages (apache#13832)

The ellipsis should have been inserted in more scenarios.
javanna · Oct 3, 2024 · e3e3328 · e3e3328
1 parent eaa6214
commit e3e3328
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 1 deletion.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -270,6 +270,9 @@ Bug Fixes
 * GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
   of DoubleValues#doubleValue(). (Uwe Schindler)
 
+* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
+  when they were not sorted by startOffset. (Seunghan Jung)
+
 Changes in Runtime Behavior
 ---------------------
 

diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
@@ -64,7 +64,7 @@ public String format(Passage[] passages, String content) {
     int pos = 0;
     for (Passage passage : passages) {
       // don't add ellipsis if its the first one, or if its connected.
-      if (passage.getStartOffset() > pos && pos > 0) {
+      if (!sb.isEmpty() && passage.getStartOffset() != pos) {
         sb.append(ellipsis);
       }
       pos = passage.getStartOffset();

diff --git a/...highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java b/...highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java
@@ -75,4 +75,30 @@ public void testOverlappingPassages() throws Exception {
         "<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong",
         formatter.format(passages, content));
   }
+
+  public void testReversedStartOffsetOrder() {
+    String content =
+        "When indexing data in Solr, each document is composed of various fields. "
+            + "A document essentially represents a single record, and each document typically contains a unique ID field.";
+
+    Passage[] passages = new Passage[2];
+    passages[0] = new Passage();
+    passages[0].setStartOffset(73);
+    passages[0].setEndOffset(179);
+    passages[0].setScore(1.8846991f);
+    passages[0].addMatch(75, 83, new BytesRef("document"), 1);
+    passages[0].addMatch(133, 141, new BytesRef("document"), 1);
+
+    passages[1] = new Passage();
+    passages[1].setStartOffset(0);
+    passages[1].setEndOffset(73);
+    passages[1].setScore(1.5923802f);
+    passages[1].addMatch(33, 41, new BytesRef("document"), 1);
+
+    DefaultPassageFormatter formatter = new DefaultPassageFormatter("<b>", "</b>", "\n", false);
+    assertEquals(
+        "A <b>document</b> essentially represents a single record, and each <b>document</b> typically contains a unique ID field.\n"
+            + "When indexing data in Solr, each <b>document</b> is composed of various fields. ",
+        formatter.format(passages, content));
+  }
 }