Skip to content

Commit

Permalink
Fix UnifiedHighlighter DefaultPassageFormatter for non-offset order p…
Browse files Browse the repository at this point in the history
…assages (apache#13832)

The ellipsis should have been inserted in more scenarios.
  • Loading branch information
Seunghan-Jung authored Oct 3, 2024
1 parent eaa6214 commit e3e3328
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 1 deletion.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,9 @@ Bug Fixes
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
of DoubleValues#doubleValue(). (Uwe Schindler)

* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
when they were not sorted by startOffset. (Seunghan Jung)

Changes in Runtime Behavior
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public String format(Passage[] passages, String content) {
int pos = 0;
for (Passage passage : passages) {
// don't add ellipsis if it's the first one, or if it's connected.
if (passage.getStartOffset() > pos && pos > 0) {
if (!sb.isEmpty() && passage.getStartOffset() != pos) {
sb.append(ellipsis);
}
pos = passage.getStartOffset();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,30 @@ public void testOverlappingPassages() throws Exception {
"<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong",
formatter.format(passages, content));
}

/**
 * Formats two adjacent passages that are handed to the formatter in descending start-offset order
 * (73..179 first, then 0..73) and checks that they are emitted in the given order, joined by the
 * "\n" separator passed to the formatter, with every match wrapped in {@code <b>}/{@code </b>}.
 */
public void testReversedStartOffsetOrder() {
  // The first sentence occupies offsets [0, 73); the second one occupies [73, 179).
  final String content =
      "When indexing data in Solr, each document is composed of various fields. "
          + "A document essentially represents a single record, and each document typically contains a unique ID field.";

  // Passage covering the second sentence; it is placed first in the array.
  final Passage secondSentence = new Passage();
  secondSentence.setStartOffset(73);
  secondSentence.setEndOffset(179);
  secondSentence.setScore(1.8846991f);
  secondSentence.addMatch(75, 83, new BytesRef("document"), 1);
  secondSentence.addMatch(133, 141, new BytesRef("document"), 1);

  // Passage covering the first sentence; it is placed second in the array.
  final Passage firstSentence = new Passage();
  firstSentence.setStartOffset(0);
  firstSentence.setEndOffset(73);
  firstSentence.setScore(1.5923802f);
  firstSentence.addMatch(33, 41, new BytesRef("document"), 1);

  final Passage[] passages = {secondSentence, firstSentence};

  final DefaultPassageFormatter formatter =
      new DefaultPassageFormatter("<b>", "</b>", "\n", false);
  final String formatted = formatter.format(passages, content);

  final String expected =
      "A <b>document</b> essentially represents a single record, and each <b>document</b> typically contains a unique ID field.\n"
          + "When indexing data in Solr, each <b>document</b> is composed of various fields. ";
  assertEquals(expected, formatted);
}
}

0 comments on commit e3e3328

Please sign in to comment.