From a29d12f417c5d41cb6c5f882581907a87afcb7b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=A0=95=EC=8A=B9=ED=95=9C=28Seunghan=20Jung=29/Search=20?= =?UTF-8?q?Platform?= Date: Thu, 3 Oct 2024 15:00:09 +0900 Subject: [PATCH] Fix UnifiedHighlighter DefaultPassageFormatter for non-offset order passages (#13832) The ellipsis should have been inserted in more scenarios. (cherry picked from commit e3e3328a550a8f8760cc3aa213ebf7e7140399fb) --- lucene/CHANGES.txt | 3 ++- .../uhighlight/DefaultPassageFormatter.java | 2 +- .../TestDefaultPassageFormatter.java | 26 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6eb82b14c3b..5c22049f7aa 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -23,7 +23,8 @@ Optimizations Bug Fixes --------------------- -(No changes) +* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended + when they were not sorted by startOffset. (Seunghan Jung) Other --------------------- diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java index 27281a91be7..4cd2b07fc1c 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java @@ -64,7 +64,7 @@ public class DefaultPassageFormatter extends PassageFormatter { int pos = 0; for (Passage passage : passages) { // don't add ellipsis if its the first one, or if its connected. - if (passage.getStartOffset() > pos && pos > 0) { + if (!sb.isEmpty() && passage.getStartOffset() != pos) { sb.append(ellipsis); } pos = passage.getStartOffset(); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java index b59fea47453..617077c987c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java @@ -75,4 +75,30 @@ public class TestDefaultPassageFormatter extends LuceneTestCase { "Yin yang loooooooooong, yin gap yang yong", formatter.format(passages, content)); } + + public void testReversedStartOffsetOrder() { + String content = + "When indexing data in Solr, each document is composed of various fields. " + + "A document essentially represents a single record, and each document typically contains a unique ID field."; + + Passage[] passages = new Passage[2]; + passages[0] = new Passage(); + passages[0].setStartOffset(73); + passages[0].setEndOffset(179); + passages[0].setScore(1.8846991f); + passages[0].addMatch(75, 83, new BytesRef("document"), 1); + passages[0].addMatch(133, 141, new BytesRef("document"), 1); + + passages[1] = new Passage(); + passages[1].setStartOffset(0); + passages[1].setEndOffset(73); + passages[1].setScore(1.5923802f); + passages[1].addMatch(33, 41, new BytesRef("document"), 1); + + DefaultPassageFormatter formatter = new DefaultPassageFormatter("", "", "\n", false); + assertEquals( + "A document essentially represents a single record, and each document typically contains a unique ID field.\n" + + "When indexing data in Solr, each document is composed of various fields. ", + formatter.format(passages, content)); + } }