Prevent DefaultPassageFormatter from taking shorter overlapping passages (#13384)

2024-06-21 02:57:59 -07:00 · 2024-06-21 02:57:59 -07:00 · 06c4a4b9e0
parent c7f4b8dee2
commit 06c4a4b9e0
5 changed files with 42 additions and 8 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -280,6 +280,8 @@ Optimizations
 Bug Fixes
 ---------------------

+* GITHUB#13384: Fix DefaultPassageFormatter to highlight the full extent of overlapping matches instead of ending the highlight at a shorter overlapping match. (Zack Kendall)
+
 * GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
  some corner cases. (Greg Miller)

--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java
@ -76,10 +76,11 @@ public class DefaultPassageFormatter extends PassageFormatter {

        int end = passage.getMatchEnds()[i];
        assert end > start;
-        // its possible to have overlapping terms.
-        //   Look ahead to expand 'end' past all overlapping:
+        // It's possible to have overlapping terms.
+        //   Look ahead to expand 'end' past all overlapping.
+        //   Only take new end if it is larger than current end.
        while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
-          end = passage.getMatchEnds()[++i];
+          end = Math.max(end, passage.getMatchEnds()[++i]);
        }
        end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage

--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java
@ -18,6 +18,7 @@
 package org.apache.lucene.search.uhighlight;

 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.BytesRef;

 public class TestDefaultPassageFormatter extends LuceneTestCase {
  public void testBasic() throws Exception {
@ -52,4 +53,26 @@ public class TestDefaultPassageFormatter extends LuceneTestCase {
            + "&lt;&#x2F;div&gt; of this <u>very</u> formatter.\u2026 It&#x27;s not <u>very</u> N&#x2F;A!",
        formatter.format(passages, text));
  }
+
+  public void testOverlappingPassages() throws Exception {
+    // A shorter match nested inside a longer one must not cut the longer highlight short.
+    String content = "Yin yang loooooooooong, yin gap yang yong";
+    Passage[] passages = new Passage[1];
+    passages[0] = new Passage();
+    passages[0].setStartOffset(0);
+    passages[0].setEndOffset(41);
+    passages[0].setScore(5.93812f);
+    passages[0].addMatch(0, 3, new BytesRef("yin"), 1);
+    passages[0].addMatch(0, 22, new BytesRef("yin yang loooooooooong"), 1);
+    passages[0].addMatch(4, 8, new BytesRef("yang"), 1);
+    passages[0].addMatch(9, 22, new BytesRef("loooooooooong"), 1);
+    passages[0].addMatch(24, 27, new BytesRef("yin"), 1);
+    passages[0].addMatch(32, 36, new BytesRef("yang"), 1);

+    // Default formatter: the four matches starting before offset 22 merge into one highlight.
+    DefaultPassageFormatter formatter = new DefaultPassageFormatter();
+    assertEquals(
+        "<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong",
+        formatter.format(passages, content));
+  }
 }
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@ -1422,7 +1422,7 @@ public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
            Set.of("field_tripples", "field_characters"),
            "danc",
            "<b>danc</b>e with star",
-            "<b>da</b><b>n</b><b>c</b>e with star");
+            "<b>dan</b><b>c</b>e with star");
      }
    }
  }
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java
@ -183,13 +183,15 @@ public class TestUnifiedHighlighterStrictPhrases extends UnifiedHighlighterTestB
  }

  public void testWithSameTermQuery() throws IOException {
-    indexWriter.addDocument(newDoc("Yin yang, yin gap yang"));
+    indexWriter.addDocument(newDoc("Yin yang loooooooooong, yin gap yang yong"));
    initReaderSearcherHighlighter();

    BooleanQuery query =
        new BooleanQuery.Builder()
            .add(new TermQuery(new Term("body", "yin")), BooleanClause.Occur.MUST)
-            .add(newPhraseQuery("body", "yin yang"), BooleanClause.Occur.MUST)
+            .add(new TermQuery(new Term("body", "yang")), BooleanClause.Occur.MUST)
+            .add(new TermQuery(new Term("body", "loooooooooong")), BooleanClause.Occur.MUST)
+            .add(newPhraseQuery("body", "yin\\ yang\\ loooooooooong"), BooleanClause.Occur.MUST)
            // add queries for other fields; we shouldn't highlight these because of that.
            .add(new TermQuery(new Term("title", "yang")), BooleanClause.Occur.SHOULD)
            .build();
@ -199,9 +201,15 @@ public class TestUnifiedHighlighterStrictPhrases extends UnifiedHighlighterTestB
        false); // We don't want duplicates from "Yin" being in TermQuery & PhraseQuery.
    String[] snippets = highlighter.highlight("body", query, topDocs);
    if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) {
-      assertArrayEquals(new String[] {"<b>Yin yang</b>, <b>yin</b> gap yang"}, snippets);
+      assertArrayEquals(
+          new String[] {"<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong"},
+          snippets);
    } else {
-      assertArrayEquals(new String[] {"<b>Yin</b> <b>yang</b>, <b>yin</b> gap yang"}, snippets);
+      assertArrayEquals(
+          new String[] {
+            "<b>Yin</b> <b>yang</b> <b>loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong"
+          },
+          snippets);
    }
  }