From 8773725ac0f15962b39bcc1f6fc5fb331b7a6d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=A0=95=EC=8A=B9=ED=95=9C=28Seunghan=20Jung=29/Search=20?= =?UTF-8?q?Platform?= Date: Fri, 24 May 2024 02:45:17 +0900 Subject: [PATCH] UnifiedHighlighter: new passageSortComparator option (#13276) new 'passageSortComparator' option to allow sorting other than offset order --- lucene/CHANGES.txt | 2 ++ .../search/uhighlight/FieldHighlighter.java | 8 +++--- .../search/uhighlight/UnifiedHighlighter.java | 27 ++++++++++++++++--- .../TestUnifiedHighlighterExtensibility.java | 17 +++++++++--- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index db5a4ea2677..d5e8f12227c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -298,6 +298,8 @@ Improvements * GITHUB#13385: Add Intervals.noIntervals() method to produce an empty IntervalsSource. (Aniketh Jain, Uwe Schindler, Alan Woodward)) +* GITHUB#13276: UnifiedHighlighter: new 'passageSortComparator' option to allow sorting other than offset order. 
(Seunghan Jung) + Optimizations --------------------- diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java index aacb9089e91..60da92b2c7b 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java @@ -40,6 +40,7 @@ public class FieldHighlighter { protected final int maxPassages; protected final int maxNoHighlightPassages; protected final PassageFormatter passageFormatter; + protected final Comparator passageSortComparator; public FieldHighlighter( String field, @@ -48,7 +49,8 @@ public class FieldHighlighter { PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, - PassageFormatter passageFormatter) { + PassageFormatter passageFormatter, + Comparator passageSortComparator) { this.field = field; this.fieldOffsetStrategy = fieldOffsetStrategy; this.breakIterator = breakIterator; @@ -56,6 +58,7 @@ public class FieldHighlighter { this.maxPassages = maxPassages; this.maxNoHighlightPassages = maxNoHighlightPassages; this.passageFormatter = passageFormatter; + this.passageSortComparator = passageSortComparator; } public String getField() { @@ -191,8 +194,7 @@ public class FieldHighlighter { maybeAddPassage(passageQueue, passageScorer, passage, contentLength); Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]); - // sort in ascending order - Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset)); + Arrays.sort(passages, passageSortComparator); return passages; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index 69065933d7c..397239c7d66 100644 --- 
a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -21,6 +21,7 @@ import java.text.BreakIterator; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Comparator; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; @@ -86,6 +87,7 @@ import org.apache.lucene.util.InPlaceMergeSorter; *
  • {@link #getBreakIterator(String)}: Customize how the text is divided into passages. *
  • {@link #getScorer(String)}: Customize how passages are ranked. *
  • {@link #getFormatter(String)}: Customize how snippets are formatted. + *
  • {@link #getPassageSortComparator(String)}: Customize how the selected passages are finally sorted. * * *

    This is thread-safe, notwithstanding the setters. @@ -113,6 +115,8 @@ public class UnifiedHighlighter { private static final PassageScorer DEFAULT_PASSAGE_SCORER = new PassageScorer(); private static final PassageFormatter DEFAULT_PASSAGE_FORMATTER = new DefaultPassageFormatter(); private static final int DEFAULT_MAX_HIGHLIGHT_PASSAGES = -1; + private static final Comparator DEFAULT_PASSAGE_SORT_COMPARATOR = + Comparator.comparingInt(Passage::getStartOffset); protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher @@ -151,6 +155,8 @@ public class UnifiedHighlighter { private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Comparator passageSortComparator = DEFAULT_PASSAGE_SORT_COMPARATOR; + /** * Constructs the highlighter with the given index searcher and analyzer. * @@ -276,6 +282,7 @@ public class UnifiedHighlighter { private PassageFormatter formatter = DEFAULT_PASSAGE_FORMATTER; private int maxNoHighlightPassages = DEFAULT_MAX_HIGHLIGHT_PASSAGES; private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD; + private Comparator passageSortComparator = DEFAULT_PASSAGE_SORT_COMPARATOR; /** * Constructor for UH builder which accepts {@link IndexSearcher} and {@link Analyzer} objects. 
@@ -402,6 +409,11 @@ public class UnifiedHighlighter { return this; } + public Builder withPassageSortComparator(Comparator value) { + this.passageSortComparator = value; + return this; + } + public UnifiedHighlighter build() { return new UnifiedHighlighter(this); } @@ -463,6 +475,7 @@ public class UnifiedHighlighter { this.formatter = builder.formatter; this.maxNoHighlightPassages = builder.maxNoHighlightPassages; this.cacheFieldValCharsThreshold = builder.cacheFieldValCharsThreshold; + this.passageSortComparator = builder.passageSortComparator; } /** Extracts matching terms */ @@ -614,6 +627,11 @@ public class UnifiedHighlighter { return formatter; } + /** Returns the {@link Comparator} to use for finally sorting passages. */ + protected Comparator getPassageSortComparator(String field) { + return passageSortComparator; + } + /** * Returns the number of leading passages (as delineated by the {@link BreakIterator}) when no * highlights could be found. If it's less than 0 (the default) then this defaults to the {@code @@ -1119,7 +1137,8 @@ public class UnifiedHighlighter { getScorer(field), maxPassages, getMaxNoHighlightPassages(field), - getFormatter(field)); + getFormatter(field), + getPassageSortComparator(field)); } protected FieldHighlighter newFieldHighlighter( @@ -1129,7 +1148,8 @@ public class UnifiedHighlighter { PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, - PassageFormatter passageFormatter) { + PassageFormatter passageFormatter, + Comparator passageSortComparator) { return new FieldHighlighter( field, fieldOffsetStrategy, @@ -1137,7 +1157,8 @@ public class UnifiedHighlighter { passageScorer, maxPassages, maxNoHighlightPassages, - passageFormatter); + passageFormatter, + passageSortComparator); } protected UHComponents getHighlightComponents(String field, Query query, Set allTerms) { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java 
b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java index d6ce6464d4c..d12a2109101 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.uhighlight.visibility; import java.io.IOException; import java.text.BreakIterator; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Set; @@ -129,6 +130,11 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { return super.getFormatter(field); } + @Override + protected Comparator getPassageSortComparator(String field) { + return super.getPassageSortComparator(field); + } + @Override public Analyzer getIndexAnalyzer() { return super.getIndexAnalyzer(); @@ -186,7 +192,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { getScorer(field), maxPassages, getMaxNoHighlightPassages(field), - getFormatter(field)); + getFormatter(field), + getPassageSortComparator(field)); } @Override @@ -240,7 +247,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { public void testFieldHiglighterExtensibility() { final String fieldName = "fieldName"; FieldHighlighter fieldHighlighter = - new FieldHighlighter(fieldName, null, null, null, 1, 1, null) { + new FieldHighlighter(fieldName, null, null, null, 1, 1, null, null) { @Override protected Passage[] highlightOffsetsEnums(OffsetsEnum offsetsEnums) throws IOException { return super.highlightOffsetsEnums(offsetsEnums); @@ -262,7 +269,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, - PassageFormatter passageFormatter) { + PassageFormatter 
passageFormatter, + Comparator passageSortComparator) { super( field, fieldOffsetStrategy, @@ -270,7 +278,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { passageScorer, maxPassages, maxNoHighlightPassages, - passageFormatter); + passageFormatter, + passageSortComparator); } @Override