LUCENE-9431: UnifiedHighlighter WEIGHT_MATCHES is now true by default (#362)

Co-authored-by: Animesh Pandey <apanimesh061@gmail.com>
This commit is contained in:
David Smiley 2021-10-22 20:40:22 -04:00 committed by GitHub
parent e3151d6c7d
commit 2719cf6630
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 51 additions and 54 deletions

View File

@ -160,6 +160,9 @@ API Changes
* LUCENE-9325: Sort is now final, and the `setSort()` method has been removed (Alan Woodward)
* LUCENE-9431: The UnifiedHighlighter's WEIGHT_MATCHES flag is now set by default, provided its
requirements are met. It can be disabled by overriding getFlags (Animesh Pandey, David Smiley)
* LUCENE-10158: Add a new interface Unwrappable to the utils package to allow code to
unwrap wrappers/delegators that are added by Lucene's testing framework. This will allow
testing new MMapDirectory implementation based on JDK Project Panama. (Uwe Schindler)

View File

@ -86,7 +86,7 @@ import org.apache.lucene.util.InPlaceMergeSorter;
* <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
* </ul>
*
* <p>This is thread-safe.
* <p>This is thread-safe, notwithstanding the setters.
*
* @lucene.experimental
*/
@ -823,6 +823,7 @@ public class UnifiedHighlighter {
return filteredTerms.toArray(new BytesRef[filteredTerms.size()]);
}
/** Customize the highlighting flags to use by field. */
protected Set<HighlightFlag> getFlags(String field) {
Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
if (shouldHandleMultiTermQuery(field)) {
@ -834,6 +835,11 @@ public class UnifiedHighlighter {
if (shouldPreferPassageRelevancyOverSpeed(field)) {
highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
}
if (highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
&& highlightFlags.contains(HighlightFlag.PHRASES)
&& highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) {
highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);
}
return highlightFlags;
}
@ -1168,9 +1174,11 @@ public class UnifiedHighlighter {
/**
* Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. It's
* more accurate to the query, though might not calculate passage relevancy as well. Use of this
* flag requires {@link #MULTI_TERM_QUERY} and {@link #PHRASES}. {@link
* #PASSAGE_RELEVANCY_OVER_SPEED} will be ignored. False by default.
* more accurate to the query, and the snippets can be a little different for phrases because
* the whole phrase is marked up instead of each word. The passage relevancy calculation can be
* different (maybe worse?) and it's slower when highlighting many fields. Use of this flag
* requires {@link #MULTI_TERM_QUERY} and {@link #PHRASES} and {@link
* #PASSAGE_RELEVANCY_OVER_SPEED}. True by default because those booleans are true by default.
*/
WEIGHT_MATCHES

View File

@ -460,6 +460,25 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
ir.close();
}
/**
 * Checks the flags reported by a {@link UnifiedHighlighter} that has not been customized: indexes
 * one document with a "body" field, then asserts that {@code getFlags("body")} contains {@code
 * PHRASES}, {@code MULTI_TERM_QUERY}, {@code PASSAGE_RELEVANCY_OVER_SPEED} and {@code
 * WEIGHT_MATCHES}, and that {@link HighlightFlag} declares exactly four constants.
 */
public void testHighlighterDefaultFlags() throws Exception {
  Document document = new Document();
  document.add(new Field("body", "test body", fieldType));

  // The writer is closed as soon as the reader has been obtained, and also when indexing throws.
  final IndexReader openedReader;
  try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer)) {
    iw.addDocument(document);
    openedReader = iw.getReader();
  }

  // try-with-resources so the reader is released even when one of the assertions below fails.
  try (IndexReader ir = openedReader) {
    IndexSearcher searcher = newSearcher(ir);
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
    Set<HighlightFlag> flags = highlighter.getFlags("body");
    assertTrue(flags.contains(HighlightFlag.PHRASES));
    assertTrue(flags.contains(HighlightFlag.MULTI_TERM_QUERY));
    assertTrue(flags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED));
    assertTrue(flags.contains(HighlightFlag.WEIGHT_MATCHES));
    // if more flags are added, bump the number below and add an assertTrue or assertFalse above
    assertEquals(4, HighlightFlag.values().length);
  }
}
public void testCuriousGeorge() throws Exception {
String text =
"Its the formula for success for preschoolers—Curious George and fire trucks! "

View File

@ -778,10 +778,15 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
snippets = highlighter.highlight("body", query, topDocs);
assertEquals(1, snippets.length);
// All flags are enabled.
assertEquals(
"<b>Test(body:te*)</b> a <b>one(body:*one*)</b> <b>sentence(body:zentence~~2)</b> document.",
"" + highlighter.getFlags("body"),
HighlightFlag.values().length,
highlighter.getFlags("body").size());
assertEquals(
"" + highlighter.getFlags("title"),
"<b>Test(body:te*)</b> a <b>one(body:*one*)</b> <b>sentence(sentence)</b> document.",
snippets[0]);
ir.close();
}

View File

@ -27,7 +27,6 @@ import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
@ -87,47 +86,10 @@ public class TestUnifiedHighlighterTermIntervals extends LuceneTestCase {
/**
 * Convenience overload: delegates to the four-argument {@code randomUnifiedHighlighter}, passing
 * an empty set of mandatory flags and {@code null} for {@code requireFieldMatch}.
 */
static UnifiedHighlighter randomUnifiedHighlighter(
IndexSearcher searcher, Analyzer indexAnalyzer) {
// NOTE(review): two consecutive return lines follow (a local call and a call through
// TestUnifiedHighlighter); this looks like both sides of a change shown together — confirm
// which one is intended to remain.
return randomUnifiedHighlighter(
return TestUnifiedHighlighter.randomUnifiedHighlighter(
searcher, indexAnalyzer, EnumSet.noneOf(HighlightFlag.class), null);
}
/**
 * Builds a {@link UnifiedHighlighter} whose {@code getFlags} result is picked randomly on first
 * use and then returned unchanged for every later call and every field.
 *
 * @param searcher searcher handed to the highlighter
 * @param indexAnalyzer analyzer handed to the highlighter
 * @param mandatoryFlags flags that are always part of the chosen flag set
 * @param requireFieldMatch {@code false} installs an accept-all field matcher; {@code null}
 *     installs it on a random coin flip; {@code true} leaves the field matcher untouched
 * @return the configured highlighter
 */
static UnifiedHighlighter randomUnifiedHighlighter(
    IndexSearcher searcher,
    Analyzer indexAnalyzer,
    EnumSet<HighlightFlag> mandatoryFlags,
    Boolean requireFieldMatch) {
  final UnifiedHighlighter uh =
      new UnifiedHighlighter(searcher, indexAnalyzer) {
        Set<HighlightFlag> flags; // consistently random set of flags for this test run

        @Override
        protected Set<HighlightFlag> getFlags(String field) {
          if (flags != null) {
            return flags;
          }
          final EnumSet<HighlightFlag> result = EnumSet.copyOf(mandatoryFlags);
          // One random int decides all optional flags: a flag is added when the bit at its
          // ordinal position is clear.
          int r = random().nextInt();
          for (HighlightFlag highlightFlag : HighlightFlag.values()) {
            if (((1 << highlightFlag.ordinal()) & r) == 0) {
              result.add(highlightFlag);
            }
          }
          if (result.contains(HighlightFlag.WEIGHT_MATCHES)) {
            // these two are required for WEIGHT_MATCHES
            result.add(HighlightFlag.MULTI_TERM_QUERY);
            result.add(HighlightFlag.PHRASES);
          }
          return flags = result;
        }
      };
  uh.setCacheFieldValCharsThreshold(random().nextInt(100));
  // Compare by value: reference identity (==) on a boxed Boolean misses any instance that is
  // not the canonical Boolean.FALSE. equals() is null-safe here, and the short-circuit order is
  // kept so random() is consulted only when requireFieldMatch is null.
  if (Boolean.FALSE.equals(requireFieldMatch)
      || (requireFieldMatch == null && random().nextBoolean())) {
    uh.setFieldMatcher(f -> true); // requireFieldMatch==false
  }
  return uh;
}
//
// Tests below were ported from the PostingsHighlighter. Possibly augmented. Far below are newer
// tests.
@ -1033,15 +995,15 @@ public class TestUnifiedHighlighterTermIntervals extends LuceneTestCase {
assertEquals(1, topDocs.totalHits.value);
String[] snippets = highlighter.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
if (highlighter.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) {
assertEquals(
"" + highlighter.getFlags("title"), "<b>This is the title field</b>.", snippets[0]);
} else {
assertEquals(
"" + highlighter.getFlags("title"),
"<b>This</b> <b>is</b> <b>the</b> title <b>field</b>.",
snippets[0]);
}
// All flags are enabled.
assertEquals(
"" + highlighter.getFlags("title"),
HighlightFlag.values().length,
highlighter.getFlags("title").size());
assertEquals(
"" + highlighter.getFlags("title"),
"<b>This</b> <b>is</b> <b>the</b> title <b>field</b>.",
snippets[0]);
ir.close();
}