LUCENE-7575: Add UnifiedHighlighter field matcher predicate (AKA requireFieldMatch=false)

(cherry picked from commit 2e948fe)
This commit is contained in:
David Smiley 2016-12-05 16:11:57 -05:00
parent cdce621087
commit 4e7a7dbf9a
7 changed files with 519 additions and 84 deletions

View File

@ -3,6 +3,57 @@ Lucene Change Log
For more information on past and future Lucene versions, please see:
http://s.apache.org/luceneversions
======================= Lucene 7.0.0 =======================
API Changes
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
Use setSplitOnWhitespace(true) to get the old behavior. (Steve Rowe)
* LUCENE-7369: Similarity.coord and BooleanQuery.disableCoord are removed.
(Adrien Grand)
* LUCENE-7368: Removed query normalization. (Adrien Grand)
* LUCENE-7355: AnalyzingQueryParser has been removed as its functionality has
been folded into the classic QueryParser. (Adrien Grand)
* LUCENE-7407: Doc values APIs have been switched from random access
to iterators, enabling future codec compression improvements. (Mike
McCandless)
* LUCENE-7475: Norms now support sparsity, so that you only pay for what is
actually used. (Adrien Grand)
* LUCENE-7494: Points now have a per-field API, like doc values. (Adrien Grand)
Bug Fixes
Improvements
* LUCENE-7489: Better storage of sparse doc-values fields with the default
codec. (Adrien Grand)
Optimizations
* LUCENE-7416: BooleanQuery optimizes queries containing sub-queries that occur
both in the SHOULD and FILTER clause sets, or both in MUST/FILTER and MUST_NOT
clauses. (Spyros Kapnissis via Adrien Grand, Uwe Schindler)
* LUCENE-7506: FastTaxonomyFacetCounts should use CPU in proportion to
the size of the intersected set of hits from the query and documents
that have a facet value, so sparse faceting works as expected
(Adrien Grand via Mike McCandless)
* LUCENE-7519: Add optimized APIs to compute browse-only top level
facets (Mike McCandless)
Other
* LUCENE-7328: Remove LegacyNumericEncoding from GeoPointField. (Nick Knize)
* LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
======================= Lucene 6.4.0 =======================
API Changes
@ -73,6 +124,11 @@ Improvements
* LUCENE-7537: Index time sorting now supports multi-valued sorts
using selectors (MIN, MAX, etc.) (Jim Ferenczi via Mike McCandless)
* LUCENE-7575: UnifiedHighlighter can now highlight fields with queries that don't
necessarily refer to that field (AKA requireFieldMatch==false). Disabled by default.
See UnifiedHighlighter's getFieldMatcher/setFieldMatcher. (Jim Ferenczi via David Smiley)
Optimizations
* LUCENE-7568: Optimize merging when index sorting is used but the

View File

@ -23,6 +23,7 @@ import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.FilteringTokenFilter;
@ -49,7 +50,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
private final LeafReader leafReader;
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
public MemoryIndexOffsetStrategy(String field, Predicate<String> fieldMatcher, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
CharacterRunAutomaton[] automata, Analyzer analyzer,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
super(field, extractedTerms, phraseHelper, automata, analyzer);
@ -57,13 +58,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
// preFilter for MemoryIndex
preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
preMemIndexFilterAutomaton = buildCombinedAutomaton(fieldMatcher, terms, this.automata, phraseHelper, multiTermQueryRewrite);
}
/**
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
*/
private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
private static CharacterRunAutomaton buildCombinedAutomaton(Predicate<String> fieldMatcher,
BytesRef[] terms,
CharacterRunAutomaton[] automata,
PhraseHelper strictPhrases,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
@ -74,7 +76,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
Collections.addAll(allAutomata, automata);
for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
Collections.addAll(allAutomata,
MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
MultiTermHighlighting.extractAutomata(spanQuery, fieldMatcher, true, multiTermQueryRewrite));//true==lookInSpan
}
if (allAutomata.size() == 1) {

View File

@ -22,6 +22,7 @@ import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
@ -56,50 +57,52 @@ class MultiTermHighlighting {
}
/**
* Extracts all MultiTermQueries for {@code field}, and returns equivalent
* automata that will match terms.
* Extracts MultiTermQueries that match the provided field predicate.
* Returns equivalent automata that will match terms.
*/
public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan,
public static CharacterRunAutomaton[] extractAutomata(Query query,
Predicate<String> fieldMatcher,
boolean lookInSpan,
Function<Query, Collection<Query>> preRewriteFunc) {
List<CharacterRunAutomaton> list = new ArrayList<>();
Collection<Query> customSubQueries = preRewriteFunc.apply(query);
if (customSubQueries != null) {
for (Query sub : customSubQueries) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (query instanceof BooleanQuery) {
for (BooleanClause clause : (BooleanQuery) query) {
if (!clause.isProhibited()) {
list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
}
}
} else if (query instanceof ConstantScoreQuery) {
list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan,
list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), fieldMatcher, lookInSpan,
preRewriteFunc)));
} else if (query instanceof DisjunctionMaxQuery) {
for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (lookInSpan && query instanceof SpanOrQuery) {
for (Query sub : ((SpanOrQuery) query).getClauses()) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (lookInSpan && query instanceof SpanNearQuery) {
for (Query sub : ((SpanNearQuery) query).getClauses()) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (lookInSpan && query instanceof SpanNotQuery) {
list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan,
list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), fieldMatcher, lookInSpan,
preRewriteFunc)));
} else if (lookInSpan && query instanceof SpanPositionCheckQuery) {
list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan,
list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), fieldMatcher, lookInSpan,
preRewriteFunc)));
} else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field,
lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(),
fieldMatcher, lookInSpan, preRewriteFunc)));
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (aq.getField().equals(field)) {
if (fieldMatcher.test(aq.getField())) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
@ -110,7 +113,7 @@ class MultiTermHighlighting {
} else if (query instanceof PrefixQuery) {
final PrefixQuery pq = (PrefixQuery) query;
Term prefix = pq.getPrefix();
if (prefix.field().equals(field)) {
if (fieldMatcher.test(prefix.field())) {
list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
Automata.makeAnyString())) {
@Override
@ -121,7 +124,7 @@ class MultiTermHighlighting {
}
} else if (query instanceof FuzzyQuery) {
final FuzzyQuery fq = (FuzzyQuery) query;
if (fq.getField().equals(field)) {
if (fieldMatcher.test(fq.getField())) {
String utf16 = fq.getTerm().text();
int termText[] = new int[utf16.codePointCount(0, utf16.length())];
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
@ -142,7 +145,7 @@ class MultiTermHighlighting {
}
} else if (query instanceof TermRangeQuery) {
final TermRangeQuery tq = (TermRangeQuery) query;
if (tq.getField().equals(field)) {
if (fieldMatcher.test(tq.getField())) {
final CharsRef lowerBound;
if (tq.getLowerTerm() == null) {
lowerBound = null;

View File

@ -16,17 +16,50 @@
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.*;
import java.util.function.Function;
/**
* Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlight phrases correctly).
* This is a stateful class holding information about the query, but it can be (and is) re-used across highlighting
@ -40,7 +73,7 @@ import java.util.function.Function;
public class PhraseHelper {
public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
spanQuery -> null, query -> null, true);
(s) -> false, spanQuery -> null, query -> null, true);
//TODO it seems this ought to be a general thing on Spans?
private static final Comparator<? super Spans> SPANS_COMPARATOR = (o1, o2) -> {
@ -59,10 +92,11 @@ public class PhraseHelper {
}
};
private final String fieldName; // if non-null, only look at queries/terms for this field
private final String fieldName;
private final Set<Term> positionInsensitiveTerms; // (TermQuery terms)
private final Set<SpanQuery> spanQueries;
private final boolean willRewrite;
private final Predicate<String> fieldMatcher;
/**
* Constructor.
@ -73,14 +107,15 @@ public class PhraseHelper {
* to be set before the {@link WeightedSpanTermExtractor}'s extraction is invoked.
* {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is
* usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones.
* {@code fieldMatcher} The field name predicate to use for extracting the query part that must be highlighted.
*/
public PhraseHelper(Query query, String field, Function<SpanQuery, Boolean> rewriteQueryPred,
public PhraseHelper(Query query, String field, Predicate<String> fieldMatcher, Function<SpanQuery, Boolean> rewriteQueryPred,
Function<Query, Collection<Query>> preExtractRewriteFunction,
boolean ignoreQueriesNeedingRewrite) {
this.fieldName = field; // if null then don't require field match
this.fieldName = field;
this.fieldMatcher = fieldMatcher;
// filter terms to those we want
positionInsensitiveTerms = field != null ? new FieldFilteringTermHashSet(field) : new HashSet<>();
// requireFieldMatch optional
positionInsensitiveTerms = new FieldFilteringTermSet();
spanQueries = new HashSet<>();
// TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls
@ -131,11 +166,11 @@ public class PhraseHelper {
@Override
protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
float boost) throws IOException {
if (field != null) {
// if this span query isn't for this field, skip it.
Set<String> fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1
collectSpanQueryFields(spanQuery, fieldNameSet);
if (!fieldNameSet.contains(field)) {
// if this span query isn't for this field, skip it.
Set<String> fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1
collectSpanQueryFields(spanQuery, fieldNameSet);
for (String spanField : fieldNameSet) {
if (!fieldMatcher.test(spanField)) {
return;
}
}
@ -190,10 +225,11 @@ public class PhraseHelper {
if (spanQueries.isEmpty()) {
return Collections.emptyMap();
}
final LeafReader filteredReader = new SingleFieldFilterLeafReader(leafReader, fieldName);
// for each SpanQuery, collect the member spans into a map.
Map<BytesRef, Spans> result = new HashMap<>();
for (SpanQuery spanQuery : spanQueries) {
getTermToSpans(spanQuery, leafReader.getContext(), doc, result);
getTermToSpans(spanQuery, filteredReader.getContext(), doc, result);
}
return result;
}
@ -203,15 +239,14 @@ public class PhraseHelper {
int doc, Map<BytesRef, Spans> result)
throws IOException {
// note: in WSTE there was some field specific looping that seemed pointless so that isn't here.
final IndexSearcher searcher = new IndexSearcher(readerContext);
final IndexSearcher searcher = new IndexSearcher(readerContext.reader());
searcher.setQueryCache(null);
if (willRewrite) {
spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done
}
// Get the underlying query terms
TreeSet<Term> termSet = new TreeSet<>(); // sorted so we can loop over results in order shortly...
TreeSet<Term> termSet = new FieldFilteringTermSet(); // sorted so we can loop over results in order shortly...
searcher.createWeight(spanQuery, false).extractTerms(termSet);//needsScores==false
// Get Spans by running the query against the reader
@ -240,9 +275,6 @@ public class PhraseHelper {
for (final Term queryTerm : termSet) {
// note: we expect that at least one query term will pass these filters. This is because the collected
// spanQuery list were already filtered by these conditions.
if (fieldName != null && fieldName.equals(queryTerm.field()) == false) {
continue;
}
if (positionInsensitiveTerms.contains(queryTerm)) {
continue;
}
@ -375,19 +407,17 @@ public class PhraseHelper {
}
/**
* Simple HashSet that filters out Terms not matching a desired field on {@code add()}.
* Simple TreeSet that filters out Terms not matching the provided predicate on {@code add()}.
*/
private static class FieldFilteringTermHashSet extends HashSet<Term> {
private final String field;
FieldFilteringTermHashSet(String field) {
this.field = field;
}
private class FieldFilteringTermSet extends TreeSet<Term> {
@Override
public boolean add(Term term) {
if (term.field().equals(field)) {
return super.add(term);
if (fieldMatcher.test(term.field())) {
if (term.field().equals(fieldName)) {
return super.add(term);
} else {
return super.add(new Term(fieldName, term.bytes()));
}
} else {
return false;
}
@ -499,6 +529,64 @@ public class PhraseHelper {
}
}
/**
* A {@link FilterLeafReader} that redirects per-field lookups to one fixed field of the wrapped
* LeafReader. This way we ensure that all queries going through this reader target the same field.
* <p>
* The {@code field} argument passed to the overridden accessors below is ignored; {@code fieldName}
* given at construction time is used instead. {@link #getFieldInfos()} is not supported.
*/
static final class SingleFieldFilterLeafReader extends FilterLeafReader {
// The only field this reader ever reads from, regardless of the field a caller asks for.
final String fieldName;
/**
* @param in the reader to wrap
* @param fieldName the field that every overridden per-field accessor is redirected to
*/
SingleFieldFilterLeafReader(LeafReader in, String fieldName) {
super(in);
this.fieldName = fieldName;
}
/**
* Not supported by this reader.
*
* @throws UnsupportedOperationException always
*/
@Override
public FieldInfos getFieldInfos() {
throw new UnsupportedOperationException();
}
/**
* Returns a view over the wrapped reader's fields that presents exactly one field:
* {@code fieldName}.
*/
@Override
public Fields fields() throws IOException {
return new FilterFields(super.fields()) {
// Whatever field is requested, answer with the terms of fieldName.
@Override
public Terms terms(String field) throws IOException {
return super.terms(fieldName);
}
// Iteration reports only fieldName.
@Override
public Iterator<String> iterator() {
return Collections.singletonList(fieldName).iterator();
}
// Consistent with iterator(): a single field is reported.
@Override
public int size() {
return 1;
}
};
}
// The doc-values and norms accessors below all ignore the requested field and read fieldName.
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
return super.getNumericDocValues(fieldName);
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
return super.getBinaryDocValues(fieldName);
}
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
return super.getSortedDocValues(fieldName);
}
@Override
public NumericDocValues getNormValues(String field) throws IOException {
return super.getNormValues(fieldName);
}
}
/**
* A Spans based on a list of cached spans for one doc. It is pre-positioned to this doc.
*/

View File

@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -31,6 +32,7 @@ import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
@ -58,7 +60,6 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
@ -119,13 +120,13 @@ public class UnifiedHighlighter {
private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy
// private boolean defaultRequireFieldMatch = true; TODO
private int maxLength = DEFAULT_MAX_LENGTH;
// BreakIterator is stateful so we use a Supplier factory method
private Supplier<BreakIterator> defaultBreakIterator = () -> BreakIterator.getSentenceInstance(Locale.ROOT);
private Predicate<String> defaultFieldMatcher;
private PassageScorer defaultScorer = new PassageScorer();
private PassageFormatter defaultFormatter = new DefaultPassageFormatter();
@ -140,8 +141,8 @@ public class UnifiedHighlighter {
/**
* Calls {@link Weight#extractTerms(Set)} on an empty index for the query.
*/
protected static SortedSet<Term> extractTerms(Query query) throws IOException {
SortedSet<Term> queryTerms = new TreeSet<>();
protected static Set<Term> extractTerms(Query query) throws IOException {
Set<Term> queryTerms = new HashSet<>();
EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);
return queryTerms;
}
@ -197,6 +198,10 @@ public class UnifiedHighlighter {
this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
}
/**
 * Sets the predicate returned by {@link #getFieldMatcher(String)} for every field.
 * The predicate is tested against the field names referenced by the query; passing {@code null}
 * makes {@link #getFieldMatcher(String)} fall back to its default, which only accepts the field
 * being highlighted (AKA requireFieldMatch=true).
 *
 * @param predicate the field name predicate to install, or {@code null} to restore the default
 */
public void setFieldMatcher(Predicate<String> predicate) {
this.defaultFieldMatcher = predicate;
}
/**
* Returns whether {@link MultiTermQuery} derivatives will be highlighted. By default it's enabled. MTQ
* highlighting can be expensive, particularly when using offsets in postings.
@ -220,6 +225,18 @@ public class UnifiedHighlighter {
return defaultPassageRelevancyOverSpeed;
}
/**
 * Supplies the predicate that decides which query fields contribute highlights for {@code field}.
 * If a matcher has been installed with {@link #setFieldMatcher(Predicate)} it is returned unchanged;
 * otherwise the result accepts only query fields equal to {@code field} (AKA requireFieldMatch=true).
 *
 * @param field the field being highlighted
 * @return the predicate to test query field names against
 */
protected Predicate<String> getFieldMatcher(String field) {
  final Predicate<String> configured = defaultFieldMatcher;
  // No explicit matcher configured: require the query field to be the highlighted field.
  return configured != null ? configured : (queryField) -> field.equals(queryField);
}
/**
* The maximum content size to process. Content will be truncated to this size before highlighting. Typically
@ -548,7 +565,7 @@ public class UnifiedHighlighter {
copyAndSortFieldsWithMaxPassages(fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params
// Init field highlighters (where most of the highlight logic lives, and on a per field basis)
SortedSet<Term> queryTerms = extractTerms(query);
Set<Term> queryTerms = extractTerms(query);
FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
int numTermVectors = 0;
int numPostings = 0;
@ -718,13 +735,13 @@ public class UnifiedHighlighter {
getClass().getSimpleName() + " without an IndexSearcher.");
}
Objects.requireNonNull(content, "content is required");
SortedSet<Term> queryTerms = extractTerms(query);
Set<Term> queryTerms = extractTerms(query);
return getFieldHighlighter(field, query, queryTerms, maxPassages)
.highlightFieldForDoc(null, -1, content);
}
protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(field, allTerms);
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
@ -738,19 +755,15 @@ public class UnifiedHighlighter {
getFormatter(field));
}
protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
// TODO consider requireFieldMatch
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
// Strip off the redundant field:
BytesRef[] terms = new BytesRef[fieldTerms.size()];
int termUpto = 0;
for (Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
protected static BytesRef[] filterExtractedTerms(Predicate<String> fieldMatcher, Set<Term> queryTerms) {
// Strip off the redundant field and sort the remaining terms
SortedSet<BytesRef> filteredTerms = new TreeSet<>();
for (Term term : queryTerms) {
if (fieldMatcher.test(term.field())) {
filteredTerms.add(term.bytes());
}
}
return terms;
return filteredTerms.toArray(new BytesRef[filteredTerms.size()]);
}
protected Set<HighlightFlag> getFlags(String field) {
@ -771,14 +784,13 @@ public class UnifiedHighlighter {
boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
return highlightPhrasesStrictly ?
new PhraseHelper(query, field, this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) :
PhraseHelper.NONE;
new PhraseHelper(query, field, getFieldMatcher(field),
this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : PhraseHelper.NONE;
}
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
this::preMultiTermQueryRewrite)
? MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), !highlightFlags.contains(HighlightFlag.PHRASES), this::preMultiTermQueryRewrite)
: ZERO_LEN_AUTOMATA_ARRAY;
}
@ -826,7 +838,7 @@ public class UnifiedHighlighter {
//skip using a memory index since it's pure term filtering
return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
} else {
return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
return new MemoryIndexOffsetStrategy(field, getFieldMatcher(field), terms, phraseHelper, automata, getIndexAnalyzer(),
this::preMultiTermQueryRewrite);
}
case NONE_NEEDED:

View File

@ -25,6 +25,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.apache.lucene.analysis.MockAnalyzer;
@ -32,14 +33,17 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
@ -959,4 +963,275 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
ir.close();
}
/**
 * Indexes one document with three fields ({@code title}, {@code text} and {@code category}),
 * all created with the test's {@code fieldType}, and returns a reader over the resulting index.
 * The writer is closed before returning; the caller is responsible for closing the reader.
 */
private IndexReader indexSomeFields() throws IOException {
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);

  // Fields are created empty and attached to the document first; their values are set afterwards.
  Field title = new Field("title", "", fieldType);
  Field text = new Field("text", "", fieldType);
  Field category = new Field("category", "", fieldType);

  Document doc = new Document();
  doc.add(title);
  doc.add(text);
  doc.add(category);

  title.setStringValue("This is the title field.");
  text.setStringValue("This is the text field. You can put some text if you want.");
  category.setStringValue("This is the category field.");
  iw.addDocument(doc);

  IndexReader ir = iw.getReader();
  iw.close();
  return ir;
}
/**
 * Highlights each of the three indexed fields with a disjunction of term queries that target
 * different fields, and checks the snippet produced by: a highlighter whose matcher accepts any
 * query field, a highlighter using the default matcher, and that same highlighter with a matcher
 * temporarily set to another field's name (reset to {@code null} after each check).
 */
public void testFieldMatcherTermQuery() throws Exception {
  IndexReader reader = indexSomeFields();
  IndexSearcher searcher = newSearcher(reader);
  UnifiedHighlighter anyFieldHighlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
    @Override
    protected Predicate<String> getFieldMatcher(String field) {
      // requireFieldMatch=false
      return (qf) -> true;
    }
  };
  UnifiedHighlighter sameFieldHighlighter = new UnifiedHighlighter(searcher, indexAnalyzer);

  // {field, term} pairs; each becomes an optional (SHOULD) TermQuery clause, in this order.
  String[][] fieldTermPairs = {
      {"text", "some"},
      {"text", "field"},
      {"text", "this"},
      {"title", "is"},
      {"title", "this"},
      {"category", "this"},
      {"category", "some"},
      {"category", "category"},
  };
  BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
  for (String[] pair : fieldTermPairs) {
    queryBuilder.add(new TermQuery(new Term(pair[0], pair[1])), BooleanClause.Occur.SHOULD);
  }
  Query query = queryBuilder.build();

  // title
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);
    assertOnlySnippet("<b>This</b> <b>is</b> the title <b>field</b>.",
        anyFieldHighlighter.highlight("title", query, hits, 10));
    assertOnlySnippet("<b>This</b> <b>is</b> the title field.",
        sameFieldHighlighter.highlight("title", query, hits, 10));
    // Only clauses on "text" may now contribute highlights.
    sameFieldHighlighter.setFieldMatcher("text"::equals);
    assertOnlySnippet("<b>This</b> is the title <b>field</b>.",
        sameFieldHighlighter.highlight("title", query, hits, 10));
    sameFieldHighlighter.setFieldMatcher(null);
  }

  // text
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);
    assertOnlySnippet("<b>This</b> <b>is</b> the text <b>field</b>. You can put <b>some</b> text if you want.",
        anyFieldHighlighter.highlight("text", query, hits, 10));
    assertOnlySnippet("<b>This</b> is the text <b>field</b>. You can put <b>some</b> text if you want.",
        sameFieldHighlighter.highlight("text", query, hits, 10));
    // Only clauses on "title" may now contribute highlights.
    sameFieldHighlighter.setFieldMatcher("title"::equals);
    assertOnlySnippet("<b>This</b> <b>is</b> the text field. ",
        sameFieldHighlighter.highlight("text", query, hits, 10));
    sameFieldHighlighter.setFieldMatcher(null);
  }

  // category
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);
    assertOnlySnippet("<b>This</b> <b>is</b> the <b>category</b> <b>field</b>.",
        anyFieldHighlighter.highlight("category", query, hits, 10));
    assertOnlySnippet("<b>This</b> is the <b>category</b> field.",
        sameFieldHighlighter.highlight("category", query, hits, 10));
    // Only clauses on "title" may now contribute highlights.
    sameFieldHighlighter.setFieldMatcher("title"::equals);
    assertOnlySnippet("<b>This</b> <b>is</b> the category field.",
        sameFieldHighlighter.highlight("category", query, hits, 10));
    sameFieldHighlighter.setFieldMatcher(null);
  }
  reader.close();
}

/** Asserts that {@code snippets} holds exactly one entry and that it equals {@code expected}. */
private static void assertOnlySnippet(String expected, String[] snippets) {
  assertEquals(1, snippets.length);
  assertEquals(expected, snippets[0]);
}
/**
 * Highlights each of the three indexed fields against a query mixing fuzzy, prefix and plain
 * term clauses that target "text", "title" and "category", and checks the snippet produced by:
 * a highlighter that accepts clauses from every field, a highlighter with its stock matcher,
 * and that same highlighter once an explicit single-field matcher has been installed.
 */
public void testFieldMatcherMultiTermQuery() throws Exception {
  IndexReader reader = indexSomeFields();
  IndexSearcher searcher = newSearcher(reader);

  // Equivalent of requireFieldMatch=false: a clause on any query field may highlight any field.
  UnifiedHighlighter anyFieldHighlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
    @Override
    protected Predicate<String> getFieldMatcher(String field) {
      return queryField -> true;
    }
  };
  // Stock highlighter; the expectations below show it only honours clauses on the highlighted field.
  UnifiedHighlighter sameFieldHighlighter = new UnifiedHighlighter(searcher, indexAnalyzer);

  // Explicit matchers that accept clauses from exactly one query field.
  Predicate<String> textClausesOnly = "text"::equals;
  Predicate<String> titleClausesOnly = "title"::equals;

  BooleanClause.Occur should = BooleanClause.Occur.SHOULD;
  BooleanQuery.Builder clauses = new BooleanQuery.Builder();
  clauses.add(new FuzzyQuery(new Term("text", "sime"), 1), should);
  clauses.add(new PrefixQuery(new Term("text", "fie")), should);
  clauses.add(new PrefixQuery(new Term("text", "thi")), should);
  clauses.add(new TermQuery(new Term("title", "is")), should);
  clauses.add(new PrefixQuery(new Term("title", "thi")), should);
  clauses.add(new PrefixQuery(new Term("category", "thi")), should);
  clauses.add(new FuzzyQuery(new Term("category", "sime"), 1), should);
  clauses.add(new PrefixQuery(new Term("category", "categ")), should);
  Query query = clauses.build();

  // --- "title" field ---
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);

    String[] fromAllClauses = anyFieldHighlighter.highlight("title", query, hits, 10);
    assertEquals(1, fromAllClauses.length);
    assertEquals("<b>This</b> <b>is</b> the title <b>field</b>.", fromAllClauses[0]);

    String[] fromOwnClauses = sameFieldHighlighter.highlight("title", query, hits, 10);
    assertEquals(1, fromOwnClauses.length);
    assertEquals("<b>This</b> <b>is</b> the title field.", fromOwnClauses[0]);

    // Highlight "title" using only the clauses written against "text".
    sameFieldHighlighter.setFieldMatcher(textClausesOnly);
    String[] fromTextClauses = sameFieldHighlighter.highlight("title", query, hits, 10);
    assertEquals(1, fromTextClauses.length);
    assertEquals("<b>This</b> is the title <b>field</b>.", fromTextClauses[0]);
    // Drop the explicit matcher so the next section starts from the stock behaviour again.
    sameFieldHighlighter.setFieldMatcher(null);
  }

  // --- "text" field ---
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);

    String[] fromAllClauses = anyFieldHighlighter.highlight("text", query, hits, 10);
    assertEquals(1, fromAllClauses.length);
    assertEquals("<b>This</b> <b>is</b> the text <b>field</b>. You can put <b>some</b> text if you want.", fromAllClauses[0]);

    String[] fromOwnClauses = sameFieldHighlighter.highlight("text", query, hits, 10);
    assertEquals(1, fromOwnClauses.length);
    assertEquals("<b>This</b> is the text <b>field</b>. You can put <b>some</b> text if you want.", fromOwnClauses[0]);

    // Highlight "text" using only the clauses written against "title".
    sameFieldHighlighter.setFieldMatcher(titleClausesOnly);
    String[] fromTitleClauses = sameFieldHighlighter.highlight("text", query, hits, 10);
    assertEquals(1, fromTitleClauses.length);
    // Only the first sentence comes back here, including its trailing space.
    assertEquals("<b>This</b> <b>is</b> the text field. ", fromTitleClauses[0]);
    sameFieldHighlighter.setFieldMatcher(null);
  }

  // --- "category" field ---
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);

    String[] fromAllClauses = anyFieldHighlighter.highlight("category", query, hits, 10);
    assertEquals(1, fromAllClauses.length);
    assertEquals("<b>This</b> <b>is</b> the <b>category</b> <b>field</b>.", fromAllClauses[0]);

    String[] fromOwnClauses = sameFieldHighlighter.highlight("category", query, hits, 10);
    assertEquals(1, fromOwnClauses.length);
    assertEquals("<b>This</b> is the <b>category</b> field.", fromOwnClauses[0]);

    // Highlight "category" using only the clauses written against "title".
    sameFieldHighlighter.setFieldMatcher(titleClausesOnly);
    String[] fromTitleClauses = sameFieldHighlighter.highlight("category", query, hits, 10);
    assertEquals(1, fromTitleClauses.length);
    assertEquals("<b>This</b> <b>is</b> the category field.", fromTitleClauses[0]);
    sameFieldHighlighter.setFieldMatcher(null);
  }

  reader.close();
}
/**
 * Highlights each of the three indexed fields against a query made of exact and sloppy phrase
 * clauses on "title", "category" and "text", and checks the snippet produced by: a highlighter
 * that accepts clauses from every field, a highlighter with its stock matcher, and that same
 * highlighter once an explicit single-field matcher has been installed.
 */
public void testFieldMatcherPhraseQuery() throws Exception {
  IndexReader reader = indexSomeFields();
  IndexSearcher searcher = newSearcher(reader);

  // Equivalent of requireFieldMatch=false: a clause on any query field may highlight any field.
  UnifiedHighlighter anyFieldHighlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
    @Override
    protected Predicate<String> getFieldMatcher(String field) {
      return queryField -> true;
    }
  };
  // Stock highlighter; the expectations below show it only honours clauses on the highlighted field.
  UnifiedHighlighter sameFieldHighlighter = new UnifiedHighlighter(searcher, indexAnalyzer);

  // Explicit matchers that accept clauses from exactly one query field.
  Predicate<String> textClausesOnly = "text"::equals;
  Predicate<String> titleClausesOnly = "title"::equals;

  BooleanClause.Occur should = BooleanClause.Occur.SHOULD;
  BooleanQuery.Builder clauses = new BooleanQuery.Builder();
  clauses.add(new PhraseQuery("title", "this", "is", "the", "title"), should);
  // Leading int argument is the phrase slop.
  clauses.add(new PhraseQuery(2, "category", "this", "is", "the", "field"), should);
  clauses.add(new PhraseQuery("text", "this", "is"), should);
  clauses.add(new PhraseQuery("category", "this", "is"), should);
  clauses.add(new PhraseQuery(1, "text", "you", "can", "put", "text"), should);
  Query query = clauses.build();

  // --- "title" field ---
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);

    String[] fromAllClauses = anyFieldHighlighter.highlight("title", query, hits, 10);
    assertEquals(1, fromAllClauses.length);
    assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>title</b> <b>field</b>.", fromAllClauses[0]);

    String[] fromOwnClauses = sameFieldHighlighter.highlight("title", query, hits, 10);
    assertEquals(1, fromOwnClauses.length);
    assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>title</b> field.", fromOwnClauses[0]);

    // Highlight "title" using only the phrases written against "text".
    sameFieldHighlighter.setFieldMatcher(textClausesOnly);
    String[] fromTextClauses = sameFieldHighlighter.highlight("title", query, hits, 10);
    assertEquals(1, fromTextClauses.length);
    assertEquals("<b>This</b> <b>is</b> the title field.", fromTextClauses[0]);
    // Drop the explicit matcher so the next section starts from the stock behaviour again.
    sameFieldHighlighter.setFieldMatcher(null);
  }

  // --- "text" field ---
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);

    String[] fromAllClauses = anyFieldHighlighter.highlight("text", query, hits, 10);
    assertEquals(1, fromAllClauses.length);
    assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>text</b> <b>field</b>. <b>You</b> <b>can</b> <b>put</b> some <b>text</b> if you want.", fromAllClauses[0]);

    String[] fromOwnClauses = sameFieldHighlighter.highlight("text", query, hits, 10);
    assertEquals(1, fromOwnClauses.length);
    assertEquals("<b>This</b> <b>is</b> the <b>text</b> field. <b>You</b> <b>can</b> <b>put</b> some <b>text</b> if you want.", fromOwnClauses[0]);

    // Highlight "text" using only the phrase written against "title"; nothing is expected to be marked.
    sameFieldHighlighter.setFieldMatcher(titleClausesOnly);
    String[] fromTitleClauses = sameFieldHighlighter.highlight("text", query, hits, 10);
    assertEquals(1, fromTitleClauses.length);
    assertEquals("This is the text field. You can put some text if you want.", fromTitleClauses[0]);
    sameFieldHighlighter.setFieldMatcher(null);
  }

  // --- "category" field ---
  {
    TopDocs hits = searcher.search(query, 10, Sort.INDEXORDER);
    assertEquals(1, hits.totalHits);

    String[] fromAllClauses = anyFieldHighlighter.highlight("category", query, hits, 10);
    assertEquals(1, fromAllClauses.length);
    assertEquals("<b>This</b> <b>is</b> <b>the</b> category <b>field</b>.", fromAllClauses[0]);

    String[] fromOwnClauses = sameFieldHighlighter.highlight("category", query, hits, 10);
    assertEquals(1, fromOwnClauses.length);
    assertEquals("<b>This</b> <b>is</b> <b>the</b> category <b>field</b>.", fromOwnClauses[0]);

    // Highlight "category" using only the phrases written against "text".
    sameFieldHighlighter.setFieldMatcher(textClausesOnly);
    String[] fromTextClauses = sameFieldHighlighter.highlight("category", query, hits, 10);
    assertEquals(1, fromTextClauses.length);
    assertEquals("<b>This</b> <b>is</b> the category field.", fromTextClauses[0]);
    sameFieldHighlighter.setFieldMatcher(null);
  }

  reader.close();
}
}

View File

@ -23,7 +23,6 @@ import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
@ -144,7 +143,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
}
@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
return super.getFieldHighlighter(field, query, allTerms, maxPassages);
}