LUCENE-8121: UH switch to SpanCollector API. Better accuracy.

* Use the filtered freq in position sensitive terms (better scores)
* Refactored UH's OffsetsEnum
* Improved test randomization in TestUnifiedHighlighter & MTQ
David Smiley 2018-01-10 22:49:40 -05:00
parent 6a55def1ea
commit 352ec01a6e
11 changed files with 524 additions and 616 deletions


@ -119,6 +119,10 @@ Improvements
* LUCENE-8094: TermInSetQuery.toString now returns "field:(A B C)" (Mike McCandless)
* LUCENE-8121: UnifiedHighlighter passage relevancy is improved for terms that are
position sensitive (e.g. part of a phrase) by having an accurate freq.
(David Smiley)
Bug Fixes
* LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.
@ -127,6 +131,12 @@ Bug Fixes
* SOLR-11758: Fixed FloatDocValues.boolVal to correctly return true for all values != 0.0F
(Munendra S N via hossman)
* LUCENE-8121: The UnifiedHighlighter would highlight some terms within some nested
SpanNearQueries at positions where it should not have. It's fixed in the UH by
switching to the SpanCollector API. The original Highlighter still has this
problem (LUCENE-2287, LUCENE-5455, LUCENE-6796). Some public but internal parts of
the UH were refactored. (David Smiley, Steve Davids)
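For context, a hedged illustration (not part of the patch) of the kind of nested SpanNearQuery this entry describes; with this fix the UnifiedHighlighter only highlights term occurrences at positions that participate in an actual span match. The field and terms below are hypothetical:

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

class NestedSpanExample {
  // "alpha bravo" (adjacent, in order), within 1 position of "charlie".
  // Previously, stray occurrences of "alpha" or "bravo" elsewhere in the text
  // could be highlighted even though they matched no span.
  static SpanQuery build() {
    SpanQuery inner = new SpanNearQuery(new SpanQuery[]{
        new SpanTermQuery(new Term("body", "alpha")),
        new SpanTermQuery(new Term("body", "bravo"))}, 0, true);
    return new SpanNearQuery(new SpanQuery[]{
        inner,
        new SpanTermQuery(new Term("body", "charlie"))}, 1, true);
  }
}
```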
Other
* LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.


@ -20,6 +20,7 @@ import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
@ -136,13 +137,15 @@ public class FieldHighlighter {
BreakIterator breakIterator = this.breakIterator;
final int contentLength = breakIterator.getText().getEndIndex();
//TODO consider moving this part to an aggregate OffsetsEnum subclass so we have one enum that already has its weight
PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1);
for (OffsetsEnum off : offsetsEnums) {
off.setWeight(scorer.weight(contentLength, off.freq()));
off.nextPosition(); // go to first position
offsetsEnumQueue.add(off);
if (off.nextPosition()) {// go to first position
offsetsEnumQueue.add(off);
}
}
offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination
offsetsEnumQueue.add(new OffsetsEnum.OfPostings(new BytesRef(), EMPTY)); // a sentinel for termination
PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
if (left.getScore() < right.getScore()) {
@ -203,10 +206,9 @@ public class FieldHighlighter {
assert term != null;
passage.addMatch(start, end, term);
// see if there are multiple occurrences of this term in this passage. If so, add them.
if (!off.hasMorePositions()) {
if (!off.nextPosition()) {
break; // No more in the entire text. Already removed from pq; move on
}
off.nextPosition();
start = off.startOffset();
end = off.endOffset();
if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
@ -222,7 +224,7 @@ public class FieldHighlighter {
p.sort();
}
// sort in ascending order
Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset));
return passages;
}
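The hunk above also reflects the changed OffsetsEnum iteration contract: nextPosition() now advances and reports whether a position exists, replacing the old hasMorePositions()/nextPosition() pair. A hedged sketch (hypothetical class) of a consumer driving that contract, mirroring what highlightOffsetsEnums does with its priority queue:

```java
import java.io.IOException;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.lucene.search.uhighlight.OffsetsEnum;

class OffsetsEnumMergeSketch {
  void consume(List<OffsetsEnum> enums) throws IOException {
    // natural ordering (compareTo) sorts by the current start offset
    PriorityQueue<OffsetsEnum> queue = new PriorityQueue<>();
    for (OffsetsEnum oe : enums) {
      if (oe.nextPosition()) { // position on the first occurrence
        queue.add(oe);
      } else {
        oe.close(); // empty enum; never enters the queue
      }
    }
    OffsetsEnum oe;
    while ((oe = queue.poll()) != null) {
      // consume oe.getTerm(), oe.startOffset(), oe.endOffset() here ...
      if (oe.nextPosition()) { // re-queue only while occurrences remain
        queue.add(oe);
      } else {
        oe.close();
      }
    }
  }
}
```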


@ -20,14 +20,12 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@ -41,9 +39,9 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
public abstract class FieldOffsetStrategy {
protected final String field;
protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
protected final BytesRef[] terms; // Query: free-standing terms
protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
protected final PhraseHelper phraseHelper; // Query: position-sensitive information
protected final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
protected final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
this.field = field;
@ -70,47 +68,50 @@ public abstract class FieldOffsetStrategy {
return Collections.emptyList();
}
// For strict positions, get a Map of term to Spans:
// note: ScriptPhraseHelper.NONE does the right thing for these method calls
final Map<BytesRef, Spans> strictPhrasesTermToSpans =
phraseHelper.getTermToSpans(leafReader, doc);
// Usually simply wraps terms in a List; but if willRewrite() then can be expanded
final List<BytesRef> sourceTerms =
phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
final List<OffsetsEnum> offsetsEnums = new ArrayList<>(terms.length + automata.length);
final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
// Handle position insensitive terms (a subset of this.terms field):
final BytesRef[] insensitiveTerms;
if (phraseHelper.hasPositionSensitivity()) {
insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms();
assert insensitiveTerms.length <= terms.length : "insensitive terms should be smaller set of all terms";
} else {
insensitiveTerms = terms;
}
if (insensitiveTerms.length > 0) {
createOffsetsEnumsForTerms(insensitiveTerms, termsIndex, doc, offsetsEnums);
}
// Handle sourceTerms:
if (!sourceTerms.isEmpty()) {
TermsEnum termsEnum = termsIndex.iterator();//does not return null
for (BytesRef term : sourceTerms) {
if (termsEnum.seekExact(term)) {
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (postingsEnum == null) {
// no offsets or positions available
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
}
if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
if (postingsEnum != null) {
offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
}
}
}
}
// Handle spans
if (phraseHelper.hasPositionSensitivity()) {
phraseHelper.createOffsetsEnumsForSpans(leafReader, doc, offsetsEnums);
}
// Handle automata
if (automata.length > 0) {
offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
createOffsetsEnumsForAutomata(termsIndex, doc, offsetsEnums);
}
return offsetsEnums;
}
protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
TermsEnum termsEnum = termsIndex.iterator();//does not return null
for (BytesRef term : sourceTerms) {
if (termsEnum.seekExact(term)) {
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (postingsEnum == null) {
// no offsets or positions available
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
}
if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
results.add(new OffsetsEnum.OfPostings(term, postingsEnum));
}
}
}
}
protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
for (int i = 0; i < automata.length; i++) {
automataPostings.add(new ArrayList<>());
@ -118,6 +119,7 @@ public abstract class FieldOffsetStrategy {
TermsEnum termsEnum = termsIndex.iterator();
BytesRef term;
CharsRefBuilder refBuilder = new CharsRefBuilder();
while ((term = termsEnum.next()) != null) {
for (int i = 0; i < automata.length; i++) {
@ -132,7 +134,6 @@ public abstract class FieldOffsetStrategy {
}
}
List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i];
List<PostingsEnum> postingsEnums = automataPostings.get(i);
@ -140,14 +141,13 @@ public abstract class FieldOffsetStrategy {
if (size > 0) { //only add if we have offsets
BytesRef wildcardTerm = new BytesRef(automaton.toString());
if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
results.add(new OffsetsEnum.OfPostings(wildcardTerm, postingsEnums.get(0)));
} else {
offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
results.add(new OffsetsEnum.OfPostings(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
}
}
}
return offsetsEnums;
}
}
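Note that createOffsetsEnumsForTerms above throws IllegalArgumentException when the field was indexed without offsets. For reference, a hedged sketch of the standard indexing setup that satisfies it (field name and text are illustrative):

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;

class OffsetsIndexingSketch {
  static Document docWithOffsets(String text) {
    FieldType fieldType = new FieldType(TextField.TYPE_STORED);
    // offsets stored in the postings are what PostingsEnum.OFFSETS reads back
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.freeze();
    Document doc = new Document();
    doc.add(new Field("body", text, fieldType));
    return doc;
  }
}
```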


@ -14,6 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.Closeable;
@ -25,25 +26,19 @@ import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
/**
* Holds the term ({@link BytesRef}), {@link PostingsEnum}, offset iteration tracking.
* It is advanced with the underlying postings and is placed in a priority queue by
* An enumeration/iterator of a term and its offsets for use by {@link FieldHighlighter}.
* It is advanced and is placed in a priority queue by
* {@link FieldHighlighter#highlightOffsetsEnums(List)} based on the start offset.
*
* @lucene.internal
*/
public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
private final BytesRef term;
private final PostingsEnum postingsEnum; // with offsets
public abstract class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
private float weight; // set once in highlightOffsetsEnums
private int posCounter = 0; // the occurrence counter of this term within the text being highlighted.
public OffsetsEnum(BytesRef term, PostingsEnum postingsEnum) throws IOException {
this.term = term; // can be null
this.postingsEnum = Objects.requireNonNull(postingsEnum);
}
// note: the ordering clearly changes as the postings enum advances
// note: would be neat to use some Comparator utilities with method
// references but our methods throw IOException
@Override
public int compareTo(OffsetsEnum other) {
try {
@ -51,53 +46,41 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
if (cmp != 0) {
return cmp; // vast majority of the time we return here.
}
if (this.term == null || other.term == null) {
if (this.term == null && other.term == null) {
final BytesRef thisTerm = this.getTerm();
final BytesRef otherTerm = other.getTerm();
if (thisTerm == null || otherTerm == null) {
if (thisTerm == null && otherTerm == null) {
return 0;
} else if (this.term == null) {
} else if (thisTerm == null) {
return 1; // put "this" (wildcard mtq enum) last
} else {
return -1;
}
}
return term.compareTo(other.term);
return thisTerm.compareTo(otherTerm);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/** The term at this position; usually always the same. This term is a reference that is safe to continue to refer to,
* even after we move to next position. */
public BytesRef getTerm() throws IOException {
// TODO TokenStreamOffsetStrategy could override OffsetsEnum; then remove this hack here
return term != null ? term : postingsEnum.getPayload(); // abusing payload like this is a total hack!
}
/**
* Advances to the next position and returns true, or if can't then returns false.
* Note that the initial state of this class is not positioned.
*/
public abstract boolean nextPosition() throws IOException;
public PostingsEnum getPostingsEnum() {
return postingsEnum;
}
/** An estimate of the number of occurrences of this term/OffsetsEnum. */
public abstract int freq() throws IOException;
public int freq() throws IOException {
return postingsEnum.freq();
}
/**
* The term at this position; usually always the same.
* This BytesRef is safe to continue to refer to, even after we move to the next position.
*/
public abstract BytesRef getTerm() throws IOException;
public boolean hasMorePositions() throws IOException {
return posCounter < postingsEnum.freq();
}
public abstract int startOffset() throws IOException;
public void nextPosition() throws IOException {
assert hasMorePositions();
posCounter++;
postingsEnum.nextPosition();
}
public int startOffset() throws IOException {
return postingsEnum.startOffset();
}
public int endOffset() throws IOException {
return postingsEnum.endOffset();
}
public abstract int endOffset() throws IOException;
public float getWeight() {
return weight;
@ -109,9 +92,66 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
@Override
public void close() throws IOException {
// TODO TokenStreamOffsetStrategy could override OffsetsEnum; then this base impl would be no-op.
if (postingsEnum instanceof Closeable) {
((Closeable) postingsEnum).close();
}
@Override
public String toString() {
final String name = getClass().getSimpleName();
try {
return name + "(term:" + getTerm().utf8ToString() +")";
} catch (Exception e) {
return name;
}
}
/**
* Based on a {@link PostingsEnum} -- the typical/standard OE impl.
*/
public static class OfPostings extends OffsetsEnum {
private final BytesRef term;
private final PostingsEnum postingsEnum; // with offsets
private int posCounter = 0; // the occurrence counter of this term within the text being highlighted.
public OfPostings(BytesRef term, PostingsEnum postingsEnum) throws IOException {
this.term = Objects.requireNonNull(term);
this.postingsEnum = Objects.requireNonNull(postingsEnum);
}
public PostingsEnum getPostingsEnum() {
return postingsEnum;
}
@Override
public boolean nextPosition() throws IOException {
if (posCounter < postingsEnum.freq()) {
posCounter++;
postingsEnum.nextPosition(); // note: we don't need to save the position
return true;
} else {
return false;
}
}
@Override
public int freq() throws IOException {
return postingsEnum.freq();
}
@Override
public BytesRef getTerm() throws IOException {
return term;
}
@Override
public int startOffset() throws IOException {
return postingsEnum.startOffset();
}
@Override
public int endOffset() throws IOException {
return postingsEnum.endOffset();
}
}
}
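To illustrate the new abstract contract, a hedged sketch (hypothetical class, not in the patch) of an OffsetsEnum over pre-computed arrays; PhraseHelper's SpanCollectedOffsetsEnum further below follows the same shape:

```java
import org.apache.lucene.search.uhighlight.OffsetsEnum;
import org.apache.lucene.util.BytesRef;

class ArrayOffsetsEnumSketch extends OffsetsEnum {
  private final BytesRef term;
  private final int[] starts, ends; // parallel arrays, sorted by start offset
  private int idx = -1;             // un-positioned until the first nextPosition()

  ArrayOffsetsEnumSketch(BytesRef term, int[] starts, int[] ends) {
    this.term = term;
    this.starts = starts;
    this.ends = ends;
  }

  @Override public boolean nextPosition() { return ++idx < starts.length; }
  @Override public int freq()             { return starts.length; }
  @Override public BytesRef getTerm()     { return term; }
  @Override public int startOffset()      { return starts[idx]; }
  @Override public int endOffset()        { return ends[idx]; }
}
```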


@ -98,6 +98,24 @@ public class Passage {
numMatches = 0;
}
/** For debugging. ex: Passage[0-22]{yin[0-3],yang[4-8],yin[10-13]}score=2.4964213 */
@Override
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("Passage[").append(startOffset).append('-').append(endOffset).append(']');
buf.append('{');
for (int i = 0; i < numMatches; i++) {
if (i != 0) {
buf.append(',');
}
buf.append(matchTerms[i].utf8ToString());
buf.append('[').append(matchStarts[i] - startOffset).append('-').append(matchEnds[i] - startOffset).append(']');
}
buf.append('}');
buf.append("score=").append(score);
return buf.toString();
}
/**
* Start offset of this passage.
*


@ -17,82 +17,58 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/**
* Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlight phrases correctly).
* Helps the {@link FieldOffsetStrategy} with position sensitive queries (e.g. highlight phrases correctly).
* This is a stateful class holding information about the query, but it can (and is) re-used across highlighting
* documents. Despite this state; it's immutable after construction. The approach taken in this class is very similar
* to the standard Highlighter's {@link WeightedSpanTermExtractor} which is in fact re-used here. However, we ought to
* completely rewrite it to use the SpanCollector interface to collect offsets directly. We'll get better
* phrase accuracy.
* documents. Despite this state, it's immutable after construction.
*
* @lucene.internal
*/
// TODO rename to SpanHighlighting ?
public class PhraseHelper {
public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
(s) -> false, spanQuery -> null, query -> null, true);
//TODO it seems this ought to be a general thing on Spans?
private static final Comparator<? super Spans> SPANS_COMPARATOR = (o1, o2) -> {
int cmp = Integer.compare(o1.docID(), o2.docID());
if (cmp != 0) {
return cmp;
}
if (o1.docID() == DocIdSetIterator.NO_MORE_DOCS) {
return 0; // don't ask for start/end position; not sure if we can even call those methods
}
cmp = Integer.compare(o1.startPosition(), o2.startPosition());
if (cmp != 0) {
return cmp;
} else {
return Integer.compare(o1.endPosition(), o2.endPosition());
}
};
private final String fieldName;
private final Set<Term> positionInsensitiveTerms; // (TermQuery terms)
private final Set<BytesRef> positionInsensitiveTerms; // (TermQuery terms)
private final Set<SpanQuery> spanQueries;
private final boolean willRewrite;
private final Predicate<String> fieldMatcher;
@ -114,13 +90,27 @@ public class PhraseHelper {
this.fieldName = field;
this.fieldMatcher = fieldMatcher;
// filter terms to those we want
positionInsensitiveTerms = new FieldFilteringTermSet();
positionInsensitiveTerms = new HashSet<>();
spanQueries = new HashSet<>();
// TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls
boolean[] mustRewriteHolder = {false}; // boolean wrapped in 1-ary array so it's mutable from inner class
// When we call Weight.extractTerms, we do it on clauses that are NOT position sensitive.
// We only want to track a Set of bytes for the Term, not the Term class with its field part.
Set<Term> extractPosInsensitiveTermsTarget = new TreeSet<Term>() {
@Override
public boolean add(Term term) {
// don't call super.add; we don't actually use the superclass
if (fieldMatcher.test(term.field())) {
return positionInsensitiveTerms.add(term.bytes());
} else {
return false;
}
}
};
// For TermQueries or other position insensitive queries, collect the Terms.
// For other Query types, WSTE will convert to an equivalent SpanQuery. NOT extracting position spans here.
new WeightedSpanTermExtractor(field) {
@ -155,13 +145,15 @@ public class PhraseHelper {
return true; //TODO set to false and provide a hook to customize certain queries.
}
// called on Query types that are NOT position sensitive, e.g. TermQuery
@Override
protected void extractWeightedTerms(Map<String, WeightedSpanTerm> terms, Query query, float boost)
throws IOException {
query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, ScoreMode.COMPLETE_NO_SCORES, boost)
.extractTerms(positionInsensitiveTerms);
.extractTerms(extractPosInsensitiveTermsTarget);
}
// called on SpanQueries. Some other position-sensitive queries like PhraseQuery are converted beforehand
@Override
protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
float boost) throws IOException {
@ -174,7 +166,6 @@ public class PhraseHelper {
}
}
// TODO allow users to override the answer to mustRewriteQuery
boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
if (ignoreQueriesNeedingRewrite && mustRewriteQuery) {
return;// ignore this query
@ -194,14 +185,14 @@ public class PhraseHelper {
willRewrite = mustRewriteHolder[0];
}
Set<SpanQuery> getSpanQueries() {
public Set<SpanQuery> getSpanQueries() {
return spanQueries;
}
/**
* If there is no position sensitivity then use of the instance of this class can be ignored.
*/
boolean hasPositionSensitivity() {
public boolean hasPositionSensitivity() {
return spanQueries.isEmpty() == false;
}
@ -210,335 +201,85 @@ public class PhraseHelper {
* custom things. When true, the resulting term list will probably be different than what it was known
* to be initially.
*/
boolean willRewrite() {
public boolean willRewrite() {
return willRewrite;
}
/**
* Collect a list of pre-positioned {@link Spans} for each term, given a reader that has just one document.
It returns no mapping for query terms that occur in a position insensitive way and which therefore don't
* need to be filtered.
*/
Map<BytesRef, Spans> getTermToSpans(LeafReader leafReader, int doc)
throws IOException {
if (spanQueries.isEmpty()) {
return Collections.emptyMap();
}
final LeafReader filteredReader = new SingleFieldFilterLeafReader(leafReader, fieldName);
// for each SpanQuery, collect the member spans into a map.
Map<BytesRef, Spans> result = new HashMap<>();
for (SpanQuery spanQuery : spanQueries) {
getTermToSpans(spanQuery, filteredReader.getContext(), doc, result);
}
/** Returns the terms that are position-insensitive (sorted). */
public BytesRef[] getAllPositionInsensitiveTerms() {
BytesRef[] result = positionInsensitiveTerms.toArray(new BytesRef[positionInsensitiveTerms.size()]);
Arrays.sort(result);
return result;
}
// code extracted & refactored from WSTE.extractWeightedSpanTerms()
private void getTermToSpans(SpanQuery spanQuery, LeafReaderContext readerContext,
int doc, Map<BytesRef, Spans> result)
throws IOException {
// note: in WSTE there was some field specific looping that seemed pointless so that isn't here.
final IndexSearcher searcher = new IndexSearcher(readerContext.reader());
/** Given the internal SpanQueries, produce a number of OffsetsEnum into the {@code results} param. */
public void createOffsetsEnumsForSpans(LeafReader leafReader, int docId, List<OffsetsEnum> results) throws IOException {
leafReader = new SingleFieldWithOffsetsFilterLeafReader(leafReader, fieldName);
//TODO avoid searcher and do what it does to rewrite & get weight?
IndexSearcher searcher = new IndexSearcher(leafReader);
searcher.setQueryCache(null);
if (willRewrite) {
spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done
}
// Get the underlying query terms
TreeSet<Term> termSet = new FieldFilteringTermSet(); // sorted so we can loop over results in order shortly...
searcher.createWeight(spanQuery, ScoreMode.COMPLETE_NO_SCORES, 1.0f).extractTerms(termSet);//needsScores==false
// Get Spans by running the query against the reader
// TODO it might make sense to re-use/cache the Spans instance, to advance forward between docs
SpanWeight spanWeight = (SpanWeight) searcher.createNormalizedWeight(spanQuery, ScoreMode.COMPLETE_NO_SCORES);
Spans spans = spanWeight.getSpans(readerContext, SpanWeight.Postings.POSITIONS);
if (spans == null) {
return;
}
TwoPhaseIterator twoPhaseIterator = spans.asTwoPhaseIterator();
if (twoPhaseIterator != null) {
if (twoPhaseIterator.approximation().advance(doc) != doc || !twoPhaseIterator.matches()) {
return;
}
} else if (spans.advance(doc) != doc) { // preposition, and return doing nothing if find none
return;
}
// Consume the Spans into a cache. This instance is used as a source for multiple cloned copies.
// It's important we do this and not re-use the same original Spans instance since these will be iterated
// independently later on; sometimes in ways that prevents sharing the original Spans.
CachedSpans cachedSpansSource = new CachedSpans(spans); // consumes spans for this doc only and caches
spans = null;// we don't use it below
// Map terms to a Spans instance (aggregate if necessary)
for (final Term queryTerm : termSet) {
// note: we expect that at least one query term will pass these filters. This is because the collected
// spanQuery list was already filtered by these conditions.
if (positionInsensitiveTerms.contains(queryTerm)) {
continue;
}
// copy-constructor refers to same data (shallow) but has iteration state from the beginning
CachedSpans cachedSpans = new CachedSpans(cachedSpansSource);
// Add the span to whatever span may or may not exist
Spans existingSpans = result.get(queryTerm.bytes());
if (existingSpans != null) {
if (existingSpans instanceof MultiSpans) {
((MultiSpans) existingSpans).addSpans(cachedSpans);
} else { // upgrade to MultiSpans
MultiSpans multiSpans = new MultiSpans();
multiSpans.addSpans(existingSpans);
multiSpans.addSpans(cachedSpans);
result.put(queryTerm.bytes(), multiSpans);
}
} else {
result.put(queryTerm.bytes(), cachedSpans);
}
}
}
/**
Returns terms as a List, but expanded to any terms in phraseHelper's keySet if present. That can only
* happen if willRewrite() is true.
*/
List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {
if (willRewrite()) {
Set<BytesRef> allTermSet = new LinkedHashSet<>(terms.length + strictPhrasesTermToSpans.size());
Collections.addAll(allTermSet, terms);//FYI already sorted; will keep order
if (allTermSet.addAll(strictPhrasesTermToSpans.keySet())) { // true if any were added
List<BytesRef> sourceTerms = Arrays.asList(allTermSet.toArray(new BytesRef[allTermSet.size()]));
sourceTerms.sort(Comparator.naturalOrder());
return sourceTerms;
}
}
return Arrays.asList(terms); // no rewrite; use original terms
}
/**
* Returns a filtered postings where the position must be in the given Spans.
* The Spans must be in a positioned state (not initial) and should not be shared between other terms.
* {@code postingsEnum} should be positioned at the
* document (the same one as the spans) but it hasn't iterated the positions yet.
* The Spans should be the result of a simple
* lookup from {@link #getTermToSpans(LeafReader, int)}, and so it could be null which could mean
* either it's completely filtered or that there should be no filtering; this class knows what to do.
* <p>
* Due to limitations in filtering, the {@link PostingsEnum#freq()} is un-changed even if some positions
get filtered. So when {@link PostingsEnum#nextPosition()}, {@code startOffset}, or {@code endOffset} are
called beyond the "real" positions, these methods return {@link Integer#MAX_VALUE}.
* <p>
* <b>This will return null if it's completely filtered out (i.e. effectively has no postings).</b>
*/
PostingsEnum filterPostings(BytesRef term, PostingsEnum postingsEnum, Spans spans)
throws IOException {
if (spans == null) {
if (hasPositionSensitivity() == false || positionInsensitiveTerms.contains(new Term(fieldName, term))) {
return postingsEnum; // no filtering
} else {
return null; // completely filtered out
}
}
if (postingsEnum.docID() != spans.docID()) {
throw new IllegalStateException("Spans & Postings doc ID misaligned or not positioned");
}
return new FilterLeafReader.FilterPostingsEnum(postingsEnum) {
// freq() is max times nextPosition can be called. We'll set this var to -1 when exhausted.
int remainingPositions = postingsEnum.freq();
// for each SpanQuery, grab its Spans and put it into a PriorityQueue
PriorityQueue<Spans> spansPriorityQueue = new PriorityQueue<Spans>(spanQueries.size()) {
@Override
public String toString() {
String where;
try {
where = "[" + startOffset() + ":" + endOffset() + "]";
} catch (IOException e) {
where = "[" + e + "]";
}
return "'" + term.utf8ToString() + "'@" + where + " filtered by " + spans;
}
@Override
public int nextDoc() throws IOException {
throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
}
@Override
public int advance(int target) throws IOException {
throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
}
@Override
public int nextPosition() throws IOException {
// loop over posting positions...
NEXT_POS_LOOP:
while (remainingPositions > 0) {
final int thisPos = super.nextPosition();
remainingPositions--;
// loop spans forward (if necessary) while the span end is behind thisPos
while (spans.endPosition() <= thisPos) {
if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) { // advance
break NEXT_POS_LOOP;
}
assert spans.docID() == postingsEnum.docID();
}
// is this position within the span?
if (thisPos >= spans.startPosition()) {
assert thisPos < spans.endPosition(); // guaranteed by previous loop
return thisPos; // yay!
}
// else continue and try the next position
}
remainingPositions = -1; // signify done
return Integer.MAX_VALUE;
}
@Override
public int startOffset() throws IOException {
return remainingPositions >= 0 ? super.startOffset() : Integer.MAX_VALUE;
}
@Override
public int endOffset() throws IOException {
return remainingPositions >= 0 ? super.endOffset() : Integer.MAX_VALUE;
protected boolean lessThan(Spans a, Spans b) {
return a.startPosition() <= b.startPosition();
}
};
}
/**
* Simple TreeSet that filters out Terms not matching the provided predicate on {@code add()}.
*/
private class FieldFilteringTermSet extends TreeSet<Term> {
@Override
public boolean add(Term term) {
if (fieldMatcher.test(term.field())) {
if (term.field().equals(fieldName)) {
return super.add(term);
} else {
return super.add(new Term(fieldName, term.bytes()));
for (Query query : spanQueries) {
Weight weight = searcher.createNormalizedWeight(query, ScoreMode.COMPLETE_NO_SCORES);
Scorer scorer = weight.scorer(leafReader.getContext());
if (scorer == null) {
continue;
}
TwoPhaseIterator twoPhaseIterator = scorer.twoPhaseIterator();
if (twoPhaseIterator != null) {
if (twoPhaseIterator.approximation().advance(docId) != docId || !twoPhaseIterator.matches()) {
continue;
}
} else if (scorer.iterator().advance(docId) != docId) { // preposition, and return doing nothing if find none
continue;
}
Spans spans = ((SpanScorer) scorer).getSpans();
assert spans.docID() == docId;
if (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
spansPriorityQueue.add(spans);
}
}
// Iterate the Spans in the PriorityQueue, collecting as we go. By using a PriorityQueue ordered by position,
// the underlying offsets in our collector will be mostly appended to the end of arrays (efficient).
// note: alternatively it'd be interesting if we produced one OffsetsEnum that internally advanced
// this PriorityQueue when nextPosition is called; it would cap what we have to cache for large docs and
// exiting early (due to maxLen) is easy.
// But at least we have an accurate "freq" and it shouldn't be too much data to collect. Even SpanScorer
// navigates the spans fully to compute a good freq (and thus score)!
OffsetSpanCollector spanCollector = new OffsetSpanCollector();
while (spansPriorityQueue.size() > 0) {
Spans spans = spansPriorityQueue.top();
//TODO limit to a capped endOffset length somehow so we can break this loop early
spans.collect(spanCollector);
if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) {
spansPriorityQueue.pop();
} else {
return false;
spansPriorityQueue.updateTop();
}
}
results.addAll(spanCollector.termToOffsetsEnums.values());
}
/**
* A single {@link Spans} view over multiple spans. At least one span is mandatory, but you should probably
* supply more than one. Furthermore, the given spans are expected to be positioned to a document already
(via a call to next or advance).
*/ // TODO move to Lucene core as a Spans utility class?
static class MultiSpans extends Spans {
final PriorityQueue<Spans> spansQueue = new PriorityQueue<>(SPANS_COMPARATOR);
long cost;
void addSpans(Spans spans) {
if (spans.docID() < 0 || spans.docID() == NO_MORE_DOCS) {
throw new IllegalArgumentException("Expecting given spans to be in a positioned state.");
}
spansQueue.add(spans);
cost = Math.max(cost, spans.cost());
}
// DocIdSetIterator methods:
@Override
public int nextDoc() throws IOException {
if (spansQueue.isEmpty()) {
return NO_MORE_DOCS;
}
return advance(spansQueue.peek().docID() + 1);
}
@Override
public int advance(int target) throws IOException {
if (spansQueue.isEmpty()) {
return NO_MORE_DOCS;
}
while (true) {
Spans spans = spansQueue.peek();
if (spans.docID() >= target) {
return spans.docID();
}
spansQueue.remove(); // must remove before modify state
if (spans.advance(target) != NO_MORE_DOCS) { // ... otherwise it's not re-added
spansQueue.add(spans);
} else if (spansQueue.isEmpty()) {
return NO_MORE_DOCS;
}
}
}
@Override
public int docID() {
if (spansQueue.isEmpty()) {
return NO_MORE_DOCS;
}
return spansQueue.peek().docID();
}
@Override
public long cost() {
return cost;
}
// Spans methods:
@Override
public int nextStartPosition() throws IOException {
// advance any spans at the initial position per document
boolean atDocStart = false;
while (spansQueue.peek().startPosition() == -1) {
atDocStart = true;
Spans headSpans = spansQueue.remove(); // remove because we will change state
headSpans.nextStartPosition();
spansQueue.add(headSpans);
}
if (!atDocStart) {
Spans headSpans = spansQueue.remove(); // remove because we will change state
headSpans.nextStartPosition();
spansQueue.add(headSpans);
}
return startPosition();
}
@Override
public int startPosition() {
return spansQueue.peek().startPosition();
}
@Override
public int endPosition() {
return spansQueue.peek().endPosition();
}
@Override
public int width() {
return spansQueue.peek().width();
}
@Override
public void collect(SpanCollector collector) throws IOException {
spansQueue.peek().collect(collector);
}
@Override
public float positionsCost() {
return 100f;// no idea; and we can't delegate due to not allowing to call it dependent on TwoPhaseIterator
}
}
//TODO move up; it's currently inbetween other inner classes that are related
/**
* Needed to support the ability to highlight a query irrespective of the field a query refers to
* (aka requireFieldMatch=false).
* This reader will just delegate every call to a single field in the wrapped
* LeafReader. This way we ensure that all queries going through this reader target the same field.
*/
static final class SingleFieldFilterLeafReader extends FilterLeafReader {
private static final class SingleFieldWithOffsetsFilterLeafReader extends FilterLeafReader {
final String fieldName;
SingleFieldFilterLeafReader(LeafReader in, String fieldName) {
SingleFieldWithOffsetsFilterLeafReader(LeafReader in, String fieldName) {
super(in);
this.fieldName = fieldName;
}
@ -550,22 +291,18 @@ public class PhraseHelper {
@Override
public Terms terms(String field) throws IOException {
return super.terms(fieldName);
}
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
return super.getNumericDocValues(fieldName);
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
return super.getBinaryDocValues(fieldName);
}
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
return super.getSortedDocValues(fieldName);
// ensure the underlying PostingsEnum returns offsets. It's sad we have to do this to use the SpanCollector.
return new FilterTerms(super.terms(fieldName)) {
@Override
public TermsEnum iterator() throws IOException {
return new FilterTermsEnum(in.iterator()) {
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
return super.postings(reuse, flags | PostingsEnum.OFFSETS);
}
};
}
};
}
@Override
@ -584,99 +321,102 @@ public class PhraseHelper {
}
}
private class OffsetSpanCollector implements SpanCollector {
Map<BytesRef, SpanCollectedOffsetsEnum> termToOffsetsEnums = new HashMap<>();
/**
* A Spans based on a list of cached spans for one doc. It is pre-positioned to this doc.
*/
private static class CachedSpans extends Spans {
private static class CachedSpan {
final int start;
final int end;
CachedSpan(int start, int end) {
this.start = start;
this.end = end;
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
if (!fieldMatcher.test(term.field())) {
return;
}
}
final int docId;
final ArrayList<CachedSpan> cachedSpanList;
int index = -1;
CachedSpans(Spans spans) throws IOException {
this.docId = spans.docID();
assert this.docId != -1;
// Consume the spans for this doc into a list. There's always at least one; the first/current one.
cachedSpanList = new ArrayList<>();
while (spans.nextStartPosition() != NO_MORE_POSITIONS) {
cachedSpanList.add(new CachedSpan(spans.startPosition(), spans.endPosition()));
SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
if (offsetsEnum == null) {
// If it's pos insensitive we handle it outside of PhraseHelper. term.field() is from the Query.
if (positionInsensitiveTerms.contains(term.bytes())) {
return;
}
offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
termToOffsetsEnums.put(term.bytes(), offsetsEnum);
}
assert !cachedSpanList.isEmpty(); // bad Span impl?
}
/**
* Clone; reset iteration state.
*/
CachedSpans(CachedSpans cloneMe) {
docId = cloneMe.docId;
cachedSpanList = cloneMe.cachedSpanList;
offsetsEnum.add(postings.startOffset(), postings.endOffset());
}
@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException("Not expected");
public void reset() { // called when at a new position. We don't care.
}
}
private static class SpanCollectedOffsetsEnum extends OffsetsEnum {
// TODO perhaps optionally collect (and expose) payloads?
private final BytesRef term;
private final int[] startOffsets;
private final int[] endOffsets;
private int numPairs = 0;
private int enumIdx = -1;
private SpanCollectedOffsetsEnum(BytesRef term, int postingsFreq) {
this.term = term;
this.startOffsets = new int[postingsFreq]; // hopefully not wasteful? At least we needn't resize it.
this.endOffsets = new int[postingsFreq];
}
// called from collector before it's navigated
void add(int startOffset, int endOffset) {
assert enumIdx == -1 : "bad state";
// loop backwards since we expect a match at the end or close to it. We expect O(1) not O(N).
int pairIdx = numPairs - 1;
for (; pairIdx >= 0; pairIdx--) {
int iStartOffset = startOffsets[pairIdx];
int iEndOffset = endOffsets[pairIdx];
int cmp = Integer.compare(iStartOffset, startOffset);
if (cmp == 0) {
cmp = Integer.compare(iEndOffset, endOffset);
}
if (cmp == 0) {
return; // we already have this offset-pair for this term
} else if (cmp < 0) {
break; //we will insert offsetPair to the right of pairIdx
}
}
// pairIdx is now one position to the left of where we insert the new pair
// shift right any pairs by one to make room
final int shiftLen = numPairs - (pairIdx + 1);
if (shiftLen > 0) {
System.arraycopy(startOffsets, pairIdx + 1, startOffsets, pairIdx + 2, shiftLen);
System.arraycopy(endOffsets, pairIdx + 1, endOffsets, pairIdx + 2, shiftLen);
}
// now we can place the offset pair
startOffsets[pairIdx + 1] = startOffset;
endOffsets[pairIdx + 1] = endOffset;
numPairs++;
}
@Override
public int advance(int target) throws IOException {
throw new UnsupportedOperationException("Not expected");
public boolean nextPosition() throws IOException {
return ++enumIdx < numPairs;
}
@Override
public int docID() {
return docId;
public int freq() throws IOException {
return startOffsets.length;
}
@Override
public long cost() {
return 1;
public BytesRef getTerm() throws IOException {
return term;
}
@Override
public int nextStartPosition() throws IOException {
index++;
return startPosition();
public int startOffset() throws IOException {
return startOffsets[enumIdx];
}
@Override
public int startPosition() {
return index < 0 ?
-1 : index >= cachedSpanList.size() ?
NO_MORE_POSITIONS : cachedSpanList.get(index).start;
public int endOffset() throws IOException {
return endOffsets[enumIdx];
}
}
@Override
public int endPosition() {
return index < 0 ?
-1 : index >= cachedSpanList.size() ?
NO_MORE_POSITIONS : cachedSpanList.get(index).end;
}
@Override
public int width() {
return endPosition() - startPosition();
}
@Override
public void collect(SpanCollector collector) throws IOException {
throw new UnsupportedOperationException("Not expected");
}
@Override
public float positionsCost() {
return 1f;
}
} // class CachedSpans
}
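The replacement code above hinges on Lucene's SpanCollector callback: Spans.collect(collector) pushes each leaf term position of the current span match into the collector, where accurate start/end offsets are read straight off the postings. A hedged, stand-alone sketch of the idea (class name hypothetical; the real OffsetSpanCollector above also de-duplicates offsets and routes them per term):

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanCollector;

class OffsetRecordingCollectorSketch implements SpanCollector {
  // start/end offsets of every term occurrence participating in a span match;
  // requires postings opened with PostingsEnum.OFFSETS, which is what
  // SingleFieldWithOffsetsFilterLeafReader forces above.
  final List<int[]> offsets = new ArrayList<>();

  @Override
  public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
    offsets.add(new int[]{postings.startOffset(), postings.endOffset()});
  }

  @Override
  public void reset() {
    // called before collecting each new span match; this sketch keeps accumulating
  }
}
```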


@ -16,7 +16,6 @@
*/
package org.apache.lucene.search.uhighlight;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
@ -26,7 +25,6 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@ -63,29 +61,20 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
TokenStream tokenStream = tokenStream(content);
PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
mtqPostingsEnum.advance(docId);
return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
return Collections.singletonList(new TokenStreamOffsetsEnum(tokenStream(content), automata));
}
// See class javadocs.
// TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl? See TODOs in OffsetsEnum.
private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
private static class TokenStreamOffsetsEnum extends OffsetsEnum {
TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers;
final CharTermAttribute charTermAtt;
final OffsetAttribute offsetAtt;
int currentDoc = -1;
int currentMatch = -1;
int currentStartOffset = -1;
int currentEndOffset = -1;
final BytesRef matchDescriptions[];
TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
this.stream = ts;
this.matchers = matchers;
matchDescriptions = new BytesRef[matchers.length];
@ -95,15 +84,13 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
}
@Override
public int nextPosition() throws IOException {
public boolean nextPosition() throws IOException {
if (stream != null) {
while (stream.incrementToken()) {
for (int i = 0; i < matchers.length; i++) {
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
currentStartOffset = offsetAtt.startOffset();
currentEndOffset = offsetAtt.endOffset();
currentMatch = i;
return 0;
return true;
}
}
}
@ -111,8 +98,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
close();
}
// exhausted
currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
return Integer.MAX_VALUE;
return false;
}
@Override
@ -122,45 +108,23 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
@Override
public int startOffset() throws IOException {
assert currentStartOffset >= 0;
return currentStartOffset;
return offsetAtt.startOffset();
}
@Override
public int endOffset() throws IOException {
assert currentEndOffset >= 0;
return currentEndOffset;
return offsetAtt.endOffset();
}
// TOTAL HACK; used in OffsetsEnum.getTerm()
@Override
public BytesRef getPayload() throws IOException {
public BytesRef getTerm() throws IOException {
if (matchDescriptions[currentMatch] == null) {
// these CharRunAutomata are subclassed so that toString() returns the query
matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
}
return matchDescriptions[currentMatch];
}
@Override
public int docID() {
return currentDoc;
}
@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int advance(int target) throws IOException {
return currentDoc = target;
}
@Override
public long cost() {
return 0;
}
@Override
public void close() throws IOException {
if (stream != null) {


@ -23,11 +23,14 @@ import java.nio.charset.StandardCharsets;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
@ -49,6 +52,7 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.After;
@ -81,6 +85,36 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
dir.close();
}
static UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer) {
return randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.noneOf(HighlightFlag.class));
}
static UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer,
EnumSet<HighlightFlag> mandatoryFlags) {
if (random().nextBoolean()) {
return new UnifiedHighlighter(searcher, indexAnalyzer);
} else {
final UnifiedHighlighter uh = new UnifiedHighlighter(searcher, indexAnalyzer) {
@Override
protected Set<HighlightFlag> getFlags(String field) {
final EnumSet<HighlightFlag> result = EnumSet.copyOf(mandatoryFlags);
int r = random().nextInt();
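// each flag is enabled when its bit of r happens to be clear (~50% each), on top of mandatoryFlags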
for (HighlightFlag highlightFlag : HighlightFlag.values()) {
if (((1 << highlightFlag.ordinal()) & r) == 0) {
result.add(highlightFlag);
}
}
return result;
}
};
uh.setCacheFieldValCharsThreshold(random().nextInt(100));
if (random().nextBoolean()) {
uh.setFieldMatcher(f -> true); // requireFieldMatch==false
}
return uh;
}
}
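A hedged usage sketch of the helper (searcher and indexAnalyzer come from the test fixture): pinning a mandatory flag guarantees the randomized highlighter still exercises the code path under test.

```java
// always exercise the position-sensitive (SpanCollector) path; other flags stay random
UnifiedHighlighter highlighter =
    randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.PHRASES));
```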
//
// Tests below were ported from the PostingsHighlighter. Possibly augmented. Far below are newer tests.
//
@ -101,7 +135,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -167,7 +201,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(maxLength);
String snippets[] = highlighter.highlight("body", query, topDocs);
@ -191,7 +225,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
@ -219,7 +253,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -248,7 +282,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(value.length() * 2 + 1);
Query query = new TermQuery(new Term("body", "field"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
@ -281,7 +315,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "best")), BooleanClause.Occur.SHOULD)
@ -313,7 +347,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("body", "just")), BooleanClause.Occur.SHOULD)
@ -345,7 +379,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -382,7 +416,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
.build();
TopDocs topDocs = searcher.search(query, 10);
assertEquals(1, topDocs.totalHits);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setHighlightPhrasesStrictly(false);
String snippets[] = highlighter.highlight("body", query, topDocs, 2);
assertEquals(1, snippets.length);
@ -410,7 +444,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
.build();
TopDocs topDocs = searcher.search(query, 10);
assertEquals(1, topDocs.totalHits);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setHighlightPhrasesStrictly(false);
String snippets[] = highlighter.highlight("body", query, topDocs, 2);
assertEquals(1, snippets.length);
@ -438,7 +472,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
.build();
TopDocs topDocs = searcher.search(query, 10);
assertEquals(1, topDocs.totalHits);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(Integer.MAX_VALUE - 1);
String snippets[] = highlighter.highlight("body", query, topDocs, 2);
assertEquals(1, snippets.length);
@ -461,7 +495,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("body", "test"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
@ -494,7 +528,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
TopDocs topDocs = searcher.search(query, 10);
assertEquals(1, topDocs.totalHits);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(Integer.MAX_VALUE - 1);
String snippets[] = highlighter.highlight("body", query, topDocs, 2);
assertEquals(1, snippets.length);
@ -549,7 +583,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -623,7 +657,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[]{0};
String snippets[] = highlighter.highlightFields(new String[]{"body"}, query, docIDs, new int[]{2}).get("body");
@ -652,7 +686,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
Query query = new TermQuery(new Term("body", "highlighting"));
@ -683,7 +717,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxNoHighlightPassages(0);// don't want any default summary
Query query = new TermQuery(new Term("body", "highlighting"));
int[] docIDs = new int[]{0};
@ -743,7 +777,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new TermQuery(new Term("bogus", "highlighting"));
int[] docIDs = new int[]{0};
String snippets[] = highlighter.highlightFields(new String[]{"bogus"}, query, docIDs, new int[]{2}).get("bogus");
@ -769,7 +803,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
Query query = new TermQuery(new Term("body", "highlighting"));
@ -798,7 +832,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
Query query = new TermQuery(new Term("body", "highlighting"));
@ -834,7 +868,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setCacheFieldValCharsThreshold(random().nextInt(10) * 10);// 0 thru 90 intervals of 10
Query query = new TermQuery(new Term("body", "answer"));
TopDocs hits = searcher.search(query, numDocs);
@ -872,7 +906,7 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("body", "test")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "test")), BooleanClause.Occur.SHOULD)
@ -995,7 +1029,8 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
return (qf) -> true;
}
};
UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighterFieldMatch.setFieldMatcher(null);//default
BooleanQuery.Builder queryBuilder =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("text", "some")), BooleanClause.Occur.SHOULD)
@ -1078,7 +1113,8 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
return (qf) -> true;
}
};
UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.MULTI_TERM_QUERY));
highlighterFieldMatch.setFieldMatcher(null);//default
BooleanQuery.Builder queryBuilder =
new BooleanQuery.Builder()
.add(new FuzzyQuery(new Term("text", "sime"), 1), BooleanClause.Occur.SHOULD)
@ -1161,7 +1197,8 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
return (qf) -> true;
}
};
UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.PHRASES));
highlighterFieldMatch.setFieldMatcher(null);//default
BooleanQuery.Builder queryBuilder =
new BooleanQuery.Builder()
.add(new PhraseQuery("title", "this", "is", "the", "title"), BooleanClause.Occur.SHOULD)

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Objects;
@ -65,6 +66,7 @@ import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -150,6 +152,11 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
ir.close();
}
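// All the MTQ tests need multi-term highlighting, so always include
// HighlightFlag.MULTI_TERM_QUERY among the otherwise randomized flags.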
private UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer) {
return TestUnifiedHighlighter.randomUnifiedHighlighter(searcher, indexAnalyzer,
EnumSet.of(HighlightFlag.MULTI_TERM_QUERY));
}
public void testOnePrefix() throws Exception {
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
@ -166,7 +173,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
// wrap in a BoostQuery to also show we see inside it
Query query = new BoostQuery(new PrefixQuery(new Term("body", "te")), 2.0f);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
@ -177,6 +184,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
highlighter.setFieldMatcher(null);//default
BooleanQuery bq = new BooleanQuery.Builder()
.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
.add(new PrefixQuery(new Term("bogus", "te")), BooleanClause.Occur.SHOULD)
@ -207,7 +215,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new RegexpQuery(new Term("body", "te.*"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -217,6 +225,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
highlighter.setFieldMatcher(null);//default
BooleanQuery bq = new BooleanQuery.Builder()
.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
.add(new RegexpQuery(new Term("bogus", "te.*")), BooleanClause.Occur.SHOULD)
@ -247,7 +256,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new FuzzyQuery(new Term("body", "tets"), 1);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -266,6 +275,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
highlighter.setFieldMatcher(null);//default
BooleanQuery bq = new BooleanQuery.Builder()
.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
.add(new FuzzyQuery(new Term("bogus", "tets"), 1), BooleanClause.Occur.SHOULD)
@ -296,7 +306,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = TermRangeQuery.newStringRange("body", "ta", "tf", true, true);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -366,6 +376,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
assertEquals("Test a one sentence document.", snippets[1]);
// wrong field
highlighter.setFieldMatcher(null);//default
bq = new BooleanQuery.Builder()
.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
.add(TermRangeQuery.newStringRange("bogus", "ta", "tf", true, true), BooleanClause.Occur.SHOULD)
@ -396,7 +407,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery query = new BooleanQuery.Builder()
.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD)
.build();
@ -438,7 +449,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery query = new BooleanQuery.Builder()
.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("body", "test")), BooleanClause.Occur.FILTER)
@ -469,7 +480,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
ConstantScoreQuery query = new ConstantScoreQuery(new WildcardQuery(new Term("body", "te*")));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.totalHits);
@ -497,7 +508,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
DisjunctionMaxQuery query = new DisjunctionMaxQuery(
Collections.singleton(new WildcardQuery(new Term("body", "te*"))), 0);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
@ -526,7 +537,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
// wrap in a SpanBoostQuery to also show we see inside it
Query query = new SpanBoostQuery(
new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*"))), 2.0f);
@ -556,7 +567,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanOrQuery(new SpanQuery[]{childQuery});
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
@ -585,7 +596,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanNearQuery(new SpanQuery[]{childQuery, childQuery}, 0, false);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
@ -614,7 +625,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
SpanQuery include = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus"));
Query query = new SpanNotQuery(include, exclude);
@ -644,7 +655,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
SpanQuery childQuery = new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanFirstQuery(childQuery, 1000000);
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
@ -675,7 +686,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
// use a variety of common MTQ types
BooleanQuery query = new BooleanQuery.Builder()
.add(new PrefixQuery(new Term("body", "te")), BooleanClause.Occur.SHOULD)
@ -765,7 +776,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(25);//a little past first sentence
BooleanQuery query = new BooleanQuery.Builder()
@ -798,7 +809,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
highlighter.setMaxLength(32);//a little past first sentence
BooleanQuery query = new BooleanQuery.Builder()
@ -846,7 +857,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
};
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, buggyAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, buggyAnalyzer);
highlighter.setHandleMultiTermQuery(true);
if (rarely()) {
highlighter.setMaxLength(25);//a little past first sentence
@ -903,7 +914,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
Query query = new PrefixQuery(new Term("body", "nonexistent"));
@ -934,7 +945,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
Query query = new PrefixQuery(new Term("body", "ab"));
TopDocs topDocs = searcher.search(query, 10);
@ -956,7 +967,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.close();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer);
int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc;
PhraseQuery pq = new PhraseQuery.Builder()
@ -1076,7 +1087,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
IndexSearcher searcher = newSearcher(ir);
Query query = new PrefixQuery(new Term(field, "я"));
TopDocs topDocs = searcher.search(query, 1);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, analyzer);
String[] snippets = highlighter.highlight(field, query, topDocs);
assertEquals("[<b>я</b>]", Arrays.toString(snippets));
ir.close();
@ -1100,7 +1111,7 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
iw.commit();
try (IndexReader ir = iw.getReader()) {
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, analyzer);
highlighter.setBreakIterator(WholeBreakIterator::new);
// Test PrefixQuery

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
@ -38,6 +39,7 @@ import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
@ -46,6 +48,7 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
@ -55,6 +58,7 @@ import org.apache.lucene.util.QueryBuilder;
import org.junit.After;
import org.junit.Before;
//TODO rename to reflect position sensitivity
public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase {
final FieldType fieldType;
@ -151,6 +155,16 @@ public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase {
String[] snippets = highlighter.highlight("body", query, topDocs);
assertArrayEquals(new String[]{"<b>Yin</b> <b>yang</b>, <b>yin</b> gap yang"}, snippets);
// test that the Passage only has 3 matches; we don't want duplicates from "Yin" being in both the TermQuery and the PhraseQuery.
highlighter.setFormatter(new PassageFormatter() {
@Override
public Object format(Passage[] passages, String content) {
return Arrays.toString(passages);
}
});
assertArrayEquals(new String[]{"[Passage[0-22]{yin[0-3],yang[4-8],yin[10-13]}score=2.4964213]"},
highlighter.highlight("body", query, topDocs));
}
public void testPhraseNotInDoc() throws IOException {
@ -185,6 +199,16 @@ public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase {
String[] snippets = highlighter.highlight("body", query, topDocs);
assertArrayEquals(new String[]{"<b>alpha</b> <b>bravo</b> <b>charlie</b> - charlie bravo alpha"}, snippets);
// test that the Passage only has 3 matches; we don't want duplicates from both PhraseQueries.
highlighter.setFormatter(new PassageFormatter() {
@Override
public Object format(Passage[] passages, String content) {
return Arrays.toString(passages);
}
});
assertArrayEquals(new String[]{"[Passage[0-41]{alpha[0-5],bravo[6-11],charlie[12-19]}score=3.931102]"},
highlighter.highlight("body", query, topDocs));
}
public void testSynonyms() throws IOException {
@ -477,4 +501,68 @@ public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase {
return wrapped.hashCode();
}
}
// Ported from LUCENE-5455 (fixed in LUCENE-8121). Also see LUCENE-2287.
public void testNestedSpanQueryHighlight() throws Exception {
// For a long time, the highlighters assumed every query term within the SpanQuery was valid anywhere in the
// Spans' position range, which highlighted occurrences of terms the query never actually matched.
// Now that positions are gathered via the SpanCollector API, the UH no longer makes this mistake
// (a sketch of that API follows this test class).
final String FIELD_NAME = "body";
final String indexedText = "x y z x z x a";
indexWriter.addDocument(newDoc(indexedText));
initReaderSearcherHighlighter();
TopDocs topDocs = new TopDocs(1, new ScoreDoc[]{new ScoreDoc(0, 1f)}, 1f);
String expected = "<b>x</b> <b>y</b> <b>z</b> x z x <b>a</b>";
Query q = new SpanNearQuery(new SpanQuery[] {
new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term(FIELD_NAME, "x")),
new SpanTermQuery(new Term(FIELD_NAME, "y")),
new SpanTermQuery(new Term(FIELD_NAME, "z"))}, 0, true),
new SpanTermQuery(new Term(FIELD_NAME, "a"))}, 10, false);
String observed = highlighter.highlight(FIELD_NAME, q, topDocs)[0];
if (VERBOSE) System.out.println("Expected: \"" + expected + "\"\nObserved: \"" + observed + "\"");
assertEquals("Nested SpanNear query not properly highlighted.", expected, observed);
expected = "x <b>y</b> <b>z</b> <b>x</b> <b>z</b> x <b>a</b>";
q = new SpanNearQuery(new SpanQuery[] {
new SpanOrQuery(
new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term(FIELD_NAME, "x")),
new SpanTermQuery(new Term(FIELD_NAME, "z"))}, 0, true),
new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term(FIELD_NAME, "y")),
new SpanTermQuery(new Term(FIELD_NAME, "z"))}, 0, true)),
new SpanOrQuery(
new SpanTermQuery(new Term(FIELD_NAME, "a")),
new SpanTermQuery(new Term(FIELD_NAME, "b")))}, 10, false);
observed = highlighter.highlight(FIELD_NAME, q, topDocs)[0];
if (VERBOSE) System.out.println("Expected: \"" + expected + "\"\nObserved: \"" + observed + "\"");
assertEquals("Nested SpanNear query within SpanOr not properly highlighted.", expected, observed);
expected = "x <b>y</b> <b>z</b> <b>x</b> <b>z</b> x <b>a</b>";
q = new SpanNearQuery(new SpanQuery[] {
new SpanNearQuery(new SpanQuery[] {
new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term(FIELD_NAME, "*"))),
new SpanTermQuery(new Term(FIELD_NAME, "z"))}, 0, true),
new SpanTermQuery(new Term(FIELD_NAME, "a"))}, 10, false);
observed = highlighter.highlight(FIELD_NAME, q, topDocs)[0];
if (VERBOSE) System.out.println("Expected: \"" + expected + "\"\nObserved: \"" + observed + "\"");
assertEquals("Nested SpanNear query with wildcard not properly highlighted.", expected, observed);
expected = "<b>x</b> <b>y</b> z x z x <b>a</b>";
q = new SpanNearQuery(new SpanQuery[] {
new SpanOrQuery(
new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term(FIELD_NAME, "x")),
new SpanTermQuery(new Term(FIELD_NAME, "y"))}, 0, true),
new SpanNearQuery(new SpanQuery[] { // no-hit span query
new SpanTermQuery(new Term(FIELD_NAME, "z")),
new SpanTermQuery(new Term(FIELD_NAME, "a"))}, 0, true)),
new SpanTermQuery(new Term(FIELD_NAME, "a"))}, 10, false);
observed = highlighter.highlight(FIELD_NAME, q, topDocs)[0];
if (VERBOSE) System.out.println("Expected: \"" + expected + "\"\nObserved: \"" + observed + "\"");
assertEquals("Nested SpanNear query within SpanOr not properly highlighted.", expected, observed);
}
}
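The crux of the fix these assertions exercise: a position-sensitive query must highlight only the term occurrences its Spans actually matched. Below is a minimal sketch of the SpanCollector idiom the UnifiedHighlighter switched to; obtaining the Spans (normally via SpanWeight.getSpans) is elided, and MatchedTermPrinter is a hypothetical name, not part of this change.

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.Spans;

class MatchedTermPrinter {
  /** Walks every match of the given Spans and prints only the leaf term
   *  occurrences that participate in each match. */
  static void printMatches(Spans spans) throws IOException {
    SpanCollector collector = new SpanCollector() {
      @Override
      public void collectLeaf(PostingsEnum postings, int position, Term term) {
        // invoked once per leaf term occurrence inside the current match
        System.out.println(term.text() + " @ " + position);
      }

      @Override
      public void reset() {
      }
    };
    while (spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
        spans.collect(collector); // reports only the terms within this match
      }
    }
  }
}

For the first query above ("x y z x z x a"), this reports x@0, y@1, z@2 and a@6, but not the x/z occurrences at positions 3 through 5, which is exactly the distinction the assertions check.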

View File

@ -218,11 +218,9 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
// this code never runs; just for compilation
Passage p;
try (OffsetsEnum oe = new OffsetsEnum(null, EMPTY)) {
try (OffsetsEnum oe = new OffsetsEnum.OfPostings(null, EMPTY)) {
oe.getTerm();
oe.getPostingsEnum();
oe.freq();
oe.hasMorePositions();
oe.nextPosition();
oe.startOffset();
oe.endOffset();
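// nextPosition() now both advances and returns whether another position was
// available, replacing the removed hasMorePositions(); callers iterate with:
// while (oe.nextPosition()) { use oe.startOffset() / oe.endOffset(); }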