mirror of https://github.com/apache/lucene.git
LUCENE-8306: Allow iteration over submatches
Also includes LUCENE-8404, adding match iteration to SpanQuery
parent 995a902d1a
commit 028c86b1fa
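For orientation before the diff: a minimal sketch of how the new submatch API can be consumed. This snippet is not part of the commit; the searcher, query, document id and field name are all assumed for illustration.

import java.util.List;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

public class SubMatchesDemo {
  // Walk every match for 'query' in global document 'doc', then drill into the
  // per-term submatches introduced by this commit (assumed usage, not committed code).
  public static void printMatches(IndexSearcher searcher, Query query, int doc, String field) throws Exception {
    Weight w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1);
    List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
    LeafReaderContext ctx = leaves.get(ReaderUtil.subIndex(doc, leaves));
    Matches matches = w.matches(ctx, doc - ctx.docBase);
    if (matches == null) {
      return; // the query does not match this document
    }
    MatchesIterator it = matches.getMatches(field);
    if (it == null) {
      return; // no matches in this field
    }
    while (it.next()) {
      System.out.println(it.getQuery() + " matched positions "
          + it.startPosition() + "-" + it.endPosition());
      MatchesIterator subs = it.getSubMatches(); // null when already at a leaf
      while (subs != null && subs.next()) {
        System.out.println("  term at position " + subs.startPosition()
            + ", offsets " + subs.startOffset() + "-" + subs.endOffset());
      }
    }
  }
}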
DisjunctionMatchesIterator.java

@@ -45,14 +45,14 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
    *
    * Only terms that have at least one match in the given document will be included
    */
-  static MatchesIterator fromTerms(LeafReaderContext context, int doc, String field, List<Term> terms) throws IOException {
+  static MatchesIterator fromTerms(LeafReaderContext context, int doc, Query query, String field, List<Term> terms) throws IOException {
     Objects.requireNonNull(field);
     for (Term term : terms) {
       if (Objects.equals(field, term.field()) == false) {
         throw new IllegalArgumentException("Tried to generate iterator from terms in multiple fields: expected [" + field + "] but got [" + term.field() + "]");
       }
     }
-    return fromTermsEnum(context, doc, field, asBytesRefIterator(terms));
+    return fromTermsEnum(context, doc, query, field, asBytesRefIterator(terms));
   }

   private static BytesRefIterator asBytesRefIterator(List<Term> terms) {
@@ -72,7 +72,7 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
    *
    * Only terms that have at least one match in the given document will be included
    */
-  static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, String field, BytesRefIterator terms) throws IOException {
+  static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
     Objects.requireNonNull(field);
     List<MatchesIterator> mis = new ArrayList<>();
     Terms t = context.reader().terms(field);
@@ -84,7 +84,7 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
       if (te.seekExact(term)) {
         PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
         if (pe.advance(doc) == doc) {
-          mis.add(new TermMatchesIterator(pe));
+          mis.add(new TermMatchesIterator(query, pe));
           reuse = null;
         }
         else {
@@ -158,4 +158,13 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
     return queue.top().endOffset();
   }

+  @Override
+  public MatchesIterator getSubMatches() throws IOException {
+    return queue.top().getSubMatches();
+  }
+
+  @Override
+  public Query getQuery() {
+    return queue.top().getQuery();
+  }
 }
DocValuesRewriteMethod.java

@@ -78,7 +78,7 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
   @Override
   public Matches matches(LeafReaderContext context, int doc) throws IOException {
     final SortedSetDocValues fcsi = DocValues.getSortedSet(context.reader(), query.field);
-    return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query.field, getTermsEnum(fcsi)));
+    return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query, query.field, getTermsEnum(fcsi)));
   }

   private TermsEnum getTermsEnum(SortedSetDocValues fcsi) throws IOException {
MatchesIterator.java

@@ -28,6 +28,9 @@ import org.apache.lucene.index.LeafReaderContext;
  * positions and/or offsets after each call. You should not call the position or offset methods
  * before {@link #next()} has been called, or after {@link #next()} has returned {@code false}.
  *
+ * Matches from some queries may span multiple positions. You can retrieve the positions of
+ * individual matching terms on the current match by calling {@link #getSubMatches()}.
+ *
  * Matches are ordered by start position, and then by end position. Match intervals may overlap.
  *
  * @see Weight#matches(LeafReaderContext, int)
@@ -70,4 +73,25 @@ public interface MatchesIterator {
    */
   int endOffset() throws IOException;

+  /**
+   * Returns a MatchesIterator that iterates over the positions and offsets of individual
+   * terms within the current match
+   *
+   * Returns {@code null} if there are no submatches (ie the current iterator is at the
+   * leaf level)
+   *
+   * Should only be called after {@link #next()} has returned {@code true}
+   */
+  MatchesIterator getSubMatches() throws IOException;
+
+  /**
+   * Returns the Query causing the current match
+   *
+   * If this {@link MatchesIterator} has been returned from a {@link #getSubMatches()}
+   * call, then returns a {@link TermQuery} equivalent to the current match
+   *
+   * Should only be called after {@link #next()} has returned {@code true}
+   */
+  Query getQuery();
+
 }
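A short sketch of the contract the two new interface methods define. The helper below is illustrative only; the iterator is assumed to come from Matches#getMatches as in the snippet near the top of this page.

static void walk(MatchesIterator it) throws IOException {
  while (it.next()) {
    Query label = it.getQuery();            // the query responsible for this match
    MatchesIterator subs = it.getSubMatches();
    if (subs == null) {
      // leaf-level match: positions and offsets are read from 'it' itself
    } else {
      while (subs.next()) {
        // per the javadoc above, each submatch is labelled with an equivalent TermQuery
        TermQuery term = (TermQuery) subs.getQuery();
      }
    }
  }
}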
MultiPhraseQuery.java

@@ -269,7 +269,7 @@ public class MultiPhraseQuery extends Query {
         TermState termState = termStates.get(term).get(context);
         if (termState != null) {
           termsEnum.seekExact(term.bytes(), termState);
-          postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS));
+          postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
           totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
         }
       }
@@ -294,7 +294,7 @@ public class MultiPhraseQuery extends Query {
       return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
     }
     else {
-      return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
+      return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
     }

   }
@@ -647,5 +647,6 @@ public class MultiPhraseQuery extends Query {
     public BytesRef getPayload() throws IOException {
       return posQueue.top().pe.getPayload();
     }
+
   }
 }
MultiTermQueryConstantScoreWrapper.java

@@ -211,7 +211,7 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
     if (terms.hasPositions() == false) {
       return super.matches(context, doc);
     }
-    return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query.field, query.getTermsEnum(terms)));
+    return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query, query.field, query.getTermsEnum(terms)));
   }

   @Override
PhraseMatcher.java

@@ -88,4 +88,5 @@ abstract class PhraseMatcher {
   public float getMatchCost() {
     return matchCost;
   }
+
 }
PhraseQuery.java

@@ -446,7 +446,7 @@ public class PhraseQuery extends Query {
         return null;
       }
       te.seekExact(t.bytes(), state);
-      PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
+      PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS);
       postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t);
       totalMatchCost += termPositionsCost(te);
     }
@@ -457,7 +457,7 @@ public class PhraseQuery extends Query {
       return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
     }
     else {
-      return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
+      return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
     }
   }

PhraseWeight.java

@@ -123,6 +123,16 @@ abstract class PhraseWeight extends Weight {
         public int endOffset() throws IOException {
           return matcher.endOffset();
         }
+
+        @Override
+        public MatchesIterator getSubMatches() throws IOException {
+          return null; // phrases are treated as leaves
+        }
+
+        @Override
+        public Query getQuery() {
+          return PhraseWeight.this.getQuery();
+        }
       };
     });
   }
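The PhraseWeight change above deliberately reports a whole phrase as a single leaf match. A hedged sketch of what a consumer observes, assuming the Matches came from a PhraseQuery's Weight:

static void checkPhraseIsLeaf(Matches matches, String field) throws IOException {
  MatchesIterator it = matches.getMatches(field);
  while (it != null && it.next()) {
    assert it.getSubMatches() == null;           // phrases expose no submatches
    assert it.getQuery() instanceof PhraseQuery; // labelled with the phrase itself
  }
}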
SloppyPhraseMatcher.java

@@ -54,13 +54,14 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
   private final int slop;
   private final int numPostings;
   private final PhraseQueue pq; // for advancing min position
+  private final boolean captureLeadMatch;

   private int end; // current largest phrase position

   private int leadPosition;
   private int leadOffset;
-  private int currentEndPostings;
-  private int advanceEndPostings;
+  private int leadEndOffset;
+  private int leadOrd;

   private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
   private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
@@ -71,10 +72,11 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
   private boolean positioned;
   private int matchLength;

-  SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost) {
+  SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) {
     super(approximation(postings), matchCost);
     this.slop = slop;
     this.numPostings = postings.length;
+    this.captureLeadMatch = captureLeadMatch;
     pq = new PhraseQueue(postings.length);
     phrasePositions = new PhrasePositions[postings.length];
     for (int i = 0; i < postings.length; ++i) {
@@ -120,10 +122,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
       return false;
     }
     PhrasePositions pp = pq.pop();
-    assert pp != null; // if the pq is empty, then positioned == false
-    leadPosition = pp.position + pp.offset;
-    leadOffset = pp.postings.startOffset();
-    currentEndPostings = advanceEndPostings;
+    assert pp != null; // if the pq is not full, then positioned == false
+    captureLead(pp);
     matchLength = end - pp.position;
     int next = pq.top().position;
     while (advancePP(pp)) {
@@ -137,6 +137,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
         }
         pp = pq.pop();
         next = pq.top().position;
+        assert pp != null; // if the pq is not full, then positioned == false
         matchLength = end - pp.position;
       } else {
         int matchLength2 = end - pp.position;
@@ -144,14 +145,22 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
           matchLength = matchLength2;
         }
       }
-      leadPosition = pp.position + pp.offset;
-      leadOffset = pp.postings.startOffset();
-      currentEndPostings = advanceEndPostings;
+      captureLead(pp);
     }
     positioned = false;
     return matchLength <= slop;
   }

+  private void captureLead(PhrasePositions pp) throws IOException {
+    if (captureLeadMatch == false) {
+      return;
+    }
+    leadOrd = pp.ord;
+    leadPosition = pp.position + pp.offset;
+    leadOffset = pp.postings.startOffset();
+    leadEndOffset = pp.postings.endOffset();
+  }
+
   @Override
   public int startPosition() {
     // when a match is detected, the top postings is advanced until it has moved
@@ -160,6 +169,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
     // However, the priority queue doesn't guarantee that the top postings is in fact the
     // earliest in the list, so we need to cycle through all terms to check.
     // this is slow, but Matches is slow anyway...
+    int leadPosition = this.leadPosition;
     for (PhrasePositions pp : phrasePositions) {
       leadPosition = Math.min(leadPosition, pp.position + pp.offset);
     }
@@ -168,7 +178,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher {

   @Override
   public int endPosition() {
-    return phrasePositions[currentEndPostings].position + phrasePositions[currentEndPostings].offset;
+    int endPosition = leadPosition;
+    for (PhrasePositions pp : phrasePositions) {
+      if (pp.ord != leadOrd) {
+        endPosition = Math.max(endPosition, pp.position + pp.offset);
+      }
+    }
+    return endPosition;
   }

   @Override
@@ -179,6 +195,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
     // However, the priority queue doesn't guarantee that the top postings is in fact the
     // earliest in the list, so we need to cycle through all terms to check
     // this is slow, but Matches is slow anyway...
+    int leadOffset = this.leadOffset;
     for (PhrasePositions pp : phrasePositions) {
       leadOffset = Math.min(leadOffset, pp.postings.startOffset());
     }
@@ -187,7 +204,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher {

   @Override
   public int endOffset() throws IOException {
-    return phrasePositions[currentEndPostings].postings.endOffset();
+    int endOffset = leadEndOffset;
+    for (PhrasePositions pp : phrasePositions) {
+      if (pp.ord != leadOrd) {
+        endOffset = Math.max(endOffset, pp.postings.endOffset());
+      }
+    }
+    return endOffset;
   }

   /** advance a PhrasePosition and update 'end', return false if exhausted */
@@ -197,12 +220,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
     }
     if (pp.position > end) {
       end = pp.position;
-      advanceEndPostings = pp.ord;
     }
-    if (pp.position == end) {
-      if (pp.ord > advanceEndPostings) {
-        advanceEndPostings = pp.ord;
-      }
-    }
     return true;
   }
@@ -307,12 +324,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
       pp.firstPosition();
       if (pp.position > end) {
         end = pp.position;
-        advanceEndPostings = pp.ord;
       }
-      if (pp.position == end) {
-        if (pp.ord > advanceEndPostings) {
-          advanceEndPostings = pp.ord;
-        }
-      }
       pq.add(pp);
     }
@@ -342,12 +353,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
     for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max
       if (pp.position > end) {
         end = pp.position;
-        advanceEndPostings = pp.ord;
       }
-      if (pp.position == end) {
-        if (pp.ord > advanceEndPostings) {
-          advanceEndPostings = pp.ord;
-        }
-      }
       pq.add(pp);
     }
SynonymQuery.java

@@ -176,7 +176,7 @@ public final class SynonymQuery extends Query {
       if (terms == null || terms.hasPositions() == false) {
         return super.matches(context, doc);
       }
-      return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTerms(context, doc, field, Arrays.asList(SynonymQuery.this.terms)));
+      return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTerms(context, doc, getQuery(), field, Arrays.asList(SynonymQuery.this.terms)));
     }

     @Override
TermInSetQuery.java

@@ -226,7 +226,7 @@ public class TermInSetQuery extends Query implements Accountable {
       if (terms == null || terms.hasPositions() == false) {
         return super.matches(context, doc);
       }
-      return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, field, termData.iterator()));
+      return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, getQuery(), field, termData.iterator()));
     }

     /**
TermMatchesIterator.java

@@ -29,12 +29,14 @@ class TermMatchesIterator implements MatchesIterator {
   private int upto;
   private int pos;
   private final PostingsEnum pe;
+  private final Query query;

   /**
    * Create a new {@link TermMatchesIterator} for the given term and postings list
    */
-  TermMatchesIterator(PostingsEnum pe) throws IOException {
+  TermMatchesIterator(Query query, PostingsEnum pe) throws IOException {
     this.pe = pe;
+    this.query = query;
     this.upto = pe.freq();
   }

@@ -67,4 +69,13 @@ class TermMatchesIterator implements MatchesIterator {
     return pe.endOffset();
   }

+  @Override
+  public MatchesIterator getSubMatches() throws IOException {
+    return null;
+  }
+
+  @Override
+  public Query getQuery() {
+    return query;
+  }
 }
TermQuery.java

@@ -94,7 +94,7 @@ public class TermQuery extends Query {
       if (pe.advance(doc) != doc) {
         return null;
       }
-      return new TermMatchesIterator(pe);
+      return new TermMatchesIterator(getQuery(), pe);
     });
   }

SpanWeight.java

@@ -18,6 +18,8 @@ package org.apache.lucene.search.spans;


 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
 import java.util.Map;

 import org.apache.lucene.index.LeafReaderContext;
@@ -28,6 +30,10 @@ import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.LeafSimScorer;
+import org.apache.lucene.search.Matches;
+import org.apache.lucene.search.MatchesIterator;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.similarities.Similarity;
@@ -161,4 +167,138 @@ public abstract class SpanWeight extends Weight {

     return Explanation.noMatch("no matching term");
   }
+
+  private static class TermMatch {
+    Term term;
+    int position;
+    int startOffset;
+    int endOffset;
+  }
+
+  @Override
+  public Matches matches(LeafReaderContext context, int doc) throws IOException {
+    return Matches.forField(field, () -> {
+      Spans spans = getSpans(context, Postings.OFFSETS);
+      if (spans == null || spans.advance(doc) != doc) {
+        return null;
+      }
+      return new MatchesIterator() {
+
+        int innerTermCount = 0;
+        TermMatch[] innerTerms = new TermMatch[0];
+
+        SpanCollector termCollector = new SpanCollector() {
+          @Override
+          public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
+            innerTermCount++;
+            if (innerTermCount > innerTerms.length) {
+              TermMatch[] temp = new TermMatch[innerTermCount];
+              System.arraycopy(innerTerms, 0, temp, 0, innerTermCount - 1);
+              innerTerms = temp;
+              innerTerms[innerTermCount - 1] = new TermMatch();
+            }
+            innerTerms[innerTermCount - 1].term = term;
+            innerTerms[innerTermCount - 1].position = position;
+            innerTerms[innerTermCount - 1].startOffset = postings.startOffset();
+            innerTerms[innerTermCount - 1].endOffset = postings.endOffset();
+          }

+          @Override
+          public void reset() {
+            innerTermCount = 0;
+          }
+        };
+
+        @Override
+        public boolean next() throws IOException {
+          innerTermCount = 0;
+          return spans.nextStartPosition() != Spans.NO_MORE_POSITIONS;
+        }
+
+        @Override
+        public int startPosition() {
+          return spans.startPosition();
+        }
+
+        @Override
+        public int endPosition() {
+          return spans.endPosition() - 1;
+        }
+
+        @Override
+        public int startOffset() throws IOException {
+          if (innerTermCount == 0) {
+            collectInnerTerms();
+          }
+          return innerTerms[0].startOffset;
+        }
+
+        @Override
+        public int endOffset() throws IOException {
+          if (innerTermCount == 0) {
+            collectInnerTerms();
+          }
+          return innerTerms[innerTermCount - 1].endOffset;
+        }
+
+        @Override
+        public MatchesIterator getSubMatches() throws IOException {
+          if (innerTermCount == 0) {
+            collectInnerTerms();
+          }
+          return new MatchesIterator() {
+
+            int upto = -1;
+
+            @Override
+            public boolean next() throws IOException {
+              upto++;
+              return upto < innerTermCount;
+            }
+
+            @Override
+            public int startPosition() {
+              return innerTerms[upto].position;
+            }
+
+            @Override
+            public int endPosition() {
+              return innerTerms[upto].position;
+            }
+
+            @Override
+            public int startOffset() throws IOException {
+              return innerTerms[upto].startOffset;
+            }
+
+            @Override
+            public int endOffset() throws IOException {
+              return innerTerms[upto].endOffset;
+            }
+
+            @Override
+            public MatchesIterator getSubMatches() throws IOException {
+              return null;
+            }
+
+            @Override
+            public Query getQuery() {
+              return new TermQuery(innerTerms[upto].term);
+            }
+          };
+        }
+
+        @Override
+        public Query getQuery() {
+          return SpanWeight.this.getQuery();
+        }
+
+        void collectInnerTerms() throws IOException {
+          termCollector.reset();
+          spans.collect(termCollector);
+          Arrays.sort(innerTerms, 0, innerTermCount, Comparator.comparing(a -> a.position));
+        }
+      };
+    });
+  }
 }
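To make the new SpanWeight.matches concrete: a span match covers the whole span, and getSubMatches() walks the terms collected from it in position order, each labelled with an equivalent TermQuery. A minimal sketch; the query shape is borrowed from the test below, everything else is assumed.

static void printSpanSubMatches(IndexSearcher searcher, LeafReaderContext ctx, int docInLeaf) throws IOException {
  SpanQuery q = SpanNearQuery.newOrderedNearQuery("field")          // assumed field name
      .addClause(new SpanTermQuery(new Term("field", "with")))
      .addClause(new SpanTermQuery(new Term("field", "many")))
      .build();
  Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
  Matches matches = w.matches(ctx, docInLeaf);
  if (matches == null) {
    return; // no match in this document
  }
  MatchesIterator it = matches.getMatches("field");
  while (it != null && it.next()) {
    MatchesIterator subs = it.getSubMatches(); // one entry per collected term
    while (subs.next()) {
      System.out.println(subs.getQuery() + " at position " + subs.startPosition());
    }
  }
}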
TestMatchesIterator.java

@@ -18,8 +18,12 @@
 package org.apache.lucene.search;

 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.Objects;
+import java.util.Set;
 import java.util.stream.Collectors;

 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@@ -29,9 +33,14 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;

@@ -99,7 +108,7 @@ public class TestMatchesIterator extends LuceneTestCase {
     "nothing matches this document"
   };

-  void checkMatches(Query q, String field, int[][] expected) throws IOException {
+  private void checkMatches(Query q, String field, int[][] expected) throws IOException {
     Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
     for (int i = 0; i < expected.length; i++) {
       LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(expected[i][0], searcher.leafContexts));
@@ -112,14 +121,40 @@ public class TestMatchesIterator extends LuceneTestCase {
       MatchesIterator it = matches.getMatches(field);
       if (expected[i].length == 1) {
         assertNull(it);
-        return;
+        continue;
       }
       checkFieldMatches(it, expected[i]);
+      checkFieldMatches(matches.getMatches(field), expected[i]);  // test multiple calls
     }
   }

-  void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
+  private void checkLabelCount(Query q, String field, int[] expected) throws IOException {
+    Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
+    for (int i = 0; i < expected.length; i++) {
+      LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
+      int doc = i - ctx.docBase;
+      Matches matches = w.matches(ctx, doc);
+      if (matches == null) {
+        assertEquals("Expected to get matches on document " + i, 0, expected[i]);
+        continue;
+      }
+      MatchesIterator it = matches.getMatches(field);
+      if (expected[i] == 0) {
+        assertNull(it);
+        continue;
+      }
+      else {
+        assertNotNull(it);
+      }
+      IdentityHashMap<Query, Integer> labels = new IdentityHashMap<>();
+      while (it.next()) {
+        labels.put(it.getQuery(), 1);
+      }
+      assertEquals(expected[i], labels.size());
+    }
+  }
+
+  private void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
     int pos = 1;
     while (it.next()) {
       //System.out.println(expected[i][pos] + "->" + expected[i][pos + 1] + "[" + expected[i][pos + 2] + "->" + expected[i][pos + 3] + "]");
@@ -132,7 +167,7 @@ public class TestMatchesIterator extends LuceneTestCase {
     assertEquals(expected.length, pos);
   }

-  void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
+  private void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
     Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
     for (int i = 0; i < expected.length; i++) {
       LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
@@ -148,8 +183,109 @@ public class TestMatchesIterator extends LuceneTestCase {
     }
   }

+  private void assertIsLeafMatch(Query q, String field) throws IOException {
+    Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
+    for (int i = 0; i < searcher.reader.maxDoc(); i++) {
+      LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
+      int doc = i - ctx.docBase;
+      Matches matches = w.matches(ctx, doc);
+      if (matches == null) {
+        return;
+      }
+      MatchesIterator mi = matches.getMatches(field);
+      if (mi == null) {
+        return;
+      }
+      while (mi.next()) {
+        assertNull(mi.getSubMatches());
+      }
+    }
+  }
+
+  private void checkTermMatches(Query q, String field, TermMatch[][][] expected) throws IOException {
+    Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
+    for (int i = 0; i < expected.length; i++) {
+      LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
+      int doc = i - ctx.docBase;
+      Matches matches = w.matches(ctx, doc);
+      if (matches == null) {
+        assertEquals(expected[i].length, 0);
+        continue;
+      }
+      MatchesIterator it = matches.getMatches(field);
+      if (expected[i].length == 0) {
+        assertNull(it);
+        continue;
+      }
+      checkTerms(expected[i], it);
+    }
+  }
+
+  private void checkTerms(TermMatch[][] expected, MatchesIterator it) throws IOException {
+    int upTo = 0;
+    while (it.next()) {
+      Set<TermMatch> expectedMatches = new HashSet<>(Arrays.asList(expected[upTo]));
+      MatchesIterator submatches = it.getSubMatches();
+      while (submatches.next()) {
+        TermMatch tm = new TermMatch(submatches.startPosition(), submatches.startOffset(), submatches.endOffset());
+        if (expectedMatches.remove(tm) == false) {
+          fail("Unexpected term match: " + tm);
+        }
+      }
+      if (expectedMatches.size() != 0) {
+        fail("Missing term matches: " + expectedMatches.stream().map(Object::toString).collect(Collectors.joining(", ")));
+      }
+      upTo++;
+    }
+    if (upTo < expected.length - 1) {
+      fail("Missing expected match");
+    }
+  }
+
+  static class TermMatch {
+
+    public final int position;
+
+    public final int startOffset;
+
+    public final int endOffset;
+
+    public TermMatch(PostingsEnum pe, int position) throws IOException {
+      this.position = position;
+      this.startOffset = pe.startOffset();
+      this.endOffset = pe.endOffset();
+    }
+
+    public TermMatch(int position, int startOffset, int endOffset) {
+      this.position = position;
+      this.startOffset = startOffset;
+      this.endOffset = endOffset;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (o == null || getClass() != o.getClass()) return false;
+      TermMatch termMatch = (TermMatch) o;
+      return position == termMatch.position &&
+          startOffset == termMatch.startOffset &&
+          endOffset == termMatch.endOffset;
+    }
+
+    @Override
+    public int hashCode() {
+      return Objects.hash(position, startOffset, endOffset);
+    }
+
+    @Override
+    public String toString() {
+      return position + "[" + startOffset + "->" + endOffset + "]";
+    }
+  }
+
   public void testTermQuery() throws IOException {
-    Query q = new TermQuery(new Term(FIELD_WITH_OFFSETS, "w1"));
+    Term t = new Term(FIELD_WITH_OFFSETS, "w1");
+    Query q = new TermQuery(t);
     checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
         { 0, 0, 0, 0, 2 },
         { 1, 0, 0, 0, 2 },
@@ -157,6 +293,8 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 3, 0, 0, 0, 2, 2, 2, 6, 8 },
         { 4 }
     });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0, 0 });
+    assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
   }

   public void testTermQueryNoStoredOffsets() throws IOException {
@@ -191,6 +329,8 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
         { 4 }
     });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 1, 2, 0, 0 });
+    assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
   }

   public void testDisjunctionNoPositions() throws IOException {
@@ -215,6 +355,7 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
         { 4 }
     });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 0, 2, 0, 0 });
   }

   public void testReqOptNoPositions() throws IOException {
@@ -248,6 +389,8 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 3, 0, 0, 0, 2, 2, 2, 6, 8, 3, 3, 9, 11, 5, 5, 15, 17 },
         { 4 }
     });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 3, 1, 3, 3, 0, 0 });
+    assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
   }

   public void testMinShouldMatchNoPositions() throws IOException {
@@ -331,6 +474,8 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 },
         { 4 }
     });
+    checkLabelCount(rq, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0 });
+    assertIsLeafMatch(rq, FIELD_WITH_OFFSETS);

   }

@@ -357,6 +502,7 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 },
         { 4 }
     });
+    assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
   }

   public void testSynonymQueryNoPositions() throws IOException {
@@ -392,12 +538,25 @@ public class TestMatchesIterator extends LuceneTestCase {
   // 0 1 2 3 4 5 6 7
   // "a phrase sentence with many phrase sentence iterations of a phrase sentence",

+  public void testSloppyPhraseQueryWithRepeats() throws IOException {
+    Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
+    Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
+    PhraseQuery pq = new PhraseQuery(10, FIELD_WITH_OFFSETS, "phrase", "sentence", "sentence");
+    checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
+        { 0 }, { 1 }, { 2 }, { 3 },
+        { 4, 1, 6, 2, 43, 2, 11, 9, 75, 5, 11, 28, 75, 6, 11, 35, 75 }
+    });
+    checkLabelCount(pq, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
+    assertIsLeafMatch(pq, FIELD_WITH_OFFSETS);
+  }
+
   public void testSloppyPhraseQuery() throws IOException {
     PhraseQuery pq = new PhraseQuery(4, FIELD_WITH_OFFSETS, "a", "sentence");
     checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
         { 0 }, { 1 }, { 2 }, { 3 },
         { 4, 0, 2, 0, 17, 6, 9, 35, 59, 9, 11, 58, 75 }
     });
+    assertIsLeafMatch(pq, FIELD_WITH_OFFSETS);
   }

   public void testExactPhraseQuery() throws IOException {
@@ -407,29 +566,36 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 4, 1, 2, 2, 17, 5, 6, 28, 43, 10, 11, 60, 75 }
     });

+    Term a = new Term(FIELD_WITH_OFFSETS, "a");
+    Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
     PhraseQuery pq2 = new PhraseQuery.Builder()
-        .add(new Term(FIELD_WITH_OFFSETS, "a"))
-        .add(new Term(FIELD_WITH_OFFSETS, "sentence"), 2)
+        .add(a)
+        .add(s, 2)
         .build();
     checkMatches(pq2, FIELD_WITH_OFFSETS, new int[][]{
         { 0 }, { 1 }, { 2 }, { 3 },
         { 4, 0, 2, 0, 17, 9, 11, 58, 75 }
     });
+    assertIsLeafMatch(pq2, FIELD_WITH_OFFSETS);
   }

   // 0 1 2 3 4 5 6 7
   // "a phrase sentence with many phrase sentence iterations of a phrase sentence",

   public void testSloppyMultiPhraseQuery() throws IOException {
+    Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
+    Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
+    Term i = new Term(FIELD_WITH_OFFSETS, "iterations");
     MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
-        .add(new Term(FIELD_WITH_OFFSETS, "phrase"))
-        .add(new Term[]{ new Term(FIELD_WITH_OFFSETS, "sentence"), new Term(FIELD_WITH_OFFSETS, "iterations") })
+        .add(p)
+        .add(new Term[]{ s, i })
         .setSlop(4)
         .build();
     checkMatches(mpq, FIELD_WITH_OFFSETS, new int[][]{
         { 0 }, { 1 }, { 2 }, { 3 },
-        { 4, 1, 2, 2, 17, 5, 7, 28, 54, 5, 7, 28, 54, 10, 11, 60, 75 }
+        { 4, 1, 2, 2, 17, 5, 6, 28, 43, 5, 7, 28, 54, 10, 11, 60, 75 }
     });
+    assertIsLeafMatch(mpq, FIELD_WITH_OFFSETS);
   }

   public void testExactMultiPhraseQuery() throws IOException {
@@ -450,6 +616,38 @@ public class TestMatchesIterator extends LuceneTestCase {
         { 0 }, { 1 }, { 2 }, { 3 },
         { 4, 0, 1, 0, 8, 4, 5, 23, 34, 9, 10, 58, 66 }
     });
+    assertIsLeafMatch(mpq2, FIELD_WITH_OFFSETS);
   }

+  // 0 1 2 3 4 5 6 7
+  // "a phrase sentence with many phrase sentence iterations of a phrase sentence",
+
+  public void testSpanQuery() throws IOException {
+    SpanQuery subq = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
+        .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "with")))
+        .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "many")))
+        .build();
+    Query q = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
+        .addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "sentence")))
+        .addClause(new SpanOrQuery(subq, new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "iterations"))))
+        .build();
+    checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
+        { 0 }, { 1 }, { 2 }, { 3 },
+        { 4, 2, 4, 9, 27, 6, 7, 35, 54 }
+    });
+    checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
+    checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
+        {}, {}, {}, {},
+        {
+            {
+                new TermMatch(2, 9, 17),
+                new TermMatch(3, 18, 22),
+                new TermMatch(4, 23, 27)
+            }, {
+                new TermMatch(6, 35, 43), new TermMatch(7, 44, 54)
+            }
+        }
+    });
+  }
+
 }
AssertingMatchesIterator.java

@@ -67,4 +67,15 @@ class AssertingMatchesIterator implements MatchesIterator {
     return in.endOffset();
   }

+  @Override
+  public MatchesIterator getSubMatches() throws IOException {
+    assert state == State.ITERATING : state;
+    return in.getSubMatches();
+  }
+
+  @Override
+  public Query getQuery() {
+    assert state == State.ITERATING : state;
+    return in.getQuery();
+  }
 }