mirror of https://github.com/apache/lucene.git
LUCENE-8306: Allow iteration over submatches
This commit is contained in:
parent
2826a9550b
commit
a8839b7eab
|
@ -198,6 +198,9 @@ Improvements
|
|||
* LUCENE-8345, GitHub PR #392: Remove instantiation of redundant wrapper classes for primitives;
|
||||
add wrapper class constructors to forbiddenapis. (Michael Braun via Uwe Schindler)
|
||||
|
||||
* LUCENE-8306: Matches API now allows iteration over sub-matches in Spans (Alan Woodward,
|
||||
Jim Ferenczi, David Smiley)
|
||||
|
||||
Other:
|
||||
|
||||
* LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's
|
||||
|
|
|
@ -158,4 +158,13 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
|
|||
return queue.top().endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
return queue.top().getSubMatches();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return queue.top().label();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -149,4 +149,48 @@ final class ExactPhraseMatcher extends PhraseMatcher {
|
|||
return postings[postings.length - 1].postings.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
MatchesIterator getSubMatches() {
|
||||
return new MatchesIterator() {
|
||||
|
||||
int upTo = -1;
|
||||
|
||||
@Override
|
||||
public boolean next() throws IOException {
|
||||
upTo++;
|
||||
return upTo < postings.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
return postings[upTo].pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
return postings[upTo].pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return postings[upTo].postings.startOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return postings[upTo].postings.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
return MatchesIterator.EMPTY_ITERATOR;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return this;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -28,6 +28,9 @@ import org.apache.lucene.index.LeafReaderContext;
|
|||
* positions and/or offsets after each call. You should not call the position or offset methods
|
||||
* before {@link #next()} has been called, or after {@link #next()} has returned {@code false}.
|
||||
*
|
||||
* Matches from some queries may span multiple positions. You can retrieve the positions of
|
||||
* individual matching terms on the current match by calling {@link #getSubMatches()}.
|
||||
*
|
||||
* Matches are ordered by start position, and then by end position. Match intervals may overlap.
|
||||
*
|
||||
* @see Weight#matches(LeafReaderContext, int)
|
||||
|
@ -70,4 +73,59 @@ public interface MatchesIterator {
|
|||
*/
|
||||
int endOffset() throws IOException;
|
||||
|
||||
/**
|
||||
* Returns a MatchesIterator that iterates over the positions and offsets of individual
|
||||
* terms within the current match.
|
||||
*
|
||||
* Should only be called after {@link #next()} has returned {@code true}
|
||||
*/
|
||||
MatchesIterator getSubMatches() throws IOException;
|
||||
|
||||
/**
|
||||
* Returns a label identifying the leaf query causing the current match.
|
||||
*
|
||||
* Should only be called after {@link #next()} has returned {@code true}
|
||||
*/
|
||||
Object label();
|
||||
|
||||
/**
|
||||
* A MatchesIterator that is immediately exhausted
|
||||
*/
|
||||
MatchesIterator EMPTY_ITERATOR = new MatchesIterator() {
|
||||
@Override
|
||||
public boolean next() throws IOException {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return this;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -269,7 +269,7 @@ public class MultiPhraseQuery extends Query {
|
|||
TermState termState = termStates.get(term).get(context);
|
||||
if (termState != null) {
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS));
|
||||
postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
|
||||
totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
|
||||
}
|
||||
}
|
||||
|
@ -294,7 +294,7 @@ public class MultiPhraseQuery extends Query {
|
|||
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
|
||||
}
|
||||
else {
|
||||
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
|
||||
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -647,5 +647,6 @@ public class MultiPhraseQuery extends Query {
|
|||
public BytesRef getPayload() throws IOException {
|
||||
return posQueue.top().pe.getPayload();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -88,4 +88,6 @@ abstract class PhraseMatcher {
|
|||
public float getMatchCost() {
|
||||
return matchCost;
|
||||
}
|
||||
|
||||
abstract MatchesIterator getSubMatches() throws IOException;
|
||||
}
|
||||
|
|
|
@ -446,7 +446,7 @@ public class PhraseQuery extends Query {
|
|||
return null;
|
||||
}
|
||||
te.seekExact(t.bytes(), state);
|
||||
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
|
||||
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS);
|
||||
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t);
|
||||
totalMatchCost += termPositionsCost(te);
|
||||
}
|
||||
|
@ -457,7 +457,7 @@ public class PhraseQuery extends Query {
|
|||
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
|
||||
}
|
||||
else {
|
||||
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
|
||||
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -123,6 +123,16 @@ abstract class PhraseWeight extends Weight {
|
|||
public int endOffset() throws IOException {
|
||||
return matcher.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
return matcher.getSubMatches();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return matcher;
|
||||
}
|
||||
};
|
||||
});
|
||||
}
|
||||
|
|
|
@ -54,13 +54,14 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
private final int slop;
|
||||
private final int numPostings;
|
||||
private final PhraseQueue pq; // for advancing min position
|
||||
private final boolean captureLeadMatch;
|
||||
|
||||
private int end; // current largest phrase position
|
||||
|
||||
private int leadPosition;
|
||||
private int leadOffset;
|
||||
private int currentEndPostings;
|
||||
private int advanceEndPostings;
|
||||
private int leadEndOffset;
|
||||
private int leadOrd;
|
||||
|
||||
private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
|
||||
private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
|
||||
|
@ -71,10 +72,11 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
private boolean positioned;
|
||||
private int matchLength;
|
||||
|
||||
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost) {
|
||||
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) {
|
||||
super(approximation(postings), matchCost);
|
||||
this.slop = slop;
|
||||
this.numPostings = postings.length;
|
||||
this.captureLeadMatch = captureLeadMatch;
|
||||
pq = new PhraseQueue(postings.length);
|
||||
phrasePositions = new PhrasePositions[postings.length];
|
||||
for (int i = 0; i < postings.length; ++i) {
|
||||
|
@ -120,10 +122,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
return false;
|
||||
}
|
||||
PhrasePositions pp = pq.pop();
|
||||
assert pp != null; // if the pq is empty, then positioned == false
|
||||
leadPosition = pp.position + pp.offset;
|
||||
leadOffset = pp.postings.startOffset();
|
||||
currentEndPostings = advanceEndPostings;
|
||||
assert pp != null; // if the pq is not full, then positioned == false
|
||||
captureLead(pp);
|
||||
matchLength = end - pp.position;
|
||||
int next = pq.top().position;
|
||||
while (advancePP(pp)) {
|
||||
|
@ -137,6 +137,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
}
|
||||
pp = pq.pop();
|
||||
next = pq.top().position;
|
||||
assert pp != null; // if the pq is not full, then positioned == false
|
||||
matchLength = end - pp.position;
|
||||
} else {
|
||||
int matchLength2 = end - pp.position;
|
||||
|
@ -144,14 +145,22 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
matchLength = matchLength2;
|
||||
}
|
||||
}
|
||||
leadPosition = pp.position + pp.offset;
|
||||
leadOffset = pp.postings.startOffset();
|
||||
currentEndPostings = advanceEndPostings;
|
||||
captureLead(pp);
|
||||
}
|
||||
positioned = false;
|
||||
return matchLength <= slop;
|
||||
}
|
||||
|
||||
private void captureLead(PhrasePositions pp) throws IOException {
|
||||
if (captureLeadMatch == false) {
|
||||
return;
|
||||
}
|
||||
leadOrd = pp.ord;
|
||||
leadPosition = pp.position + pp.offset;
|
||||
leadOffset = pp.postings.startOffset();
|
||||
leadEndOffset = pp.postings.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
// when a match is detected, the top postings is advanced until it has moved
|
||||
|
@ -160,6 +169,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
// However, the priority queue doesn't guarantee that the top postings is in fact the
|
||||
// earliest in the list, so we need to cycle through all terms to check.
|
||||
// this is slow, but Matches is slow anyway...
|
||||
int leadPosition = this.leadPosition;
|
||||
for (PhrasePositions pp : phrasePositions) {
|
||||
leadPosition = Math.min(leadPosition, pp.position + pp.offset);
|
||||
}
|
||||
|
@ -168,7 +178,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
return phrasePositions[currentEndPostings].position + phrasePositions[currentEndPostings].offset;
|
||||
int endPosition = leadPosition;
|
||||
for (PhrasePositions pp : phrasePositions) {
|
||||
if (pp.ord != leadOrd) {
|
||||
endPosition = Math.max(endPosition, pp.position + pp.offset);
|
||||
}
|
||||
}
|
||||
return endPosition;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -179,6 +195,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
// However, the priority queue doesn't guarantee that the top postings is in fact the
|
||||
// earliest in the list, so we need to cycle through all terms to check
|
||||
// this is slow, but Matches is slow anyway...
|
||||
int leadOffset = this.leadOffset;
|
||||
for (PhrasePositions pp : phrasePositions) {
|
||||
leadOffset = Math.min(leadOffset, pp.postings.startOffset());
|
||||
}
|
||||
|
@ -187,7 +204,69 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return phrasePositions[currentEndPostings].postings.endOffset();
|
||||
int endOffset = leadEndOffset;
|
||||
for (PhrasePositions pp : phrasePositions) {
|
||||
if (pp.ord != leadOrd) {
|
||||
endOffset = Math.max(endOffset, pp.postings.endOffset());
|
||||
}
|
||||
}
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
MatchesIterator getSubMatches() throws IOException {
|
||||
int[][] submatches = new int[phrasePositions.length][3];
|
||||
for (PhrasePositions pp : phrasePositions) {
|
||||
if (pp.ord == leadOrd) {
|
||||
submatches[pp.ord][0] = leadPosition;
|
||||
submatches[pp.ord][1] = leadOffset;
|
||||
submatches[pp.ord][2] = leadEndOffset;
|
||||
}
|
||||
else {
|
||||
submatches[pp.ord][0] = pp.position + pp.offset;
|
||||
submatches[pp.ord][1] = pp.postings.startOffset();
|
||||
submatches[pp.ord][2] = pp.postings.endOffset();
|
||||
}
|
||||
}
|
||||
Arrays.sort(submatches, Comparator.comparingInt(a -> a[0]));
|
||||
return new MatchesIterator() {
|
||||
int upTo = -1;
|
||||
@Override
|
||||
public boolean next() throws IOException {
|
||||
upTo++;
|
||||
return upTo < submatches.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
return submatches[upTo][0];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
return submatches[upTo][0];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
return submatches[upTo][1];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
return submatches[upTo][2];
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() {
|
||||
return MatchesIterator.EMPTY_ITERATOR;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return this;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/** advance a PhrasePosition and update 'end', return false if exhausted */
|
||||
|
@ -197,12 +276,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
}
|
||||
if (pp.position > end) {
|
||||
end = pp.position;
|
||||
advanceEndPostings = pp.ord;
|
||||
}
|
||||
if (pp.position == end) {
|
||||
if (pp.ord > advanceEndPostings) {
|
||||
advanceEndPostings = pp.ord;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -307,12 +380,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
pp.firstPosition();
|
||||
if (pp.position > end) {
|
||||
end = pp.position;
|
||||
advanceEndPostings = pp.ord;
|
||||
}
|
||||
if (pp.position == end) {
|
||||
if (pp.ord > advanceEndPostings) {
|
||||
advanceEndPostings = pp.ord;
|
||||
}
|
||||
}
|
||||
pq.add(pp);
|
||||
}
|
||||
|
@ -342,12 +409,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
|||
for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max
|
||||
if (pp.position > end) {
|
||||
end = pp.position;
|
||||
advanceEndPostings = pp.ord;
|
||||
}
|
||||
if (pp.position == end) {
|
||||
if (pp.ord > advanceEndPostings) {
|
||||
advanceEndPostings = pp.ord;
|
||||
}
|
||||
}
|
||||
pq.add(pp);
|
||||
}
|
||||
|
|
|
@ -67,4 +67,54 @@ class TermMatchesIterator implements MatchesIterator {
|
|||
return pe.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return pe;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
return new MatchesIterator() {
|
||||
|
||||
boolean exhausted = false;
|
||||
|
||||
@Override
|
||||
public boolean next() {
|
||||
if (exhausted) {
|
||||
return false;
|
||||
}
|
||||
return exhausted = true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return pe.startOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return pe.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() {
|
||||
return MatchesIterator.EMPTY_ITERATOR;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return this;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,8 @@ package org.apache.lucene.search.spans;
|
|||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -28,6 +30,8 @@ import org.apache.lucene.search.CollectionStatistics;
|
|||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.LeafSimScorer;
|
||||
import org.apache.lucene.search.Matches;
|
||||
import org.apache.lucene.search.MatchesIterator;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
@ -161,4 +165,131 @@ public abstract class SpanWeight extends Weight {
|
|||
|
||||
return Explanation.noMatch("no matching term");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Matches matches(LeafReaderContext context, int doc) throws IOException {
|
||||
return Matches.forField(field, () -> {
|
||||
Spans spans = getSpans(context, Postings.OFFSETS);
|
||||
if (spans == null) {
|
||||
return null;
|
||||
}
|
||||
if (spans.advance(doc) != doc) {
|
||||
return null;
|
||||
}
|
||||
return new MatchesIterator() {
|
||||
|
||||
int innerTermCount = 0;
|
||||
int[][] innerTerms = new int[2][3];
|
||||
SpanCollector termCollector = new SpanCollector() {
|
||||
@Override
|
||||
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
|
||||
innerTermCount++;
|
||||
if (innerTermCount > innerTerms.length) {
|
||||
int[][] temp = new int[innerTermCount][3];
|
||||
System.arraycopy(innerTerms, 0, temp, 0, innerTermCount - 1);
|
||||
innerTerms = temp;
|
||||
}
|
||||
innerTerms[innerTermCount - 1][0] = position;
|
||||
innerTerms[innerTermCount - 1][1] = postings.startOffset();
|
||||
innerTerms[innerTermCount - 1][2] = postings.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
innerTermCount = 0;
|
||||
}
|
||||
};
|
||||
|
||||
@Override
|
||||
public boolean next() throws IOException {
|
||||
innerTermCount = 0;
|
||||
return spans.nextStartPosition() != Spans.NO_MORE_POSITIONS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
return spans.startPosition();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
return spans.endPosition() - 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
if (innerTermCount == 0) {
|
||||
collectInnerTerms();
|
||||
}
|
||||
return innerTerms[0][1];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
if (innerTermCount == 0) {
|
||||
collectInnerTerms();
|
||||
}
|
||||
return innerTerms[innerTermCount - 1][2];
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
if (innerTermCount == 0) {
|
||||
collectInnerTerms();
|
||||
}
|
||||
return new MatchesIterator() {
|
||||
|
||||
int upto = -1;
|
||||
|
||||
@Override
|
||||
public boolean next() throws IOException {
|
||||
upto++;
|
||||
return upto < innerTermCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startPosition() {
|
||||
return innerTerms[upto][0];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endPosition() {
|
||||
return innerTerms[upto][0];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return innerTerms[upto][1];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return innerTerms[upto][2];
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
return MatchesIterator.EMPTY_ITERATOR;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return this;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
return SpanWeight.this;
|
||||
}
|
||||
|
||||
void collectInnerTerms() throws IOException {
|
||||
termCollector.reset();
|
||||
spans.collect(termCollector);
|
||||
Arrays.sort(innerTerms, 0, innerTermCount, Comparator.comparing(a -> a[0]));
|
||||
}
|
||||
};
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,8 +18,12 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -29,9 +33,14 @@ import org.apache.lucene.document.TextField;
|
|||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
@ -99,7 +108,7 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
"nothing matches this document"
|
||||
};
|
||||
|
||||
void checkMatches(Query q, String field, int[][] expected) throws IOException {
|
||||
private void checkMatches(Query q, String field, int[][] expected) throws IOException {
|
||||
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(expected[i][0], searcher.leafContexts));
|
||||
|
@ -112,14 +121,40 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
MatchesIterator it = matches.getMatches(field);
|
||||
if (expected[i].length == 1) {
|
||||
assertNull(it);
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
checkFieldMatches(it, expected[i]);
|
||||
checkFieldMatches(matches.getMatches(field), expected[i]); // test multiple calls
|
||||
}
|
||||
}
|
||||
|
||||
void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
|
||||
private void checkLabelCount(Query q, String field, int[] expected) throws IOException {
|
||||
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
|
||||
int doc = i - ctx.docBase;
|
||||
Matches matches = w.matches(ctx, doc);
|
||||
if (matches == null) {
|
||||
assertEquals("Expected to get matches on document " + i, 0, expected[i]);
|
||||
continue;
|
||||
}
|
||||
MatchesIterator it = matches.getMatches(field);
|
||||
if (expected[i] == 0) {
|
||||
assertNull(it);
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
assertNotNull(it);
|
||||
}
|
||||
IdentityHashMap<Object, Integer> labels = new IdentityHashMap<>();
|
||||
while (it.next()) {
|
||||
labels.put(it.label(), 1);
|
||||
}
|
||||
assertEquals(expected[i], labels.size());
|
||||
}
|
||||
}
|
||||
|
||||
private void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
|
||||
int pos = 1;
|
||||
while (it.next()) {
|
||||
//System.out.println(expected[i][pos] + "->" + expected[i][pos + 1] + "[" + expected[i][pos + 2] + "->" + expected[i][pos + 3] + "]");
|
||||
|
@ -132,7 +167,7 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
assertEquals(expected.length, pos);
|
||||
}
|
||||
|
||||
void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
|
||||
private void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
|
||||
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
|
||||
|
@ -148,8 +183,90 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private void checkTermMatches(Query q, String field, TermMatch[][][] expected) throws IOException {
|
||||
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
|
||||
int doc = i - ctx.docBase;
|
||||
Matches matches = w.matches(ctx, doc);
|
||||
if (matches == null) {
|
||||
assertEquals(expected[i].length, 0);
|
||||
continue;
|
||||
}
|
||||
MatchesIterator it = matches.getMatches(field);
|
||||
if (expected[i].length == 0) {
|
||||
assertNull(it);
|
||||
continue;
|
||||
}
|
||||
checkTerms(expected[i], it);
|
||||
}
|
||||
}
|
||||
|
||||
private void checkTerms(TermMatch[][] expected, MatchesIterator it) throws IOException {
|
||||
int upTo = 0;
|
||||
while (it.next()) {
|
||||
Set<TermMatch> expectedMatches = new HashSet<>(Arrays.asList(expected[upTo]));
|
||||
MatchesIterator submatches = it.getSubMatches();
|
||||
while (submatches.next()) {
|
||||
TermMatch tm = new TermMatch(submatches.startPosition(), submatches.startOffset(), submatches.endOffset());
|
||||
if (expectedMatches.remove(tm) == false) {
|
||||
fail("Unexpected term match: " + tm);
|
||||
}
|
||||
}
|
||||
if (expectedMatches.size() != 0) {
|
||||
fail("Missing term matches: " + expectedMatches.stream().map(Object::toString).collect(Collectors.joining(", ")));
|
||||
}
|
||||
upTo++;
|
||||
}
|
||||
if (upTo < expected.length - 1) {
|
||||
fail("Missing expected match");
|
||||
}
|
||||
}
|
||||
|
||||
static class TermMatch {
|
||||
|
||||
public final int position;
|
||||
|
||||
public final int startOffset;
|
||||
|
||||
public final int endOffset;
|
||||
|
||||
public TermMatch(PostingsEnum pe, int position) throws IOException {
|
||||
this.position = position;
|
||||
this.startOffset = pe.startOffset();
|
||||
this.endOffset = pe.endOffset();
|
||||
}
|
||||
|
||||
public TermMatch(int position, int startOffset, int endOffset) {
|
||||
this.position = position;
|
||||
this.startOffset = startOffset;
|
||||
this.endOffset = endOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
TermMatch termMatch = (TermMatch) o;
|
||||
return position == termMatch.position &&
|
||||
startOffset == termMatch.startOffset &&
|
||||
endOffset == termMatch.endOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(position, startOffset, endOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return position + "[" + startOffset + "->" + endOffset + "]";
|
||||
}
|
||||
}
|
||||
|
||||
public void testTermQuery() throws IOException {
|
||||
Query q = new TermQuery(new Term(FIELD_WITH_OFFSETS, "w1"));
|
||||
Term t = new Term(FIELD_WITH_OFFSETS, "w1");
|
||||
Query q = new TermQuery(t);
|
||||
checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
|
||||
{ 0, 0, 0, 0, 2 },
|
||||
{ 1, 0, 0, 0, 2 },
|
||||
|
@ -157,6 +274,14 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
{ 3, 0, 0, 0, 2, 2, 2, 6, 8 },
|
||||
{ 4 }
|
||||
});
|
||||
checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
|
||||
{ { new TermMatch(0, 0, 2) } },
|
||||
{ { new TermMatch(0, 0, 2) } },
|
||||
{ { new TermMatch(0, 0, 2) } },
|
||||
{ { new TermMatch(0, 0, 2) }, { new TermMatch(2, 6, 8) } },
|
||||
{}
|
||||
});
|
||||
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0, 0 });
|
||||
}
|
||||
|
||||
public void testTermQueryNoStoredOffsets() throws IOException {
|
||||
|
@ -191,6 +316,7 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
|
||||
{ 4 }
|
||||
});
|
||||
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 1, 2, 0, 0 });
|
||||
}
|
||||
|
||||
public void testDisjunctionNoPositions() throws IOException {
|
||||
|
@ -215,6 +341,7 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
|
||||
{ 4 }
|
||||
});
|
||||
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 0, 2, 0, 0 });
|
||||
}
|
||||
|
||||
public void testReqOptNoPositions() throws IOException {
|
||||
|
@ -248,6 +375,7 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 3, 3, 9, 11, 5, 5, 15, 17 },
|
||||
{ 4 }
|
||||
});
|
||||
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 3, 1, 3, 3, 0, 0 });
|
||||
}
|
||||
|
||||
public void testMinShouldMatchNoPositions() throws IOException {
|
||||
|
@ -331,6 +459,7 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
{ 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 },
|
||||
{ 4 }
|
||||
});
|
||||
checkLabelCount(rq, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 2, 2, 0 });
|
||||
|
||||
}
|
||||
|
||||
|
@ -392,12 +521,55 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
// 0 1 2 3 4 5 6 7
|
||||
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
|
||||
|
||||
public void testSloppyPhraseQueryWithRepeats() throws IOException {
|
||||
Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
|
||||
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
|
||||
PhraseQuery pq = new PhraseQuery(10, FIELD_WITH_OFFSETS, "phrase", "sentence", "sentence");
|
||||
checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
|
||||
{ 0 }, { 1 }, { 2 }, { 3 },
|
||||
{ 4, 1, 6, 2, 43, 2, 11, 9, 75, 5, 11, 28, 75, 6, 11, 35, 75 }
|
||||
});
|
||||
checkLabelCount(pq, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
|
||||
checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
|
||||
{}, {}, {}, {},
|
||||
{ {
|
||||
new TermMatch(1, 2, 8),
|
||||
new TermMatch(2, 9, 17),
|
||||
new TermMatch(6, 35, 43)
|
||||
}, {
|
||||
new TermMatch(5, 28, 34),
|
||||
new TermMatch(2, 9, 17),
|
||||
new TermMatch(11, 67, 75)
|
||||
}, {
|
||||
new TermMatch(5, 28, 34),
|
||||
new TermMatch(6, 35, 43),
|
||||
new TermMatch(11, 67, 75)
|
||||
}, {
|
||||
new TermMatch(10, 60, 66),
|
||||
new TermMatch(6, 35, 43),
|
||||
new TermMatch(11, 67, 75)
|
||||
} }
|
||||
});
|
||||
}
|
||||
|
||||
public void testSloppyPhraseQuery() throws IOException {
|
||||
Term a = new Term(FIELD_WITH_OFFSETS, "a");
|
||||
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
|
||||
PhraseQuery pq = new PhraseQuery(4, FIELD_WITH_OFFSETS, "a", "sentence");
|
||||
checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
|
||||
{ 0 }, { 1 }, { 2 }, { 3 },
|
||||
{ 4, 0, 2, 0, 17, 6, 9, 35, 59, 9, 11, 58, 75 }
|
||||
});
|
||||
checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
|
||||
{}, {}, {}, {},
|
||||
{ {
|
||||
new TermMatch(0, 0, 1), new TermMatch(2, 9, 17)
|
||||
}, {
|
||||
new TermMatch(9, 58, 59), new TermMatch(6, 35, 43)
|
||||
}, {
|
||||
new TermMatch(9, 58, 59), new TermMatch(11, 67, 75)
|
||||
} }
|
||||
});
|
||||
}
|
||||
|
||||
public void testExactPhraseQuery() throws IOException {
|
||||
|
@ -407,28 +579,57 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
{ 4, 1, 2, 2, 17, 5, 6, 28, 43, 10, 11, 60, 75 }
|
||||
});
|
||||
|
||||
Term a = new Term(FIELD_WITH_OFFSETS, "a");
|
||||
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
|
||||
PhraseQuery pq2 = new PhraseQuery.Builder()
|
||||
.add(new Term(FIELD_WITH_OFFSETS, "a"))
|
||||
.add(new Term(FIELD_WITH_OFFSETS, "sentence"), 2)
|
||||
.add(a)
|
||||
.add(s, 2)
|
||||
.build();
|
||||
checkMatches(pq2, FIELD_WITH_OFFSETS, new int[][]{
|
||||
{ 0 }, { 1 }, { 2 }, { 3 },
|
||||
{ 4, 0, 2, 0, 17, 9, 11, 58, 75 }
|
||||
});
|
||||
checkTermMatches(pq2, FIELD_WITH_OFFSETS, new TermMatch[][][]{
|
||||
{}, {}, {}, {},
|
||||
{ {
|
||||
new TermMatch(0, 0, 1), new TermMatch(2, 9, 17)
|
||||
}, {
|
||||
new TermMatch(9, 58, 59), new TermMatch(11, 67, 75)
|
||||
} }
|
||||
});
|
||||
}
|
||||
|
||||
// 0 1 2 3 4 5 6 7
|
||||
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
|
||||
|
||||
public void testSloppyMultiPhraseQuery() throws IOException {
|
||||
Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
|
||||
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
|
||||
Term i = new Term(FIELD_WITH_OFFSETS, "iterations");
|
||||
MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
|
||||
.add(new Term(FIELD_WITH_OFFSETS, "phrase"))
|
||||
.add(new Term[]{ new Term(FIELD_WITH_OFFSETS, "sentence"), new Term(FIELD_WITH_OFFSETS, "iterations") })
|
||||
.add(p)
|
||||
.add(new Term[]{ s, i })
|
||||
.setSlop(4)
|
||||
.build();
|
||||
checkMatches(mpq, FIELD_WITH_OFFSETS, new int[][]{
|
||||
{ 0 }, { 1 }, { 2 }, { 3 },
|
||||
{ 4, 1, 2, 2, 17, 5, 7, 28, 54, 5, 7, 28, 54, 10, 11, 60, 75 }
|
||||
{ 4, 1, 2, 2, 17, 5, 6, 28, 43, 5, 7, 28, 54, 10, 11, 60, 75 }
|
||||
});
|
||||
checkTermMatches(mpq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
|
||||
{}, {}, {}, {},
|
||||
{ {
|
||||
new TermMatch(1, 2, 8),
|
||||
new TermMatch(2, 9, 17)
|
||||
}, {
|
||||
new TermMatch(5, 28, 34),
|
||||
new TermMatch(6, 35, 43)
|
||||
}, {
|
||||
new TermMatch(5, 28, 34),
|
||||
new TermMatch(7, 44, 54)
|
||||
}, {
|
||||
new TermMatch(10, 60, 66),
|
||||
new TermMatch(11, 67, 75)
|
||||
} }
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -452,4 +653,35 @@ public class TestMatchesIterator extends LuceneTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
// 0 1 2 3 4 5 6 7
|
||||
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
|
||||
|
||||
public void testSpanQuery() throws IOException {
|
||||
SpanQuery subq = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
|
||||
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "with")))
|
||||
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "many")))
|
||||
.build();
|
||||
Query q = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
|
||||
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "sentence")))
|
||||
.addClause(new SpanOrQuery(subq, new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "iterations"))))
|
||||
.build();
|
||||
checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
|
||||
{ 0 }, { 1 }, { 2 }, { 3 },
|
||||
{ 4, 2, 4, 9, 27, 6, 7, 35, 54 }
|
||||
});
|
||||
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
|
||||
checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
|
||||
{}, {}, {}, {},
|
||||
{
|
||||
{
|
||||
new TermMatch(2, 9, 17),
|
||||
new TermMatch(3, 18, 22),
|
||||
new TermMatch(4, 23, 27)
|
||||
}, {
|
||||
new TermMatch(6, 35, 43), new TermMatch(7, 44, 54)
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -67,4 +67,15 @@ class AssertingMatchesIterator implements MatchesIterator {
|
|||
return in.endOffset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchesIterator getSubMatches() throws IOException {
|
||||
assert state == State.ITERATING : state;
|
||||
return in.getSubMatches();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object label() {
|
||||
assert state == State.ITERATING : state;
|
||||
return in.label();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue