LUCENE-8306: Allow iteration over submatches

Also includes LUCENE-8404, adding match iteration to SpanQuery
Alan Woodward 2018-07-22 22:38:55 +01:00
parent 995a902d1a
commit 028c86b1fa
16 changed files with 464 additions and 54 deletions

DisjunctionMatchesIterator.java

@@ -45,14 +45,14 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
*
* Only terms that have at least one match in the given document will be included
*/
static MatchesIterator fromTerms(LeafReaderContext context, int doc, String field, List<Term> terms) throws IOException {
static MatchesIterator fromTerms(LeafReaderContext context, int doc, Query query, String field, List<Term> terms) throws IOException {
Objects.requireNonNull(field);
for (Term term : terms) {
if (Objects.equals(field, term.field()) == false) {
throw new IllegalArgumentException("Tried to generate iterator from terms in multiple fields: expected [" + field + "] but got [" + term.field() + "]");
}
}
return fromTermsEnum(context, doc, field, asBytesRefIterator(terms));
return fromTermsEnum(context, doc, query, field, asBytesRefIterator(terms));
}
private static BytesRefIterator asBytesRefIterator(List<Term> terms) {
@@ -72,7 +72,7 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
*
* Only terms that have at least one match in the given document will be included
*/
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, String field, BytesRefIterator terms) throws IOException {
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
Objects.requireNonNull(field);
List<MatchesIterator> mis = new ArrayList<>();
Terms t = context.reader().terms(field);
@@ -84,7 +84,7 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
if (te.seekExact(term)) {
PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
if (pe.advance(doc) == doc) {
mis.add(new TermMatchesIterator(pe));
mis.add(new TermMatchesIterator(query, pe));
reuse = null;
}
else {
@@ -158,4 +158,13 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
return queue.top().endOffset();
}
@Override
public MatchesIterator getSubMatches() throws IOException {
return queue.top().getSubMatches();
}
@Override
public Query getQuery() {
return queue.top().getQuery();
}
}
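
With the originating query now threaded through to each per-term iterator, a disjunction's matches can report which clause produced them: both new methods simply delegate to the sub-iterator at the front of the queue, i.e. the one positioned on the current match. A minimal consumer sketch (assuming `it` is a MatchesIterator obtained from a disjunction's Matches.getMatches(field)):

while (it.next()) {
  // getQuery() names the term query behind the current match; term
  // matches are leaves, so getSubMatches() here would return null
  System.out.println(it.getQuery() + " matched at position " + it.startPosition());
}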

DocValuesRewriteMethod.java

@@ -78,7 +78,7 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
@Override
public Matches matches(LeafReaderContext context, int doc) throws IOException {
final SortedSetDocValues fcsi = DocValues.getSortedSet(context.reader(), query.field);
return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query.field, getTermsEnum(fcsi)));
return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query, query.field, getTermsEnum(fcsi)));
}
private TermsEnum getTermsEnum(SortedSetDocValues fcsi) throws IOException {

MatchesIterator.java

@@ -28,6 +28,9 @@ import org.apache.lucene.index.LeafReaderContext;
* positions and/or offsets after each call. You should not call the position or offset methods
* before {@link #next()} has been called, or after {@link #next()} has returned {@code false}.
*
* Matches from some queries may span multiple positions. You can retrieve the positions of
* individual matching terms on the current match by calling {@link #getSubMatches()}.
*
* Matches are ordered by start position, and then by end position. Match intervals may overlap.
*
* @see Weight#matches(LeafReaderContext, int)
@@ -70,4 +73,25 @@ public interface MatchesIterator {
*/
int endOffset() throws IOException;
/**
* Returns a MatchesIterator that iterates over the positions and offsets of individual
* terms within the current match
*
* Returns {@code null} if there are no submatches (i.e. the current iterator is at the
* leaf level)
*
* Should only be called after {@link #next()} has returned {@code true}
*/
MatchesIterator getSubMatches() throws IOException;
/**
* Returns the Query causing the current match
*
* If this {@link MatchesIterator} has been returned from a {@link #getSubMatches()}
* call, then returns a {@link TermQuery} equivalent to the current match
*
* Should only be called after {@link #next()} has returned {@code true}
*/
Query getQuery();
}
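
Taken together, the two new methods support a recursive walk over a match tree. A minimal sketch, mirroring the test helpers later in this commit (assumes a searcher `searcher`, a query `q`, a leaf context `ctx`, a leaf-relative `doc`, and a `field` name are in scope):

Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
Matches matches = w.matches(ctx, doc);
if (matches != null) {
  MatchesIterator it = matches.getMatches(field);
  while (it != null && it.next()) {
    System.out.println(it.getQuery() + " matched " + it.startPosition() + "->" + it.endPosition());
    MatchesIterator sub = it.getSubMatches();
    if (sub != null) { // null: the current match is already a leaf
      while (sub.next()) {
        System.out.println("  term at " + sub.startPosition()
            + " [" + sub.startOffset() + "->" + sub.endOffset() + "]");
      }
    }
  }
}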

MultiPhraseQuery.java

@@ -269,7 +269,7 @@ public class MultiPhraseQuery extends Query {
TermState termState = termStates.get(term).get(context);
if (termState != null) {
termsEnum.seekExact(term.bytes(), termState);
postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS));
postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
}
}
@@ -294,7 +294,7 @@ public class MultiPhraseQuery extends Query {
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
}
}
@@ -647,5 +647,6 @@ public class MultiPhraseQuery extends Query {
public BytesRef getPayload() throws IOException {
return posQueue.top().pe.getPayload();
}
}
}

MultiTermQueryConstantScoreWrapper.java

@@ -211,7 +211,7 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
if (terms.hasPositions() == false) {
return super.matches(context, doc);
}
return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query.field, query.getTermsEnum(terms)));
return Matches.forField(query.field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, query, query.field, query.getTermsEnum(terms)));
}
@Override

PhraseMatcher.java

@@ -88,4 +88,5 @@ abstract class PhraseMatcher {
public float getMatchCost() {
return matchCost;
}
}

PhraseQuery.java

@@ -446,7 +446,7 @@ public class PhraseQuery extends Query {
return null;
}
te.seekExact(t.bytes(), state);
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS);
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t);
totalMatchCost += termPositionsCost(te);
}
@@ -457,7 +457,7 @@ public class PhraseQuery extends Query {
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
}
}

PhraseWeight.java

@@ -123,6 +123,16 @@ abstract class PhraseWeight extends Weight {
public int endOffset() throws IOException {
return matcher.endOffset();
}
@Override
public MatchesIterator getSubMatches() throws IOException {
return null; // phrases are treated as leaves
}
@Override
public Query getQuery() {
return PhraseWeight.this.getQuery();
}
};
});
}
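
A phrase match therefore deliberately reports no submatches: consumers see the whole phrase as a single unit, with getQuery() returning the phrase query itself. A minimal check in the spirit of the assertIsLeafMatch test helper below (assumes `mi` iterates a PhraseQuery's matches):

while (mi.next()) {
  assert mi.getSubMatches() == null; // phrases are leaves
  System.out.println(mi.getQuery()); // the whole PhraseQuery
}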

SloppyPhraseMatcher.java

@@ -54,13 +54,14 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private final int slop;
private final int numPostings;
private final PhraseQueue pq; // for advancing min position
private final boolean captureLeadMatch;
private int end; // current largest phrase position
private int leadPosition;
private int leadOffset;
private int currentEndPostings;
private int advanceEndPostings;
private int leadEndOffset;
private int leadOrd;
private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
@@ -71,10 +72,11 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private boolean positioned;
private int matchLength;
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost) {
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) {
super(approximation(postings), matchCost);
this.slop = slop;
this.numPostings = postings.length;
this.captureLeadMatch = captureLeadMatch;
pq = new PhraseQueue(postings.length);
phrasePositions = new PhrasePositions[postings.length];
for (int i = 0; i < postings.length; ++i) {
@@ -120,10 +122,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
return false;
}
PhrasePositions pp = pq.pop();
assert pp != null; // if the pq is empty, then positioned == false
leadPosition = pp.position + pp.offset;
leadOffset = pp.postings.startOffset();
currentEndPostings = advanceEndPostings;
assert pp != null; // if the pq is not full, then positioned == false
captureLead(pp);
matchLength = end - pp.position;
int next = pq.top().position;
while (advancePP(pp)) {
@@ -137,6 +137,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
}
pp = pq.pop();
next = pq.top().position;
assert pp != null; // if the pq is not full, then positioned == false
matchLength = end - pp.position;
} else {
int matchLength2 = end - pp.position;
@@ -144,14 +145,22 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
matchLength = matchLength2;
}
}
leadPosition = pp.position + pp.offset;
leadOffset = pp.postings.startOffset();
currentEndPostings = advanceEndPostings;
captureLead(pp);
}
positioned = false;
return matchLength <= slop;
}
private void captureLead(PhrasePositions pp) throws IOException {
if (captureLeadMatch == false) {
return;
}
leadOrd = pp.ord;
leadPosition = pp.position + pp.offset;
leadOffset = pp.postings.startOffset();
leadEndOffset = pp.postings.endOffset();
}
@Override
public int startPosition() {
// when a match is detected, the top postings is advanced until it has moved
@@ -160,6 +169,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
// However, the priority queue doesn't guarantee that the top postings is in fact the
// earliest in the list, so we need to cycle through all terms to check.
// this is slow, but Matches is slow anyway...
int leadPosition = this.leadPosition;
for (PhrasePositions pp : phrasePositions) {
leadPosition = Math.min(leadPosition, pp.position + pp.offset);
}
@@ -168,7 +178,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
@Override
public int endPosition() {
return phrasePositions[currentEndPostings].position + phrasePositions[currentEndPostings].offset;
int endPosition = leadPosition;
for (PhrasePositions pp : phrasePositions) {
if (pp.ord != leadOrd) {
endPosition = Math.max(endPosition, pp.position + pp.offset);
}
}
return endPosition;
}
@Override
@@ -179,6 +195,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
// However, the priority queue doesn't guarantee that the top postings is in fact the
// earliest in the list, so we need to cycle through all terms to check
// this is slow, but Matches is slow anyway...
int leadOffset = this.leadOffset;
for (PhrasePositions pp : phrasePositions) {
leadOffset = Math.min(leadOffset, pp.postings.startOffset());
}
@@ -187,7 +204,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
@Override
public int endOffset() throws IOException {
return phrasePositions[currentEndPostings].postings.endOffset();
int endOffset = leadEndOffset;
for (PhrasePositions pp : phrasePositions) {
if (pp.ord != leadOrd) {
endOffset = Math.max(endOffset, pp.postings.endOffset());
}
}
return endOffset;
}
/** advance a PhrasePosition and update 'end', return false if exhausted */
@@ -197,12 +220,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
}
if (pp.position > end) {
end = pp.position;
advanceEndPostings = pp.ord;
}
if (pp.position == end) {
if (pp.ord > advanceEndPostings) {
advanceEndPostings = pp.ord;
}
}
return true;
}
@@ -307,12 +324,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
pp.firstPosition();
if (pp.position > end) {
end = pp.position;
advanceEndPostings = pp.ord;
}
if (pp.position == end) {
if (pp.ord > advanceEndPostings) {
advanceEndPostings = pp.ord;
}
}
pq.add(pp);
}
@@ -342,12 +353,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max
if (pp.position > end) {
end = pp.position;
advanceEndPostings = pp.ord;
}
if (pp.position == end) {
if (pp.ord > advanceEndPostings) {
advanceEndPostings = pp.ord;
}
}
pq.add(pp);
}

SynonymQuery.java

@@ -176,7 +176,7 @@ public final class SynonymQuery extends Query {
if (terms == null || terms.hasPositions() == false) {
return super.matches(context, doc);
}
return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTerms(context, doc, field, Arrays.asList(SynonymQuery.this.terms)));
return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTerms(context, doc, getQuery(), field, Arrays.asList(SynonymQuery.this.terms)));
}
@Override

TermInSetQuery.java

@@ -226,7 +226,7 @@ public class TermInSetQuery extends Query implements Accountable {
if (terms == null || terms.hasPositions() == false) {
return super.matches(context, doc);
}
return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, field, termData.iterator()));
return Matches.forField(field, () -> DisjunctionMatchesIterator.fromTermsEnum(context, doc, getQuery(), field, termData.iterator()));
}
/**

TermMatchesIterator.java

@@ -29,12 +29,14 @@ class TermMatchesIterator implements MatchesIterator {
private int upto;
private int pos;
private final PostingsEnum pe;
private final Query query;
/**
* Create a new {@link TermMatchesIterator} for the given term and postings list
*/
TermMatchesIterator(PostingsEnum pe) throws IOException {
TermMatchesIterator(Query query, PostingsEnum pe) throws IOException {
this.pe = pe;
this.query = query;
this.upto = pe.freq();
}
@@ -67,4 +69,13 @@ class TermMatchesIterator implements MatchesIterator {
return pe.endOffset();
}
@Override
public MatchesIterator getSubMatches() throws IOException {
return null;
}
@Override
public Query getQuery() {
return query;
}
}

TermQuery.java

@@ -94,7 +94,7 @@ public class TermQuery extends Query {
if (pe.advance(doc) != doc) {
return null;
}
return new TermMatchesIterator(pe);
return new TermMatchesIterator(getQuery(), pe);
});
}

SpanWeight.java

@@ -18,6 +18,8 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
@@ -28,6 +30,10 @@ import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
@@ -161,4 +167,138 @@ public abstract class SpanWeight extends Weight {
return Explanation.noMatch("no matching term");
}
private static class TermMatch {
Term term;
int position;
int startOffset;
int endOffset;
}
@Override
public Matches matches(LeafReaderContext context, int doc) throws IOException {
return Matches.forField(field, () -> {
Spans spans = getSpans(context, Postings.OFFSETS);
if (spans == null || spans.advance(doc) != doc) {
return null;
}
return new MatchesIterator() {
int innerTermCount = 0;
TermMatch[] innerTerms = new TermMatch[0];
SpanCollector termCollector = new SpanCollector() {
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
innerTermCount++;
if (innerTermCount > innerTerms.length) {
TermMatch[] temp = new TermMatch[innerTermCount];
System.arraycopy(innerTerms, 0, temp, 0, innerTermCount - 1);
innerTerms = temp;
innerTerms[innerTermCount - 1] = new TermMatch();
}
innerTerms[innerTermCount - 1].term = term;
innerTerms[innerTermCount - 1].position = position;
innerTerms[innerTermCount - 1].startOffset = postings.startOffset();
innerTerms[innerTermCount - 1].endOffset = postings.endOffset();
}
@Override
public void reset() {
innerTermCount = 0;
}
};
@Override
public boolean next() throws IOException {
innerTermCount = 0;
return spans.nextStartPosition() != Spans.NO_MORE_POSITIONS;
}
@Override
public int startPosition() {
return spans.startPosition();
}
@Override
public int endPosition() {
return spans.endPosition() - 1;
}
@Override
public int startOffset() throws IOException {
if (innerTermCount == 0) {
collectInnerTerms();
}
return innerTerms[0].startOffset;
}
@Override
public int endOffset() throws IOException {
if (innerTermCount == 0) {
collectInnerTerms();
}
return innerTerms[innerTermCount - 1].endOffset;
}
@Override
public MatchesIterator getSubMatches() throws IOException {
if (innerTermCount == 0) {
collectInnerTerms();
}
return new MatchesIterator() {
int upto = -1;
@Override
public boolean next() throws IOException {
upto++;
return upto < innerTermCount;
}
@Override
public int startPosition() {
return innerTerms[upto].position;
}
@Override
public int endPosition() {
return innerTerms[upto].position;
}
@Override
public int startOffset() throws IOException {
return innerTerms[upto].startOffset;
}
@Override
public int endOffset() throws IOException {
return innerTerms[upto].endOffset;
}
@Override
public MatchesIterator getSubMatches() throws IOException {
return null;
}
@Override
public Query getQuery() {
return new TermQuery(innerTerms[upto].term);
}
};
}
@Override
public Query getQuery() {
return SpanWeight.this.getQuery();
}
void collectInnerTerms() throws IOException {
termCollector.reset();
spans.collect(termCollector);
Arrays.sort(innerTerms, 0, innerTermCount, Comparator.comparing(a -> a.position));
}
};
});
}
}
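
The anonymous iterator above relies on the existing SpanCollector contract from the spans package: collectLeaf is invoked once per term occurrence contributing to the current span, and reset clears state between matches. For illustration, a standalone collector that just prints each leaf occurrence (a hypothetical helper, sketch only):

import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanCollector;

class PrintingSpanCollector implements SpanCollector {
  @Override
  public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
    // one call per term occurrence inside the current span
    System.out.println(term + " @ " + position
        + " [" + postings.startOffset() + "->" + postings.endOffset() + "]");
  }
  @Override
  public void reset() {
    // nothing to clear between matches in this sketch
  }
}

A positioned Spans hands every contributing occurrence to collect(collector); that is exactly how getSubMatches() above fills innerTerms before sorting them by position.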

TestMatchesIterator.java

@@ -18,8 +18,12 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -29,9 +33,14 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@@ -99,7 +108,7 @@ public class TestMatchesIterator extends LuceneTestCase {
"nothing matches this document"
};
void checkMatches(Query q, String field, int[][] expected) throws IOException {
private void checkMatches(Query q, String field, int[][] expected) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < expected.length; i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(expected[i][0], searcher.leafContexts));
@@ -112,14 +121,40 @@ public class TestMatchesIterator extends LuceneTestCase {
MatchesIterator it = matches.getMatches(field);
if (expected[i].length == 1) {
assertNull(it);
return;
continue;
}
checkFieldMatches(it, expected[i]);
checkFieldMatches(matches.getMatches(field), expected[i]); // test multiple calls
}
}
void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
private void checkLabelCount(Query q, String field, int[] expected) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < expected.length; i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
int doc = i - ctx.docBase;
Matches matches = w.matches(ctx, doc);
if (matches == null) {
assertEquals("Expected to get matches on document " + i, 0, expected[i]);
continue;
}
MatchesIterator it = matches.getMatches(field);
if (expected[i] == 0) {
assertNull(it);
continue;
}
else {
assertNotNull(it);
}
IdentityHashMap<Query, Integer> labels = new IdentityHashMap<>();
while (it.next()) {
labels.put(it.getQuery(), 1);
}
assertEquals(expected[i], labels.size());
}
}
private void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
int pos = 1;
while (it.next()) {
//System.out.println(expected[i][pos] + "->" + expected[i][pos + 1] + "[" + expected[i][pos + 2] + "->" + expected[i][pos + 3] + "]");
@@ -132,7 +167,7 @@ public class TestMatchesIterator extends LuceneTestCase {
assertEquals(expected.length, pos);
}
void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
private void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < expected.length; i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
@@ -148,8 +183,109 @@ public class TestMatchesIterator extends LuceneTestCase {
}
}
private void assertIsLeafMatch(Query q, String field) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < searcher.reader.maxDoc(); i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
int doc = i - ctx.docBase;
Matches matches = w.matches(ctx, doc);
if (matches == null) {
continue;
}
MatchesIterator mi = matches.getMatches(field);
if (mi == null) {
continue;
}
while (mi.next()) {
assertNull(mi.getSubMatches());
}
}
}
private void checkTermMatches(Query q, String field, TermMatch[][][] expected) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < expected.length; i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
int doc = i - ctx.docBase;
Matches matches = w.matches(ctx, doc);
if (matches == null) {
assertEquals(expected[i].length, 0);
continue;
}
MatchesIterator it = matches.getMatches(field);
if (expected[i].length == 0) {
assertNull(it);
continue;
}
checkTerms(expected[i], it);
}
}
private void checkTerms(TermMatch[][] expected, MatchesIterator it) throws IOException {
int upTo = 0;
while (it.next()) {
Set<TermMatch> expectedMatches = new HashSet<>(Arrays.asList(expected[upTo]));
MatchesIterator submatches = it.getSubMatches();
while (submatches.next()) {
TermMatch tm = new TermMatch(submatches.startPosition(), submatches.startOffset(), submatches.endOffset());
if (expectedMatches.remove(tm) == false) {
fail("Unexpected term match: " + tm);
}
}
if (expectedMatches.size() != 0) {
fail("Missing term matches: " + expectedMatches.stream().map(Object::toString).collect(Collectors.joining(", ")));
}
upTo++;
}
if (upTo < expected.length) {
fail("Missing expected match");
}
}
static class TermMatch {
public final int position;
public final int startOffset;
public final int endOffset;
public TermMatch(PostingsEnum pe, int position) throws IOException {
this.position = position;
this.startOffset = pe.startOffset();
this.endOffset = pe.endOffset();
}
public TermMatch(int position, int startOffset, int endOffset) {
this.position = position;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TermMatch termMatch = (TermMatch) o;
return position == termMatch.position &&
startOffset == termMatch.startOffset &&
endOffset == termMatch.endOffset;
}
@Override
public int hashCode() {
return Objects.hash(position, startOffset, endOffset);
}
@Override
public String toString() {
return position + "[" + startOffset + "->" + endOffset + "]";
}
}
public void testTermQuery() throws IOException {
Query q = new TermQuery(new Term(FIELD_WITH_OFFSETS, "w1"));
Term t = new Term(FIELD_WITH_OFFSETS, "w1");
Query q = new TermQuery(t);
checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
{ 0, 0, 0, 0, 2 },
{ 1, 0, 0, 0, 2 },
@@ -157,6 +293,8 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8 },
{ 4 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0, 0 });
assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
}
public void testTermQueryNoStoredOffsets() throws IOException {
@@ -191,6 +329,8 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
{ 4 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 1, 2, 0, 0 });
assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
}
public void testDisjunctionNoPositions() throws IOException {
@@ -215,6 +355,7 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
{ 4 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 0, 2, 0, 0 });
}
public void testReqOptNoPositions() throws IOException {
@@ -248,6 +389,8 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 3, 3, 9, 11, 5, 5, 15, 17 },
{ 4 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 3, 1, 3, 3, 0, 0 });
assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
}
public void testMinShouldMatchNoPositions() throws IOException {
@@ -331,6 +474,8 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 },
{ 4 }
});
checkLabelCount(rq, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0 });
assertIsLeafMatch(rq, FIELD_WITH_OFFSETS);
}
@@ -357,6 +502,7 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 },
{ 4 }
});
assertIsLeafMatch(q, FIELD_WITH_OFFSETS);
}
public void testSynonymQueryNoPositions() throws IOException {
@@ -392,12 +538,25 @@ public class TestMatchesIterator extends LuceneTestCase {
// 0 1 2 3 4 5 6 7
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
public void testSloppyPhraseQueryWithRepeats() throws IOException {
Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
PhraseQuery pq = new PhraseQuery(10, FIELD_WITH_OFFSETS, "phrase", "sentence", "sentence");
checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 1, 6, 2, 43, 2, 11, 9, 75, 5, 11, 28, 75, 6, 11, 35, 75 }
});
checkLabelCount(pq, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
assertIsLeafMatch(pq, FIELD_WITH_OFFSETS);
}
public void testSloppyPhraseQuery() throws IOException {
PhraseQuery pq = new PhraseQuery(4, FIELD_WITH_OFFSETS, "a", "sentence");
checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 0, 2, 0, 17, 6, 9, 35, 59, 9, 11, 58, 75 }
});
assertIsLeafMatch(pq, FIELD_WITH_OFFSETS);
}
public void testExactPhraseQuery() throws IOException {
@@ -407,29 +566,36 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 4, 1, 2, 2, 17, 5, 6, 28, 43, 10, 11, 60, 75 }
});
Term a = new Term(FIELD_WITH_OFFSETS, "a");
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
PhraseQuery pq2 = new PhraseQuery.Builder()
.add(new Term(FIELD_WITH_OFFSETS, "a"))
.add(new Term(FIELD_WITH_OFFSETS, "sentence"), 2)
.add(a)
.add(s, 2)
.build();
checkMatches(pq2, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 0, 2, 0, 17, 9, 11, 58, 75 }
});
assertIsLeafMatch(pq2, FIELD_WITH_OFFSETS);
}
// 0 1 2 3 4 5 6 7
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
public void testSloppyMultiPhraseQuery() throws IOException {
Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
Term i = new Term(FIELD_WITH_OFFSETS, "iterations");
MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
.add(new Term(FIELD_WITH_OFFSETS, "phrase"))
.add(new Term[]{ new Term(FIELD_WITH_OFFSETS, "sentence"), new Term(FIELD_WITH_OFFSETS, "iterations") })
.add(p)
.add(new Term[]{ s, i })
.setSlop(4)
.build();
checkMatches(mpq, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 1, 2, 2, 17, 5, 7, 28, 54, 5, 7, 28, 54, 10, 11, 60, 75 }
{ 4, 1, 2, 2, 17, 5, 6, 28, 43, 5, 7, 28, 54, 10, 11, 60, 75 }
});
assertIsLeafMatch(mpq, FIELD_WITH_OFFSETS);
}
public void testExactMultiPhraseQuery() throws IOException {
@@ -450,6 +616,38 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 0, 1, 0, 8, 4, 5, 23, 34, 9, 10, 58, 66 }
});
assertIsLeafMatch(mpq2, FIELD_WITH_OFFSETS);
}
// 0 1 2 3 4 5 6 7
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
public void testSpanQuery() throws IOException {
SpanQuery subq = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "with")))
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "many")))
.build();
Query q = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "sentence")))
.addClause(new SpanOrQuery(subq, new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "iterations"))))
.build();
checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 2, 4, 9, 27, 6, 7, 35, 54 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
{}, {}, {}, {},
{
{
new TermMatch(2, 9, 17),
new TermMatch(3, 18, 22),
new TermMatch(4, 23, 27)
}, {
new TermMatch(6, 35, 43), new TermMatch(7, 44, 54)
}
}
});
}
}

AssertingMatchesIterator.java

@@ -67,4 +67,15 @@ class AssertingMatchesIterator implements MatchesIterator {
return in.endOffset();
}
@Override
public MatchesIterator getSubMatches() throws IOException {
assert state == State.ITERATING : state;
return in.getSubMatches();
}
@Override
public Query getQuery() {
assert state == State.ITERATING : state;
return in.getQuery();
}
}