LUCENE-8306: Allow iteration over submatches

This commit is contained in:
Alan Woodward 2018-07-22 20:59:50 +01:00
parent 2826a9550b
commit a8839b7eab
13 changed files with 656 additions and 44 deletions

View File

@ -198,6 +198,9 @@ Improvements
* LUCENE-8345, GitHub PR #392: Remove instantiation of redundant wrapper classes for primitives;
add wrapper class constructors to forbiddenapis. (Michael Braun via Uwe Schindler)
* LUCENE-8306: Matches API now allows iteration over sub-matches in Spans (Alan Woodward,
Jim Ferenczi, David Smiley)
Other:
* LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's

View File

@ -158,4 +158,13 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
return queue.top().endOffset();
}
// Sub-matches are delegated to whichever child iterator is currently
// positioned first (the top of the priority queue).
@Override
public MatchesIterator getSubMatches() throws IOException {
return queue.top().getSubMatches();
}
// Label of the leaf query behind the current match.
// NOTE(review): the returned label can change between calls as the queue
// advances past each sub-iterator.
@Override
public Object label() {
return queue.top().label();
}
}

View File

@ -149,4 +149,48 @@ final class ExactPhraseMatcher extends PhraseMatcher {
return postings[postings.length - 1].postings.endOffset();
}
@Override
MatchesIterator getSubMatches() {
  // Expose each term of the exact phrase as its own single-position match,
  // walking the postings array in phrase order.
  return new MatchesIterator() {

    private int current = -1;

    @Override
    public boolean next() throws IOException {
      current++;
      return current < postings.length;
    }

    @Override
    public int startPosition() {
      return postings[current].pos;
    }

    @Override
    public int endPosition() {
      // a sub-match is a single term, so its end position equals its start
      return postings[current].pos;
    }

    @Override
    public int startOffset() throws IOException {
      return postings[current].postings.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return postings[current].postings.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() throws IOException {
      // terms are the leaves: no further decomposition
      return MatchesIterator.EMPTY_ITERATOR;
    }

    @Override
    public Object label() {
      return this;
    }
  };
}
}

View File

@ -28,6 +28,9 @@ import org.apache.lucene.index.LeafReaderContext;
* positions and/or offsets after each call. You should not call the position or offset methods
* before {@link #next()} has been called, or after {@link #next()} has returned {@code false}.
*
* Matches from some queries may span multiple positions. You can retrieve the positions of
* individual matching terms on the current match by calling {@link #getSubMatches()}.
*
* Matches are ordered by start position, and then by end position. Match intervals may overlap.
*
* @see Weight#matches(LeafReaderContext, int)
@ -70,4 +73,59 @@ public interface MatchesIterator {
*/
int endOffset() throws IOException;
/**
* Returns a MatchesIterator that iterates over the positions and offsets of individual
* terms within the current match
*
* Should only be called after {@link #next()} has returned {@code true}
*/
MatchesIterator getSubMatches() throws IOException;
/**
* Returns a label identifying the leaf query causing the current match
*
* Should only be called after {@link #next()} has returned {@code true}
*/
Object label();
/**
* A MatchesIterator that is immediately exhausted: {@link #next()} always
* returns {@code false}, so no position or offset accessor may ever be
* legally called on it.
*/
MatchesIterator EMPTY_ITERATOR = new MatchesIterator() {
@Override
public boolean next() throws IOException {
return false;
}
// All position/offset accessors throw, since next() can never return true
// and the iterator is therefore never positioned on a match.
@Override
public int startPosition() {
throw new UnsupportedOperationException();
}
@Override
public int endPosition() {
throw new UnsupportedOperationException();
}
@Override
public int startOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int endOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public MatchesIterator getSubMatches() throws IOException {
throw new UnsupportedOperationException();
}
// label() is safe to call and identifies this singleton itself
@Override
public Object label() {
return this;
}
};
}

View File

@ -269,7 +269,7 @@ public class MultiPhraseQuery extends Query {
TermState termState = termStates.get(term).get(context);
if (termState != null) {
termsEnum.seekExact(term.bytes(), termState);
postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS));
postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
}
}
@ -294,7 +294,7 @@ public class MultiPhraseQuery extends Query {
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
}
}
@ -647,5 +647,6 @@ public class MultiPhraseQuery extends Query {
public BytesRef getPayload() throws IOException {
return posQueue.top().pe.getPayload();
}
}
}

View File

@ -88,4 +88,6 @@ abstract class PhraseMatcher {
// Estimated cost of confirming a phrase match, used by two-phase iteration.
public float getMatchCost() {
return matchCost;
}
// Returns an iterator over the individual term matches making up the current
// phrase match. Should only be called while positioned on a match.
abstract MatchesIterator getSubMatches() throws IOException;
}

View File

@ -446,7 +446,7 @@ public class PhraseQuery extends Query {
return null;
}
te.seekExact(t.bytes(), state);
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS);
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t);
totalMatchCost += termPositionsCost(te);
}
@ -457,7 +457,7 @@ public class PhraseQuery extends Query {
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost);
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
}
}

View File

@ -123,6 +123,16 @@ abstract class PhraseWeight extends Weight {
public int endOffset() throws IOException {
return matcher.endOffset();
}
// Delegate sub-match iteration to the underlying phrase matcher.
@Override
public MatchesIterator getSubMatches() throws IOException {
return matcher.getSubMatches();
}
// The matcher instance itself serves as the identity label for this
// phrase's matches.
@Override
public Object label() {
return matcher;
}
};
});
}

View File

@ -54,13 +54,14 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private final int slop;
private final int numPostings;
private final PhraseQueue pq; // for advancing min position
private final boolean captureLeadMatch;
private int end; // current largest phrase position
private int leadPosition;
private int leadOffset;
private int currentEndPostings;
private int advanceEndPostings;
private int leadEndOffset;
private int leadOrd;
private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
@ -71,10 +72,11 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private boolean positioned;
private int matchLength;
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost) {
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) {
super(approximation(postings), matchCost);
this.slop = slop;
this.numPostings = postings.length;
this.captureLeadMatch = captureLeadMatch;
pq = new PhraseQueue(postings.length);
phrasePositions = new PhrasePositions[postings.length];
for (int i = 0; i < postings.length; ++i) {
@ -120,10 +122,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
return false;
}
PhrasePositions pp = pq.pop();
assert pp != null; // if the pq is empty, then positioned == false
leadPosition = pp.position + pp.offset;
leadOffset = pp.postings.startOffset();
currentEndPostings = advanceEndPostings;
assert pp != null; // if the pq is not full, then positioned == false
captureLead(pp);
matchLength = end - pp.position;
int next = pq.top().position;
while (advancePP(pp)) {
@ -137,6 +137,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
}
pp = pq.pop();
next = pq.top().position;
assert pp != null; // if the pq is not full, then positioned == false
matchLength = end - pp.position;
} else {
int matchLength2 = end - pp.position;
@ -144,14 +145,22 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
matchLength = matchLength2;
}
}
leadPosition = pp.position + pp.offset;
leadOffset = pp.postings.startOffset();
currentEndPostings = advanceEndPostings;
captureLead(pp);
}
positioned = false;
return matchLength <= slop;
}
private void captureLead(PhrasePositions pp) throws IOException {
  // Snapshot the lead term's ordinal, position and offsets, but only when
  // the Matches API asked for them; scoring paths skip this work entirely.
  if (captureLeadMatch) {
    leadOrd = pp.ord;
    leadPosition = pp.position + pp.offset;
    leadOffset = pp.postings.startOffset();
    leadEndOffset = pp.postings.endOffset();
  }
}
@Override
public int startPosition() {
// when a match is detected, the top postings is advanced until it has moved
@ -160,6 +169,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
// However, the priority queue doesn't guarantee that the top postings is in fact the
// earliest in the list, so we need to cycle through all terms to check.
// this is slow, but Matches is slow anyway...
int leadPosition = this.leadPosition;
for (PhrasePositions pp : phrasePositions) {
leadPosition = Math.min(leadPosition, pp.position + pp.offset);
}
@ -168,7 +178,13 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
@Override
public int endPosition() {
return phrasePositions[currentEndPostings].position + phrasePositions[currentEndPostings].offset;
int endPosition = leadPosition;
for (PhrasePositions pp : phrasePositions) {
if (pp.ord != leadOrd) {
endPosition = Math.max(endPosition, pp.position + pp.offset);
}
}
return endPosition;
}
@Override
@ -179,6 +195,7 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
// However, the priority queue doesn't guarantee that the top postings is in fact the
// earliest in the list, so we need to cycle through all terms to check
// this is slow, but Matches is slow anyway...
int leadOffset = this.leadOffset;
for (PhrasePositions pp : phrasePositions) {
leadOffset = Math.min(leadOffset, pp.postings.startOffset());
}
@ -187,7 +204,69 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
@Override
public int endOffset() throws IOException {
return phrasePositions[currentEndPostings].postings.endOffset();
int endOffset = leadEndOffset;
for (PhrasePositions pp : phrasePositions) {
if (pp.ord != leadOrd) {
endOffset = Math.max(endOffset, pp.postings.endOffset());
}
}
return endOffset;
}
@Override
MatchesIterator getSubMatches() throws IOException {
// Snapshot one { position, startOffset, endOffset } row per phrase term.
// The lead term's postings may already have advanced past the match, so its
// values come from the fields captured in captureLead() rather than from
// its postings enum.
int[][] submatches = new int[phrasePositions.length][3];
for (PhrasePositions pp : phrasePositions) {
if (pp.ord == leadOrd) {
submatches[pp.ord][0] = leadPosition;
submatches[pp.ord][1] = leadOffset;
submatches[pp.ord][2] = leadEndOffset;
}
else {
submatches[pp.ord][0] = pp.position + pp.offset;
submatches[pp.ord][1] = pp.postings.startOffset();
submatches[pp.ord][2] = pp.postings.endOffset();
}
}
// Matches are emitted in increasing position order
Arrays.sort(submatches, Comparator.comparingInt(a -> a[0]));
return new MatchesIterator() {
int upTo = -1;
@Override
public boolean next() throws IOException {
upTo++;
return upTo < submatches.length;
}
@Override
public int startPosition() {
return submatches[upTo][0];
}
@Override
public int endPosition() {
// single-term sub-match: end position equals start position
return submatches[upTo][0];
}
@Override
public int startOffset() {
return submatches[upTo][1];
}
@Override
public int endOffset() {
return submatches[upTo][2];
}
@Override
public MatchesIterator getSubMatches() {
return MatchesIterator.EMPTY_ITERATOR;
}
@Override
public Object label() {
return this;
}
};
}
/** advance a PhrasePosition and update 'end', return false if exhausted */
@ -197,12 +276,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
}
if (pp.position > end) {
end = pp.position;
advanceEndPostings = pp.ord;
}
if (pp.position == end) {
if (pp.ord > advanceEndPostings) {
advanceEndPostings = pp.ord;
}
}
return true;
}
@ -307,12 +380,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
pp.firstPosition();
if (pp.position > end) {
end = pp.position;
advanceEndPostings = pp.ord;
}
if (pp.position == end) {
if (pp.ord > advanceEndPostings) {
advanceEndPostings = pp.ord;
}
}
pq.add(pp);
}
@ -342,12 +409,6 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max
if (pp.position > end) {
end = pp.position;
advanceEndPostings = pp.ord;
}
if (pp.position == end) {
if (pp.ord > advanceEndPostings) {
advanceEndPostings = pp.ord;
}
}
pq.add(pp);
}

View File

@ -67,4 +67,54 @@ class TermMatchesIterator implements MatchesIterator {
return pe.endOffset();
}
@Override
public Object label() {
return pe;
}
@Override
public MatchesIterator getSubMatches() throws IOException {
  // A term match is its own single sub-match: yield exactly one entry that
  // mirrors the enclosing iterator's current position and offsets.
  return new MatchesIterator() {

    private boolean consumed = false;

    @Override
    public boolean next() {
      if (consumed) {
        return false;
      }
      consumed = true;
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      // single term: end position equals start position
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return MatchesIterator.EMPTY_ITERATOR;
    }

    @Override
    public Object label() {
      return this;
    }
  };
}
}

View File

@ -18,6 +18,8 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
@ -28,6 +30,8 @@ import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
@ -161,4 +165,131 @@ public abstract class SpanWeight extends Weight {
return Explanation.noMatch("no matching term");
}
/**
 * Exposes span matches for a single document via the Matches API.
 * Returns null (inside the supplier) if this query has no spans, or no
 * match, on the requested document.
 */
@Override
public Matches matches(LeafReaderContext context, int doc) throws IOException {
  return Matches.forField(field, () -> {
    Spans spans = getSpans(context, Postings.OFFSETS);
    if (spans == null) {
      return null;
    }
    if (spans.advance(doc) != doc) {
      return null;
    }
    return new MatchesIterator() {

      // number of term matches collected for the current span
      int innerTermCount = 0;
      // one { position, startOffset, endOffset } row per collected term;
      // only the first innerTermCount rows are valid
      int[][] innerTerms = new int[2][3];

      SpanCollector termCollector = new SpanCollector() {
        @Override
        public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
          innerTermCount++;
          if (innerTermCount > innerTerms.length) {
            // grow geometrically rather than one row at a time, to avoid
            // O(n^2) copying when a span contains many terms
            int[][] temp = new int[Math.max(innerTermCount, innerTerms.length * 2)][3];
            System.arraycopy(innerTerms, 0, temp, 0, innerTermCount - 1);
            innerTerms = temp;
          }
          innerTerms[innerTermCount - 1][0] = position;
          innerTerms[innerTermCount - 1][1] = postings.startOffset();
          innerTerms[innerTermCount - 1][2] = postings.endOffset();
        }

        @Override
        public void reset() {
          innerTermCount = 0;
        }
      };

      @Override
      public boolean next() throws IOException {
        // invalidate any terms collected for the previous span
        innerTermCount = 0;
        return spans.nextStartPosition() != Spans.NO_MORE_POSITIONS;
      }

      @Override
      public int startPosition() {
        return spans.startPosition();
      }

      @Override
      public int endPosition() {
        // Spans end positions are exclusive, Matches are inclusive
        return spans.endPosition() - 1;
      }

      @Override
      public int startOffset() throws IOException {
        if (innerTermCount == 0) {
          collectInnerTerms();
        }
        // innerTerms is sorted by position, so row 0 holds the first term
        return innerTerms[0][1];
      }

      @Override
      public int endOffset() throws IOException {
        if (innerTermCount == 0) {
          collectInnerTerms();
        }
        return innerTerms[innerTermCount - 1][2];
      }

      @Override
      public MatchesIterator getSubMatches() throws IOException {
        if (innerTermCount == 0) {
          collectInnerTerms();
        }
        return new MatchesIterator() {
          int upto = -1;

          @Override
          public boolean next() throws IOException {
            upto++;
            return upto < innerTermCount;
          }

          @Override
          public int startPosition() {
            return innerTerms[upto][0];
          }

          @Override
          public int endPosition() {
            // single term: end position equals start position
            return innerTerms[upto][0];
          }

          @Override
          public int startOffset() throws IOException {
            return innerTerms[upto][1];
          }

          @Override
          public int endOffset() throws IOException {
            return innerTerms[upto][2];
          }

          @Override
          public MatchesIterator getSubMatches() throws IOException {
            return MatchesIterator.EMPTY_ITERATOR;
          }

          @Override
          public Object label() {
            return this;
          }
        };
      }

      @Override
      public Object label() {
        return SpanWeight.this;
      }

      // Lazily gather the terms of the current span and sort them by position.
      // comparingInt avoids the Integer boxing of Comparator.comparing and
      // matches the idiom used by SloppyPhraseMatcher.
      void collectInnerTerms() throws IOException {
        termCollector.reset();
        spans.collect(termCollector);
        Arrays.sort(innerTerms, 0, innerTermCount, Comparator.comparingInt(a -> a[0]));
      }
    };
  });
}
}

View File

@ -18,8 +18,12 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@ -29,9 +33,14 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@ -99,7 +108,7 @@ public class TestMatchesIterator extends LuceneTestCase {
"nothing matches this document"
};
void checkMatches(Query q, String field, int[][] expected) throws IOException {
private void checkMatches(Query q, String field, int[][] expected) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < expected.length; i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(expected[i][0], searcher.leafContexts));
@ -112,14 +121,40 @@ public class TestMatchesIterator extends LuceneTestCase {
MatchesIterator it = matches.getMatches(field);
if (expected[i].length == 1) {
assertNull(it);
return;
continue;
}
checkFieldMatches(it, expected[i]);
checkFieldMatches(matches.getMatches(field), expected[i]); // test multiple calls
}
}
void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
private void checkLabelCount(Query q, String field, int[] expected) throws IOException {
  // Verifies that iterating the matches for each document yields exactly
  // expected[doc] distinct labels (compared by identity, not equals()).
  Weight weight = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
  for (int docId = 0; docId < expected.length; docId++) {
    LeafReaderContext leaf = searcher.leafContexts.get(ReaderUtil.subIndex(docId, searcher.leafContexts));
    Matches m = weight.matches(leaf, docId - leaf.docBase);
    if (m == null) {
      assertEquals("Expected to get matches on document " + docId, 0, expected[docId]);
    } else {
      MatchesIterator mi = m.getMatches(field);
      if (expected[docId] == 0) {
        assertNull(mi);
      } else {
        assertNotNull(mi);
        IdentityHashMap<Object, Integer> distinctLabels = new IdentityHashMap<>();
        while (mi.next()) {
          distinctLabels.put(mi.label(), 1);
        }
        assertEquals(expected[docId], distinctLabels.size());
      }
    }
  }
}
private void checkFieldMatches(MatchesIterator it, int[] expected) throws IOException {
int pos = 1;
while (it.next()) {
//System.out.println(expected[i][pos] + "->" + expected[i][pos + 1] + "[" + expected[i][pos + 2] + "->" + expected[i][pos + 3] + "]");
@ -132,7 +167,7 @@ public class TestMatchesIterator extends LuceneTestCase {
assertEquals(expected.length, pos);
}
void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
private void checkNoPositionsMatches(Query q, String field, boolean[] expected) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < expected.length; i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
@ -148,8 +183,90 @@ public class TestMatchesIterator extends LuceneTestCase {
}
}
// Checks the term-level sub-matches of every document against an expected
// table: expected[doc][match][term]. A zero-length row means the document
// should produce no matches at all.
private void checkTermMatches(Query q, String field, TermMatch[][][] expected) throws IOException {
Weight w = searcher.createWeight(searcher.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1);
for (int i = 0; i < expected.length; i++) {
LeafReaderContext ctx = searcher.leafContexts.get(ReaderUtil.subIndex(i, searcher.leafContexts));
int doc = i - ctx.docBase;
Matches matches = w.matches(ctx, doc);
if (matches == null) {
// no matches at all: the expectation table must agree
assertEquals(expected[i].length, 0);
continue;
}
MatchesIterator it = matches.getMatches(field);
if (expected[i].length == 0) {
// matches exist on the document, but not for this field
assertNull(it);
continue;
}
checkTerms(expected[i], it);
}
}
// For each top-level match, checks that its sub-matches are exactly the
// expected set of term matches (order-insensitive within a match), and that
// the iterator produces exactly expected.length matches overall.
private void checkTerms(TermMatch[][] expected, MatchesIterator it) throws IOException {
  int upTo = 0;
  while (it.next()) {
    Set<TermMatch> expectedMatches = new HashSet<>(Arrays.asList(expected[upTo]));
    MatchesIterator submatches = it.getSubMatches();
    while (submatches.next()) {
      TermMatch tm = new TermMatch(submatches.startPosition(), submatches.startOffset(), submatches.endOffset());
      if (expectedMatches.remove(tm) == false) {
        fail("Unexpected term match: " + tm);
      }
    }
    if (expectedMatches.size() != 0) {
      fail("Missing term matches: " + expectedMatches.stream().map(Object::toString).collect(Collectors.joining(", ")));
    }
    upTo++;
  }
  // Fixed off-by-one: the previous check (upTo < expected.length - 1) let the
  // iterator fall exactly one match short without failing the test.
  if (upTo < expected.length) {
    fail("Missing expected match");
  }
}
// A value type describing a single term match: its position and its
// start/end character offsets. equals/hashCode make it usable as a set
// element when comparing actual vs expected matches.
static class TermMatch {
public final int position;
public final int startOffset;
public final int endOffset;
// Builds a TermMatch from a positioned postings enum; reads the enum's
// current offsets, so it must already be positioned on the term.
public TermMatch(PostingsEnum pe, int position) throws IOException {
this.position = position;
this.startOffset = pe.startOffset();
this.endOffset = pe.endOffset();
}
public TermMatch(int position, int startOffset, int endOffset) {
this.position = position;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TermMatch termMatch = (TermMatch) o;
return position == termMatch.position &&
startOffset == termMatch.startOffset &&
endOffset == termMatch.endOffset;
}
@Override
public int hashCode() {
return Objects.hash(position, startOffset, endOffset);
}
@Override
public String toString() {
return position + "[" + startOffset + "->" + endOffset + "]";
}
}
public void testTermQuery() throws IOException {
Query q = new TermQuery(new Term(FIELD_WITH_OFFSETS, "w1"));
Term t = new Term(FIELD_WITH_OFFSETS, "w1");
Query q = new TermQuery(t);
checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
{ 0, 0, 0, 0, 2 },
{ 1, 0, 0, 0, 2 },
@ -157,6 +274,14 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8 },
{ 4 }
});
checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
{ { new TermMatch(0, 0, 2) } },
{ { new TermMatch(0, 0, 2) } },
{ { new TermMatch(0, 0, 2) } },
{ { new TermMatch(0, 0, 2) }, { new TermMatch(2, 6, 8) } },
{}
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 1, 1, 1, 1, 0, 0 });
}
public void testTermQueryNoStoredOffsets() throws IOException {
@ -191,6 +316,7 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
{ 4 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 1, 2, 0, 0 });
}
public void testDisjunctionNoPositions() throws IOException {
@ -215,6 +341,7 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 5, 5, 15, 17 },
{ 4 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 0, 2, 0, 0 });
}
public void testReqOptNoPositions() throws IOException {
@ -248,6 +375,7 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 2, 2, 6, 8, 3, 3, 9, 11, 5, 5, 15, 17 },
{ 4 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 3, 1, 3, 3, 0, 0 });
}
public void testMinShouldMatchNoPositions() throws IOException {
@ -331,6 +459,7 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 3, 0, 0, 0, 2, 1, 1, 3, 5, 2, 2, 6, 8, 4, 4, 12, 14 },
{ 4 }
});
checkLabelCount(rq, FIELD_WITH_OFFSETS, new int[]{ 2, 2, 2, 2, 0 });
}
@ -392,12 +521,55 @@ public class TestMatchesIterator extends LuceneTestCase {
// 0 1 2 3 4 5 6 7
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
public void testSloppyPhraseQueryWithRepeats() throws IOException {
Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
PhraseQuery pq = new PhraseQuery(10, FIELD_WITH_OFFSETS, "phrase", "sentence", "sentence");
checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 1, 6, 2, 43, 2, 11, 9, 75, 5, 11, 28, 75, 6, 11, 35, 75 }
});
checkLabelCount(pq, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
{}, {}, {}, {},
{ {
new TermMatch(1, 2, 8),
new TermMatch(2, 9, 17),
new TermMatch(6, 35, 43)
}, {
new TermMatch(5, 28, 34),
new TermMatch(2, 9, 17),
new TermMatch(11, 67, 75)
}, {
new TermMatch(5, 28, 34),
new TermMatch(6, 35, 43),
new TermMatch(11, 67, 75)
}, {
new TermMatch(10, 60, 66),
new TermMatch(6, 35, 43),
new TermMatch(11, 67, 75)
} }
});
}
public void testSloppyPhraseQuery() throws IOException {
Term a = new Term(FIELD_WITH_OFFSETS, "a");
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
PhraseQuery pq = new PhraseQuery(4, FIELD_WITH_OFFSETS, "a", "sentence");
checkMatches(pq, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 0, 2, 0, 17, 6, 9, 35, 59, 9, 11, 58, 75 }
});
checkTermMatches(pq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
{}, {}, {}, {},
{ {
new TermMatch(0, 0, 1), new TermMatch(2, 9, 17)
}, {
new TermMatch(9, 58, 59), new TermMatch(6, 35, 43)
}, {
new TermMatch(9, 58, 59), new TermMatch(11, 67, 75)
} }
});
}
public void testExactPhraseQuery() throws IOException {
@ -407,28 +579,57 @@ public class TestMatchesIterator extends LuceneTestCase {
{ 4, 1, 2, 2, 17, 5, 6, 28, 43, 10, 11, 60, 75 }
});
Term a = new Term(FIELD_WITH_OFFSETS, "a");
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
PhraseQuery pq2 = new PhraseQuery.Builder()
.add(new Term(FIELD_WITH_OFFSETS, "a"))
.add(new Term(FIELD_WITH_OFFSETS, "sentence"), 2)
.add(a)
.add(s, 2)
.build();
checkMatches(pq2, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 0, 2, 0, 17, 9, 11, 58, 75 }
});
checkTermMatches(pq2, FIELD_WITH_OFFSETS, new TermMatch[][][]{
{}, {}, {}, {},
{ {
new TermMatch(0, 0, 1), new TermMatch(2, 9, 17)
}, {
new TermMatch(9, 58, 59), new TermMatch(11, 67, 75)
} }
});
}
// 0 1 2 3 4 5 6 7
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
public void testSloppyMultiPhraseQuery() throws IOException {
Term p = new Term(FIELD_WITH_OFFSETS, "phrase");
Term s = new Term(FIELD_WITH_OFFSETS, "sentence");
Term i = new Term(FIELD_WITH_OFFSETS, "iterations");
MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
.add(new Term(FIELD_WITH_OFFSETS, "phrase"))
.add(new Term[]{ new Term(FIELD_WITH_OFFSETS, "sentence"), new Term(FIELD_WITH_OFFSETS, "iterations") })
.add(p)
.add(new Term[]{ s, i })
.setSlop(4)
.build();
checkMatches(mpq, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 1, 2, 2, 17, 5, 7, 28, 54, 5, 7, 28, 54, 10, 11, 60, 75 }
{ 4, 1, 2, 2, 17, 5, 6, 28, 43, 5, 7, 28, 54, 10, 11, 60, 75 }
});
checkTermMatches(mpq, FIELD_WITH_OFFSETS, new TermMatch[][][]{
{}, {}, {}, {},
{ {
new TermMatch(1, 2, 8),
new TermMatch(2, 9, 17)
}, {
new TermMatch(5, 28, 34),
new TermMatch(6, 35, 43)
}, {
new TermMatch(5, 28, 34),
new TermMatch(7, 44, 54)
}, {
new TermMatch(10, 60, 66),
new TermMatch(11, 67, 75)
} }
});
}
@ -452,4 +653,35 @@ public class TestMatchesIterator extends LuceneTestCase {
});
}
// 0 1 2 3 4 5 6 7
// "a phrase sentence with many phrase sentence iterations of a phrase sentence",
public void testSpanQuery() throws IOException {
SpanQuery subq = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "with")))
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "many")))
.build();
Query q = SpanNearQuery.newOrderedNearQuery(FIELD_WITH_OFFSETS)
.addClause(new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "sentence")))
.addClause(new SpanOrQuery(subq, new SpanTermQuery(new Term(FIELD_WITH_OFFSETS, "iterations"))))
.build();
checkMatches(q, FIELD_WITH_OFFSETS, new int[][]{
{ 0 }, { 1 }, { 2 }, { 3 },
{ 4, 2, 4, 9, 27, 6, 7, 35, 54 }
});
checkLabelCount(q, FIELD_WITH_OFFSETS, new int[]{ 0, 0, 0, 0, 1 });
checkTermMatches(q, FIELD_WITH_OFFSETS, new TermMatch[][][]{
{}, {}, {}, {},
{
{
new TermMatch(2, 9, 17),
new TermMatch(3, 18, 22),
new TermMatch(4, 23, 27)
}, {
new TermMatch(6, 35, 43), new TermMatch(7, 44, 54)
}
}
});
}
}

View File

@ -67,4 +67,15 @@ class AssertingMatchesIterator implements MatchesIterator {
return in.endOffset();
}
// Both accessors are only legal while the wrapped iterator is positioned on
// a match; the asserts enforce the MatchesIterator contract before delegating.
@Override
public MatchesIterator getSubMatches() throws IOException {
assert state == State.ITERATING : state;
return in.getSubMatches();
}
@Override
public Object label() {
assert state == State.ITERATING : state;
return in.label();
}
}