LUCENE-6394: Add two-phase support to SpanNotQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673016 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2015-04-12 15:07:24 +00:00
parent 607c87a66b
commit 113d6c7c8f
10 changed files with 205 additions and 286 deletions

View File

@ -46,6 +46,10 @@ New Features
and its subclasses: SpanPositionRangeQuery, SpanPayloadCheckQuery,
SpanNearPayloadCheckQuery, SpanFirstQuery. (Paul Elschot, Robert Muir)
* LUCENE-6394: Add two-phase support to SpanNotQuery and refactor
FilterSpans to just have an accept(Spans candidate) method for
subclasses. (Robert Muir)
* LUCENE-6352: Added a new query time join to the join module that uses
global ordinals, which is faster for subsequent joins between reopens.
(Martijn van Groningen, Adrien Grand)

View File

@ -25,60 +25,104 @@ import org.apache.lucene.search.TwoPhaseIterator;
/**
* A {@link Spans} implementation wrapping another spans instance,
* allowing to override selected methods in a subclass.
* allowing to filter spans matches easily by implementing {@link #accept}
*/
public abstract class FilterSpans extends Spans {
/** The wrapped spans instance. */
protected final Spans in;
private boolean atFirstInCurrentDoc = false;
private int startPos = -1;
/** Wrap the given {@link Spans}. */
public FilterSpans(Spans in) {
protected FilterSpans(Spans in) {
this.in = Objects.requireNonNull(in);
}
/**
* Returns YES if the candidate should be an accepted match,
* NO if it should not, and NO_MORE_IN_CURRENT_DOC if iteration
* should move on to the next document.
*/
protected abstract AcceptStatus accept(Spans candidate) throws IOException;
@Override
public int nextDoc() throws IOException {
return in.nextDoc();
public final int nextDoc() throws IOException {
while (true) {
int doc = in.nextDoc();
if (doc == NO_MORE_DOCS) {
return NO_MORE_DOCS;
} else if (twoPhaseCurrentDocMatches()) {
return doc;
}
}
}
@Override
public int advance(int target) throws IOException {
return in.advance(target);
public final int advance(int target) throws IOException {
int doc = in.advance(target);
while (doc != NO_MORE_DOCS) {
if (twoPhaseCurrentDocMatches()) {
break;
}
doc = in.nextDoc();
}
return doc;
}
@Override
public int docID() {
public final int docID() {
return in.docID();
}
@Override
public int nextStartPosition() throws IOException {
return in.nextStartPosition();
public final int nextStartPosition() throws IOException {
if (atFirstInCurrentDoc) {
atFirstInCurrentDoc = false;
return startPos;
}
for (;;) {
startPos = in.nextStartPosition();
if (startPos == NO_MORE_POSITIONS) {
return NO_MORE_POSITIONS;
}
switch(accept(in)) {
case YES:
return startPos;
case NO:
break;
case NO_MORE_IN_CURRENT_DOC:
return startPos = NO_MORE_POSITIONS; // startPos ahead for the current doc.
}
}
}
@Override
public int startPosition() {
return in.startPosition();
public final int startPosition() {
return atFirstInCurrentDoc ? -1 : startPos;
}
@Override
public final int endPosition() {
return atFirstInCurrentDoc ? -1
: (startPos != NO_MORE_POSITIONS) ? in.endPosition() : NO_MORE_POSITIONS;
}
@Override
public int endPosition() {
return in.endPosition();
}
@Override
public Collection<byte[]> getPayload() throws IOException {
public final Collection<byte[]> getPayload() throws IOException {
return in.getPayload();
}
@Override
public boolean isPayloadAvailable() throws IOException {
public final boolean isPayloadAvailable() throws IOException {
return in.isPayloadAvailable();
}
@Override
public long cost() {
public final long cost() {
return in.cost();
}
@ -88,7 +132,7 @@ public abstract class FilterSpans extends Spans {
}
@Override
public TwoPhaseIterator asTwoPhaseIterator() {
public final TwoPhaseIterator asTwoPhaseIterator() {
TwoPhaseIterator inner = in.asTwoPhaseIterator();
if (inner != null) {
// wrapped instance has an approximation
@ -115,5 +159,46 @@ public abstract class FilterSpans extends Spans {
* <p>
* This is called during two-phase processing.
*/
public abstract boolean twoPhaseCurrentDocMatches() throws IOException;
// return true if the current document matches
@SuppressWarnings("fallthrough")
private final boolean twoPhaseCurrentDocMatches() throws IOException {
atFirstInCurrentDoc = false;
startPos = in.nextStartPosition();
assert startPos != NO_MORE_POSITIONS;
for (;;) {
switch(accept(in)) {
case YES:
atFirstInCurrentDoc = true;
return true;
case NO:
startPos = in.nextStartPosition();
if (startPos != NO_MORE_POSITIONS) {
break;
}
// else fallthrough
case NO_MORE_IN_CURRENT_DOC:
startPos = -1;
return false;
}
}
}
/**
* Status returned from {@link FilterSpans#accept(Spans)} that indicates
* whether a candidate match should be accepted, rejected, or rejected
* and move on to the next document.
*/
public static enum AcceptStatus {
/** Indicates the match should be accepted */
YES,
/** Indicates the match should be rejected */
NO,
/**
* Indicates the match should be rejected, and the enumeration may continue
* with the next document.
*/
NO_MORE_IN_CURRENT_DOC
};
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.PriorityQueue;
import java.io.IOException;
@ -71,11 +72,12 @@ public class NearSpansUnordered extends NearSpans {
private int totalSpanLength;
private SpansCell maxEndPositionCell;
private class SpansCell extends FilterSpans {
private class SpansCell extends Spans {
private int spanLength = -1;
final Spans in;
public SpansCell(Spans spans) {
super(spans);
this.in = spans;
}
@Override
@ -106,8 +108,48 @@ public class NearSpansUnordered extends NearSpans {
}
@Override
public boolean twoPhaseCurrentDocMatches() throws IOException {
return true; // we don't modify the spans, we just capture information from it.
public int startPosition() {
return in.startPosition();
}
@Override
public int endPosition() {
return in.endPosition();
}
@Override
public Collection<byte[]> getPayload() throws IOException {
return in.getPayload();
}
@Override
public boolean isPayloadAvailable() throws IOException {
return in.isPayloadAvailable();
}
@Override
public TwoPhaseIterator asTwoPhaseIterator() {
return in.asTwoPhaseIterator();
}
@Override
public int docID() {
return in.docID();
}
@Override
public int nextDoc() throws IOException {
return in.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return in.advance(target);
}
@Override
public long cost() {
return in.cost();
}
@Override

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;

View File

@ -21,13 +21,13 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.Objects;
@ -115,158 +115,52 @@ public class SpanNotQuery extends SpanQuery implements Cloneable {
if (excludeSpans == null) {
return includeSpans;
}
return new Spans() {
private boolean moreInclude = true;
private int includeStart = -1;
private int includeEnd = -1;
private boolean atFirstInCurrentDoc = false;
private boolean moreExclude = excludeSpans.nextDoc() != NO_MORE_DOCS;
private int excludeStart = moreExclude ? excludeSpans.nextStartPosition() : NO_MORE_POSITIONS;
TwoPhaseIterator excludeTwoPhase = excludeSpans.asTwoPhaseIterator();
DocIdSetIterator excludeApproximation = excludeTwoPhase == null ? null : excludeTwoPhase.approximation();
return new FilterSpans(includeSpans) {
// last document we have checked matches() against for the exclusion, and failed
// when using approximations, so we don't call it again, and pass thru all inclusions.
int lastNonMatchingDoc = -1;
@Override
public int nextDoc() throws IOException {
if (moreInclude) {
moreInclude = includeSpans.nextDoc() != NO_MORE_DOCS;
if (moreInclude) {
atFirstInCurrentDoc = true;
includeStart = includeSpans.nextStartPosition();
assert includeStart != NO_MORE_POSITIONS;
}
}
toNextIncluded();
int res = moreInclude ? includeSpans.docID() : NO_MORE_DOCS;
return res;
}
private void toNextIncluded() throws IOException {
while (moreInclude && moreExclude) {
if (includeSpans.docID() > excludeSpans.docID()) {
moreExclude = excludeSpans.advance(includeSpans.docID()) != NO_MORE_DOCS;
if (moreExclude) {
excludeStart = -1; // only use exclude positions at same doc
}
}
if (excludeForwardInCurrentDocAndAtMatch()) {
break; // at match.
}
// else intersected: keep scanning, to next doc if needed
includeStart = includeSpans.nextStartPosition();
if (includeStart == NO_MORE_POSITIONS) {
moreInclude = includeSpans.nextDoc() != NO_MORE_DOCS;
if (moreInclude) {
atFirstInCurrentDoc = true;
includeStart = includeSpans.nextStartPosition();
assert includeStart != NO_MORE_POSITIONS;
protected AcceptStatus accept(Spans candidate) throws IOException {
int doc = candidate.docID();
if (doc > excludeSpans.docID()) {
// catch up 'exclude' to the current doc
if (excludeTwoPhase != null) {
if (excludeApproximation.advance(doc) == doc) {
if (!excludeTwoPhase.matches()) {
lastNonMatchingDoc = doc; // mark as non-match
}
}
} else {
excludeSpans.advance(doc);
}
}
}
private boolean excludeForwardInCurrentDocAndAtMatch() throws IOException {
assert moreInclude;
assert includeStart != NO_MORE_POSITIONS;
if (! moreExclude) {
return true;
if (doc == lastNonMatchingDoc || doc != excludeSpans.docID()) {
return AcceptStatus.YES;
}
if (includeSpans.docID() != excludeSpans.docID()) {
return true;
if (excludeSpans.startPosition() == -1) { // init exclude start position if needed
excludeSpans.nextStartPosition();
}
// at same doc
if (excludeStart == -1) { // init exclude start position if needed
excludeStart = excludeSpans.nextStartPosition();
assert excludeStart != NO_MORE_POSITIONS;
}
while (excludeSpans.endPosition() <= includeStart - pre) {
while (excludeSpans.endPosition() <= candidate.startPosition() - pre) {
// exclude end position is before a possible exclusion
excludeStart = excludeSpans.nextStartPosition();
if (excludeStart == NO_MORE_POSITIONS) {
return true; // no more exclude at current doc.
if (excludeSpans.nextStartPosition() == NO_MORE_POSITIONS) {
return AcceptStatus.YES; // no more exclude at current doc.
}
}
// exclude end position far enough in current doc, check start position:
boolean res = includeSpans.endPosition() + post <= excludeStart;
return res;
}
@Override
public int advance(int target) throws IOException {
if (moreInclude) {
assert target > includeSpans.docID() : "target="+target+", includeSpans.docID()="+includeSpans.docID();
moreInclude = includeSpans.advance(target) != NO_MORE_DOCS;
if (moreInclude) {
atFirstInCurrentDoc = true;
includeStart = includeSpans.nextStartPosition();
assert includeStart != NO_MORE_POSITIONS;
}
if (candidate.endPosition() + post <= excludeSpans.startPosition()) {
return AcceptStatus.YES;
} else {
return AcceptStatus.NO;
}
toNextIncluded();
int res = moreInclude ? includeSpans.docID() : NO_MORE_DOCS;
return res;
}
@Override
public int docID() {
int res = includeSpans.docID();
return res;
}
@Override
public int nextStartPosition() throws IOException {
assert moreInclude;
if (atFirstInCurrentDoc) {
atFirstInCurrentDoc = false;
assert includeStart != NO_MORE_POSITIONS;
return includeStart;
}
includeStart = includeSpans.nextStartPosition();
while ((includeStart != NO_MORE_POSITIONS)
&& (! excludeForwardInCurrentDocAndAtMatch()))
{
includeStart = includeSpans.nextStartPosition();
}
return includeStart;
}
@Override
public int startPosition() {
assert includeStart == includeSpans.startPosition();
return atFirstInCurrentDoc ? -1 : includeStart;
}
@Override
public int endPosition() {
return atFirstInCurrentDoc ? -1 : includeSpans.endPosition();
}
@Override
public Collection<byte[]> getPayload() throws IOException {
ArrayList<byte[]> result = null;
if (includeSpans.isPayloadAvailable()) {
result = new ArrayList<>(includeSpans.getPayload());
}
return result;
}
@Override
public boolean isPayloadAvailable() throws IOException {
return includeSpans.isPayloadAvailable();
}
@Override
public long cost() {
return includeSpans.cost();
}
@Override
public String toString() {
return "spans(" + SpanNotQuery.this.toString() + ")";
}
};
}

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;

View File

@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.Bits;
import java.io.IOException;
@ -58,23 +59,6 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
match.extractTerms(terms);
}
/**
* Return value for {@link SpanPositionCheckQuery#acceptPosition(Spans)}.
*/
protected static enum AcceptStatus {
/** Indicates the match should be accepted */
YES,
/** Indicates the match should be rejected */
NO,
/**
* Indicates the match should be rejected, and the enumeration may continue
* with the next document.
*/
NO_MORE_IN_CURRENT_DOC
};
/**
* Implementing classes are required to return whether the current position is a match for the passed in
* "match" {@link SpanQuery}.
@ -95,10 +79,14 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
@Override
public Spans getSpans(final LeafReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
Spans matchSpans = match.getSpans(context, acceptDocs, termContexts);
return (matchSpans == null) ? null : new PositionCheckSpans(matchSpans);
return (matchSpans == null) ? null : new FilterSpans(matchSpans) {
@Override
protected AcceptStatus accept(Spans candidate) throws IOException {
return acceptPosition(candidate);
}
};
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
SpanPositionCheckQuery clone = null;
@ -116,104 +104,6 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
}
}
protected class PositionCheckSpans extends FilterSpans {
private boolean atFirstInCurrentDoc = false;
private int startPos = -1;
public PositionCheckSpans(Spans matchSpans) throws IOException {
super(matchSpans);
}
@Override
public int nextDoc() throws IOException {
while (true) {
int doc = in.nextDoc();
if (doc == NO_MORE_DOCS) {
return NO_MORE_DOCS;
} else if (twoPhaseCurrentDocMatches()) {
return doc;
}
}
}
@Override
public int advance(int target) throws IOException {
int doc = in.advance(target);
while (doc != NO_MORE_DOCS) {
if (twoPhaseCurrentDocMatches()) {
break;
}
doc = in.nextDoc();
}
return doc;
}
@Override
public int nextStartPosition() throws IOException {
if (atFirstInCurrentDoc) {
atFirstInCurrentDoc = false;
return startPos;
}
for (;;) {
startPos = in.nextStartPosition();
if (startPos == NO_MORE_POSITIONS) {
return NO_MORE_POSITIONS;
}
switch(acceptPosition(in)) {
case YES:
return startPos;
case NO:
break;
case NO_MORE_IN_CURRENT_DOC:
return startPos = NO_MORE_POSITIONS; // startPos ahead for the current doc.
}
}
}
// return true if the current document matches
@SuppressWarnings("fallthrough")
public boolean twoPhaseCurrentDocMatches() throws IOException {
atFirstInCurrentDoc = false;
startPos = in.nextStartPosition();
assert startPos != NO_MORE_POSITIONS;
for (;;) {
switch(acceptPosition(in)) {
case YES:
atFirstInCurrentDoc = true;
return true;
case NO:
startPos = in.nextStartPosition();
if (startPos != NO_MORE_POSITIONS) {
break;
}
// else fallthrough
case NO_MORE_IN_CURRENT_DOC:
startPos = -1;
return false;
}
}
}
@Override
public int startPosition() {
return atFirstInCurrentDoc ? -1 : startPos;
}
@Override
public int endPosition() {
return atFirstInCurrentDoc ? -1
: (startPos != NO_MORE_POSITIONS) ? in.endPosition() : NO_MORE_POSITIONS;
}
@Override
public String toString() {
return "spans(" + SpanPositionCheckQuery.this.toString() + ")";
}
}
/** Returns true iff <code>o</code> is equal to this. */
@Override
public boolean equals(Object o) {

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.spans;
*/
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;

View File

@ -341,7 +341,7 @@ public class TestBasics extends LuceneTestCase {
assertTrue(searcher.explain(query, 891).getValue() > 0.0f);
}
@Test
@Test @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-6418")
public void testNpeInSpanNearInSpanFirstInSpanNot() throws Exception {
int n = 5;
SpanTermQuery hun = new SpanTermQuery(new Term("field", "hundred"));