LUCENE-6394: Add two-phase support to SpanNotQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673016 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2015-04-12 15:07:24 +00:00
parent 607c87a66b
commit 113d6c7c8f
10 changed files with 205 additions and 286 deletions

View File

@ -46,6 +46,10 @@ New Features
and its subclasses: SpanPositionRangeQuery, SpanPayloadCheckQuery, and its subclasses: SpanPositionRangeQuery, SpanPayloadCheckQuery,
SpanNearPayloadCheckQuery, SpanFirstQuery. (Paul Elschot, Robert Muir) SpanNearPayloadCheckQuery, SpanFirstQuery. (Paul Elschot, Robert Muir)
* LUCENE-6394: Add two-phase support to SpanNotQuery and refactor
FilterSpans to just have an accept(Spans candidate) method for
subclasses. (Robert Muir)
* LUCENE-6352: Added a new query time join to the join module that uses * LUCENE-6352: Added a new query time join to the join module that uses
global ordinals, which is faster for subsequent joins between reopens. global ordinals, which is faster for subsequent joins between reopens.
(Martijn van Groningen, Adrien Grand) (Martijn van Groningen, Adrien Grand)

View File

@ -25,60 +25,104 @@ import org.apache.lucene.search.TwoPhaseIterator;
/** /**
* A {@link Spans} implementation wrapping another spans instance, * A {@link Spans} implementation wrapping another spans instance,
* allowing to override selected methods in a subclass. * allowing to filter spans matches easily by implementing {@link #accept}
*/ */
public abstract class FilterSpans extends Spans { public abstract class FilterSpans extends Spans {
/** The wrapped spans instance. */ /** The wrapped spans instance. */
protected final Spans in; protected final Spans in;
private boolean atFirstInCurrentDoc = false;
private int startPos = -1;
/** Wrap the given {@link Spans}. */ /** Wrap the given {@link Spans}. */
public FilterSpans(Spans in) { protected FilterSpans(Spans in) {
this.in = Objects.requireNonNull(in); this.in = Objects.requireNonNull(in);
} }
/**
* Returns YES if the candidate should be an accepted match,
* NO if it should not, and NO_MORE_IN_CURRENT_DOC if iteration
* should move on to the next document.
*/
protected abstract AcceptStatus accept(Spans candidate) throws IOException;
@Override @Override
public int nextDoc() throws IOException { public final int nextDoc() throws IOException {
return in.nextDoc(); while (true) {
int doc = in.nextDoc();
if (doc == NO_MORE_DOCS) {
return NO_MORE_DOCS;
} else if (twoPhaseCurrentDocMatches()) {
return doc;
}
}
} }
@Override @Override
public int advance(int target) throws IOException { public final int advance(int target) throws IOException {
return in.advance(target); int doc = in.advance(target);
while (doc != NO_MORE_DOCS) {
if (twoPhaseCurrentDocMatches()) {
break;
}
doc = in.nextDoc();
}
return doc;
} }
@Override @Override
public int docID() { public final int docID() {
return in.docID(); return in.docID();
} }
@Override @Override
public int nextStartPosition() throws IOException { public final int nextStartPosition() throws IOException {
return in.nextStartPosition(); if (atFirstInCurrentDoc) {
atFirstInCurrentDoc = false;
return startPos;
}
for (;;) {
startPos = in.nextStartPosition();
if (startPos == NO_MORE_POSITIONS) {
return NO_MORE_POSITIONS;
}
switch(accept(in)) {
case YES:
return startPos;
case NO:
break;
case NO_MORE_IN_CURRENT_DOC:
return startPos = NO_MORE_POSITIONS; // startPos ahead for the current doc.
}
}
} }
@Override @Override
public int startPosition() { public final int startPosition() {
return in.startPosition(); return atFirstInCurrentDoc ? -1 : startPos;
}
@Override
public final int endPosition() {
return atFirstInCurrentDoc ? -1
: (startPos != NO_MORE_POSITIONS) ? in.endPosition() : NO_MORE_POSITIONS;
} }
@Override @Override
public int endPosition() { public final Collection<byte[]> getPayload() throws IOException {
return in.endPosition();
}
@Override
public Collection<byte[]> getPayload() throws IOException {
return in.getPayload(); return in.getPayload();
} }
@Override @Override
public boolean isPayloadAvailable() throws IOException { public final boolean isPayloadAvailable() throws IOException {
return in.isPayloadAvailable(); return in.isPayloadAvailable();
} }
@Override @Override
public long cost() { public final long cost() {
return in.cost(); return in.cost();
} }
@ -88,7 +132,7 @@ public abstract class FilterSpans extends Spans {
} }
@Override @Override
public TwoPhaseIterator asTwoPhaseIterator() { public final TwoPhaseIterator asTwoPhaseIterator() {
TwoPhaseIterator inner = in.asTwoPhaseIterator(); TwoPhaseIterator inner = in.asTwoPhaseIterator();
if (inner != null) { if (inner != null) {
// wrapped instance has an approximation // wrapped instance has an approximation
@ -115,5 +159,46 @@ public abstract class FilterSpans extends Spans {
* <p> * <p>
* This is called during two-phase processing. * This is called during two-phase processing.
*/ */
public abstract boolean twoPhaseCurrentDocMatches() throws IOException; // return true if the current document matches
// Looks for the first candidate in the current document of the wrapped spans
// that accept() approves. nextDoc() and advance() call this right after the
// wrapped spans moved to a new document.
@SuppressWarnings("fallthrough")
private final boolean twoPhaseCurrentDocMatches() throws IOException {
// drop any "already at first match" state left over from a previous document
atFirstInCurrentDoc = false;
// move the wrapped spans to their first position in this document
startPos = in.nextStartPosition();
assert startPos != NO_MORE_POSITIONS;
for (;;) {
switch(accept(in)) {
case YES:
// the wrapped spans are already on a match: flag it so that the next
// nextStartPosition() call returns startPos instead of advancing again
atFirstInCurrentDoc = true;
return true;
case NO:
// candidate rejected: move the wrapped spans to their next position
startPos = in.nextStartPosition();
if (startPos != NO_MORE_POSITIONS) {
// this break only leaves the switch; the enclosing loop tests the new candidate
break;
}
// else fallthrough
case NO_MORE_IN_CURRENT_DOC:
// no accepted candidate in this document: reset startPos to its initial value
startPos = -1;
return false;
}
}
}
/**
* Status returned from {@link FilterSpans#accept(Spans)} that indicates
* whether a candidate match should be accepted, rejected, or rejected
* together with the rest of the current document.
*/
public static enum AcceptStatus {
/**
* Indicates the match should be accepted: the candidate's start position
* is returned to the caller as a match.
*/
YES,
/**
* Indicates the match should be rejected: the next position of the
* wrapped spans in the current document, if any, is tested instead.
*/
NO,
/**
* Indicates the match should be rejected, and the enumeration may continue
* with the next document: the positions of the current document are
* treated as exhausted.
*/
NO_MORE_IN_CURRENT_DOC
};
} }

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.spans;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
import java.io.IOException; import java.io.IOException;
@ -71,11 +72,12 @@ public class NearSpansUnordered extends NearSpans {
private int totalSpanLength; private int totalSpanLength;
private SpansCell maxEndPositionCell; private SpansCell maxEndPositionCell;
private class SpansCell extends FilterSpans { private class SpansCell extends Spans {
private int spanLength = -1; private int spanLength = -1;
final Spans in;
public SpansCell(Spans spans) { public SpansCell(Spans spans) {
super(spans); this.in = spans;
} }
@Override @Override
@ -106,8 +108,48 @@ public class NearSpansUnordered extends NearSpans {
} }
@Override @Override
public boolean twoPhaseCurrentDocMatches() throws IOException { public int startPosition() {
return true; // we don't modify the spans, we just capture information from it. return in.startPosition();
}
@Override
public int endPosition() {
return in.endPosition();
}
@Override
public Collection<byte[]> getPayload() throws IOException {
return in.getPayload();
}
@Override
public boolean isPayloadAvailable() throws IOException {
return in.isPayloadAvailable();
}
@Override
public TwoPhaseIterator asTwoPhaseIterator() {
return in.asTwoPhaseIterator();
}
@Override
public int docID() {
return in.docID();
}
@Override
public int nextDoc() throws IOException {
return in.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return in.advance(target);
}
@Override
public long cost() {
return in.cost();
} }
@Override @Override

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.spans;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import java.io.IOException; import java.io.IOException;

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.spans;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import java.io.IOException; import java.io.IOException;

View File

@ -21,13 +21,13 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext; import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.Objects; import java.util.Objects;
@ -115,158 +115,52 @@ public class SpanNotQuery extends SpanQuery implements Cloneable {
if (excludeSpans == null) { if (excludeSpans == null) {
return includeSpans; return includeSpans;
} }
return new Spans() { TwoPhaseIterator excludeTwoPhase = excludeSpans.asTwoPhaseIterator();
private boolean moreInclude = true; DocIdSetIterator excludeApproximation = excludeTwoPhase == null ? null : excludeTwoPhase.approximation();
private int includeStart = -1;
private int includeEnd = -1; return new FilterSpans(includeSpans) {
private boolean atFirstInCurrentDoc = false; // last document we have checked matches() against for the exclusion, and failed
// when using approximations, so we don't call it again, and pass thru all inclusions.
private boolean moreExclude = excludeSpans.nextDoc() != NO_MORE_DOCS; int lastNonMatchingDoc = -1;
private int excludeStart = moreExclude ? excludeSpans.nextStartPosition() : NO_MORE_POSITIONS;
@Override @Override
public int nextDoc() throws IOException { protected AcceptStatus accept(Spans candidate) throws IOException {
if (moreInclude) { int doc = candidate.docID();
moreInclude = includeSpans.nextDoc() != NO_MORE_DOCS; if (doc > excludeSpans.docID()) {
if (moreInclude) { // catch up 'exclude' to the current doc
atFirstInCurrentDoc = true; if (excludeTwoPhase != null) {
includeStart = includeSpans.nextStartPosition(); if (excludeApproximation.advance(doc) == doc) {
assert includeStart != NO_MORE_POSITIONS; if (!excludeTwoPhase.matches()) {
} lastNonMatchingDoc = doc; // mark as non-match
} }
toNextIncluded();
int res = moreInclude ? includeSpans.docID() : NO_MORE_DOCS;
return res;
}
private void toNextIncluded() throws IOException {
while (moreInclude && moreExclude) {
if (includeSpans.docID() > excludeSpans.docID()) {
moreExclude = excludeSpans.advance(includeSpans.docID()) != NO_MORE_DOCS;
if (moreExclude) {
excludeStart = -1; // only use exclude positions at same doc
}
}
if (excludeForwardInCurrentDocAndAtMatch()) {
break; // at match.
}
// else intersected: keep scanning, to next doc if needed
includeStart = includeSpans.nextStartPosition();
if (includeStart == NO_MORE_POSITIONS) {
moreInclude = includeSpans.nextDoc() != NO_MORE_DOCS;
if (moreInclude) {
atFirstInCurrentDoc = true;
includeStart = includeSpans.nextStartPosition();
assert includeStart != NO_MORE_POSITIONS;
} }
} else {
excludeSpans.advance(doc);
} }
} }
}
if (doc == lastNonMatchingDoc || doc != excludeSpans.docID()) {
private boolean excludeForwardInCurrentDocAndAtMatch() throws IOException { return AcceptStatus.YES;
assert moreInclude;
assert includeStart != NO_MORE_POSITIONS;
if (! moreExclude) {
return true;
} }
if (includeSpans.docID() != excludeSpans.docID()) {
return true; if (excludeSpans.startPosition() == -1) { // init exclude start position if needed
excludeSpans.nextStartPosition();
} }
// at same doc
if (excludeStart == -1) { // init exclude start position if needed while (excludeSpans.endPosition() <= candidate.startPosition() - pre) {
excludeStart = excludeSpans.nextStartPosition();
assert excludeStart != NO_MORE_POSITIONS;
}
while (excludeSpans.endPosition() <= includeStart - pre) {
// exclude end position is before a possible exclusion // exclude end position is before a possible exclusion
excludeStart = excludeSpans.nextStartPosition(); if (excludeSpans.nextStartPosition() == NO_MORE_POSITIONS) {
if (excludeStart == NO_MORE_POSITIONS) { return AcceptStatus.YES; // no more exclude at current doc.
return true; // no more exclude at current doc.
} }
} }
// exclude end position far enough in current doc, check start position: // exclude end position far enough in current doc, check start position:
boolean res = includeSpans.endPosition() + post <= excludeStart; if (candidate.endPosition() + post <= excludeSpans.startPosition()) {
return res; return AcceptStatus.YES;
} } else {
return AcceptStatus.NO;
@Override
public int advance(int target) throws IOException {
if (moreInclude) {
assert target > includeSpans.docID() : "target="+target+", includeSpans.docID()="+includeSpans.docID();
moreInclude = includeSpans.advance(target) != NO_MORE_DOCS;
if (moreInclude) {
atFirstInCurrentDoc = true;
includeStart = includeSpans.nextStartPosition();
assert includeStart != NO_MORE_POSITIONS;
}
} }
toNextIncluded();
int res = moreInclude ? includeSpans.docID() : NO_MORE_DOCS;
return res;
}
@Override
public int docID() {
int res = includeSpans.docID();
return res;
}
@Override
public int nextStartPosition() throws IOException {
assert moreInclude;
if (atFirstInCurrentDoc) {
atFirstInCurrentDoc = false;
assert includeStart != NO_MORE_POSITIONS;
return includeStart;
}
includeStart = includeSpans.nextStartPosition();
while ((includeStart != NO_MORE_POSITIONS)
&& (! excludeForwardInCurrentDocAndAtMatch()))
{
includeStart = includeSpans.nextStartPosition();
}
return includeStart;
}
@Override
public int startPosition() {
assert includeStart == includeSpans.startPosition();
return atFirstInCurrentDoc ? -1 : includeStart;
}
@Override
public int endPosition() {
return atFirstInCurrentDoc ? -1 : includeSpans.endPosition();
}
@Override
public Collection<byte[]> getPayload() throws IOException {
ArrayList<byte[]> result = null;
if (includeSpans.isPayloadAvailable()) {
result = new ArrayList<>(includeSpans.getPayload());
}
return result;
}
@Override
public boolean isPayloadAvailable() throws IOException {
return includeSpans.isPayloadAvailable();
}
@Override
public long cost() {
return includeSpans.cost();
}
@Override
public String toString() {
return "spans(" + SpanNotQuery.this.toString() + ")";
} }
}; };
} }

View File

@ -16,6 +16,7 @@ package org.apache.lucene.search.spans;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import java.io.IOException; import java.io.IOException;

View File

@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext; import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import java.io.IOException; import java.io.IOException;
@ -58,23 +59,6 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
match.extractTerms(terms); match.extractTerms(terms);
} }
/**
* Return value for {@link SpanPositionCheckQuery#acceptPosition(Spans)}.
*/
protected static enum AcceptStatus {
/** Indicates the match should be accepted */
YES,
/** Indicates the match should be rejected */
NO,
/**
* Indicates the match should be rejected, and the enumeration may continue
* with the next document.
*/
NO_MORE_IN_CURRENT_DOC
};
/** /**
* Implementing classes are required to return whether the current position is a match for the passed in * Implementing classes are required to return whether the current position is a match for the passed in
* "match" {@link SpanQuery}. * "match" {@link SpanQuery}.
@ -95,10 +79,14 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
@Override @Override
public Spans getSpans(final LeafReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException { public Spans getSpans(final LeafReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
Spans matchSpans = match.getSpans(context, acceptDocs, termContexts); Spans matchSpans = match.getSpans(context, acceptDocs, termContexts);
return (matchSpans == null) ? null : new PositionCheckSpans(matchSpans); return (matchSpans == null) ? null : new FilterSpans(matchSpans) {
@Override
protected AcceptStatus accept(Spans candidate) throws IOException {
return acceptPosition(candidate);
}
};
} }
@Override @Override
public Query rewrite(IndexReader reader) throws IOException { public Query rewrite(IndexReader reader) throws IOException {
SpanPositionCheckQuery clone = null; SpanPositionCheckQuery clone = null;
@ -116,104 +104,6 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
} }
} }
protected class PositionCheckSpans extends FilterSpans {
private boolean atFirstInCurrentDoc = false;
private int startPos = -1;
public PositionCheckSpans(Spans matchSpans) throws IOException {
super(matchSpans);
}
@Override
public int nextDoc() throws IOException {
while (true) {
int doc = in.nextDoc();
if (doc == NO_MORE_DOCS) {
return NO_MORE_DOCS;
} else if (twoPhaseCurrentDocMatches()) {
return doc;
}
}
}
@Override
public int advance(int target) throws IOException {
int doc = in.advance(target);
while (doc != NO_MORE_DOCS) {
if (twoPhaseCurrentDocMatches()) {
break;
}
doc = in.nextDoc();
}
return doc;
}
@Override
public int nextStartPosition() throws IOException {
if (atFirstInCurrentDoc) {
atFirstInCurrentDoc = false;
return startPos;
}
for (;;) {
startPos = in.nextStartPosition();
if (startPos == NO_MORE_POSITIONS) {
return NO_MORE_POSITIONS;
}
switch(acceptPosition(in)) {
case YES:
return startPos;
case NO:
break;
case NO_MORE_IN_CURRENT_DOC:
return startPos = NO_MORE_POSITIONS; // startPos ahead for the current doc.
}
}
}
// return true if the current document matches
@SuppressWarnings("fallthrough")
public boolean twoPhaseCurrentDocMatches() throws IOException {
atFirstInCurrentDoc = false;
startPos = in.nextStartPosition();
assert startPos != NO_MORE_POSITIONS;
for (;;) {
switch(acceptPosition(in)) {
case YES:
atFirstInCurrentDoc = true;
return true;
case NO:
startPos = in.nextStartPosition();
if (startPos != NO_MORE_POSITIONS) {
break;
}
// else fallthrough
case NO_MORE_IN_CURRENT_DOC:
startPos = -1;
return false;
}
}
}
@Override
public int startPosition() {
return atFirstInCurrentDoc ? -1 : startPos;
}
@Override
public int endPosition() {
return atFirstInCurrentDoc ? -1
: (startPos != NO_MORE_POSITIONS) ? in.endPosition() : NO_MORE_POSITIONS;
}
@Override
public String toString() {
return "spans(" + SpanPositionCheckQuery.this.toString() + ")";
}
}
/** Returns true iff <code>o</code> is equal to this. */ /** Returns true iff <code>o</code> is equal to this. */
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search.spans;
*/ */
import org.apache.lucene.search.spans.FilterSpans.AcceptStatus;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import java.io.IOException; import java.io.IOException;

View File

@ -341,7 +341,7 @@ public class TestBasics extends LuceneTestCase {
assertTrue(searcher.explain(query, 891).getValue() > 0.0f); assertTrue(searcher.explain(query, 891).getValue() > 0.0f);
} }
@Test @Test @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-6418")
public void testNpeInSpanNearInSpanFirstInSpanNot() throws Exception { public void testNpeInSpanNearInSpanFirstInSpanNot() throws Exception {
int n = 5; int n = 5;
SpanTermQuery hun = new SpanTermQuery(new Term("field", "hundred")); SpanTermQuery hun = new SpanTermQuery(new Term("field", "hundred"));