mirror of https://github.com/apache/lucene.git
LUCENE-6198: Two-phase execution for phrase queries and conjunctions.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1659599 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
82eff4eb4d
commit
5b4c02a3a1
|
@ -81,6 +81,11 @@ Optimizations
|
||||||
* LUCENE-6233 Speed up CheckIndex when the index has term vectors
|
* LUCENE-6233 Speed up CheckIndex when the index has term vectors
|
||||||
(Robert Muir, Mike McCandless)
|
(Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-6198: Added the TwoPhaseDocIdSetIterator API, exposed on scorers which
|
||||||
|
is for now only used on phrase queries and conjunctions in order to check
|
||||||
|
positions lazily if the phrase query is in a conjunction with other queries.
|
||||||
|
(Robert Muir, Adrien Grand)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-6204, LUCENE-6208: Simplify CompoundFormat: remove files()
|
* LUCENE-6204, LUCENE-6208: Simplify CompoundFormat: remove files()
|
||||||
|
|
|
@ -0,0 +1,202 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.CollectionUtil;
|
||||||
|
|
||||||
|
class ConjunctionDISI extends DocIdSetIterator {
|
||||||
|
|
||||||
|
/** Create a conjunction over the provided iterators, taking advantage of
|
||||||
|
* {@link TwoPhaseDocIdSetIterator}. */
|
||||||
|
public static ConjunctionDISI intersect(List<? extends DocIdSetIterator> iterators) {
|
||||||
|
final List<DocIdSetIterator> allIterators = new ArrayList<>();
|
||||||
|
final List<TwoPhaseDocIdSetIterator> twoPhaseIterators = new ArrayList<>();
|
||||||
|
for (DocIdSetIterator iterator : iterators) {
|
||||||
|
if (iterator instanceof Scorer) {
|
||||||
|
// if we have a scorer, check if it supports two-phase iteration
|
||||||
|
TwoPhaseDocIdSetIterator twoPhaseIterator = ((Scorer) iterator).asTwoPhaseIterator();
|
||||||
|
if (twoPhaseIterator != null) {
|
||||||
|
// Note:
|
||||||
|
allIterators.add(twoPhaseIterator.approximation());
|
||||||
|
twoPhaseIterators.add(twoPhaseIterator);
|
||||||
|
} else {
|
||||||
|
allIterators.add(iterator);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// no approximation support, use the iterator as-is
|
||||||
|
allIterators.add(iterator);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (twoPhaseIterators.isEmpty()) {
|
||||||
|
return new ConjunctionDISI(allIterators);
|
||||||
|
} else {
|
||||||
|
return new TwoPhase(allIterators, twoPhaseIterators);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final DocIdSetIterator lead;
|
||||||
|
final DocIdSetIterator[] others;
|
||||||
|
|
||||||
|
ConjunctionDISI(List<? extends DocIdSetIterator> iterators) {
|
||||||
|
// Sort the array the first time to allow the least frequent DocsEnum to
|
||||||
|
// lead the matching.
|
||||||
|
CollectionUtil.timSort(iterators, new Comparator<DocIdSetIterator>() {
|
||||||
|
@Override
|
||||||
|
public int compare(DocIdSetIterator o1, DocIdSetIterator o2) {
|
||||||
|
return Long.compare(o1.cost(), o2.cost());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
lead = iterators.get(0);
|
||||||
|
others = iterators.subList(1, iterators.size()).toArray(new DocIdSetIterator[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean matches() throws IOException {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int doNext(int doc) throws IOException {
|
||||||
|
for(;;) {
|
||||||
|
|
||||||
|
if (doc == NO_MORE_DOCS) {
|
||||||
|
// we need this check because it is only ok to call #matches when positioned
|
||||||
|
return NO_MORE_DOCS;
|
||||||
|
}
|
||||||
|
|
||||||
|
advanceHead: for(;;) {
|
||||||
|
for (DocIdSetIterator other : others) {
|
||||||
|
// invariant: docsAndFreqs[i].doc <= doc at this point.
|
||||||
|
|
||||||
|
// docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead"
|
||||||
|
// on the previous iteration and the advance on the lead scorer exactly matched.
|
||||||
|
if (other.docID() < doc) {
|
||||||
|
final int next = other.advance(doc);
|
||||||
|
|
||||||
|
if (next > doc) {
|
||||||
|
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
|
||||||
|
doc = lead.advance(next);
|
||||||
|
break advanceHead;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matches()) {
|
||||||
|
// success - all DocsEnums are on the same doc
|
||||||
|
return doc;
|
||||||
|
} else {
|
||||||
|
doc = lead.nextDoc();
|
||||||
|
break advanceHead;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
return doNext(lead.advance(target));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return lead.docID();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
return doNext(lead.nextDoc());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return lead.cost();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link TwoPhaseDocIdSetIterator} view of a {@link TwoPhase} conjunction.
|
||||||
|
*/
|
||||||
|
private static class TwoPhaseConjunctionDISI extends TwoPhaseDocIdSetIterator {
|
||||||
|
|
||||||
|
private final ConjunctionDISI approximation;
|
||||||
|
private final TwoPhaseDocIdSetIterator[] twoPhaseIterators;
|
||||||
|
|
||||||
|
private TwoPhaseConjunctionDISI(List<? extends DocIdSetIterator> iterators, List<TwoPhaseDocIdSetIterator> twoPhaseIterators) {
|
||||||
|
approximation = new ConjunctionDISI(iterators);
|
||||||
|
assert twoPhaseIterators.size() > 0;
|
||||||
|
this.twoPhaseIterators = twoPhaseIterators.toArray(new TwoPhaseDocIdSetIterator[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocIdSetIterator approximation() {
|
||||||
|
return approximation;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean matches() throws IOException {
|
||||||
|
for (TwoPhaseDocIdSetIterator twoPhaseIterator : twoPhaseIterators) {
|
||||||
|
if (twoPhaseIterator.matches() == false) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A conjunction DISI built on top of approximations. This implementation
|
||||||
|
* verifies that documents actually match by consulting the provided
|
||||||
|
* {@link TwoPhaseDocIdSetIterator}s.
|
||||||
|
*
|
||||||
|
* Another important difference with {@link ConjunctionDISI} is that this
|
||||||
|
* implementation supports approximations too: the approximation of this
|
||||||
|
* impl is the conjunction of the approximations of the wrapped iterators.
|
||||||
|
* This allows eg. {@code +"A B" +C} to be approximated as
|
||||||
|
* {@code +(+A +B) +C}.
|
||||||
|
*/
|
||||||
|
// NOTE: this is essentially the same as TwoPhaseDocIdSetIterator.asDocIdSetIterator
|
||||||
|
// but is its own impl in order to be able to expose a two-phase view
|
||||||
|
private static class TwoPhase extends ConjunctionDISI {
|
||||||
|
|
||||||
|
final TwoPhaseConjunctionDISI twoPhaseView;
|
||||||
|
|
||||||
|
private TwoPhase(List<? extends DocIdSetIterator> iterators, List<TwoPhaseDocIdSetIterator> twoPhaseIterators) {
|
||||||
|
super(iterators);
|
||||||
|
twoPhaseView = new TwoPhaseConjunctionDISI(iterators, twoPhaseIterators);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TwoPhaseConjunctionDISI asTwoPhaseIterator() {
|
||||||
|
return twoPhaseView;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean matches() throws IOException {
|
||||||
|
return twoPhaseView.matches();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -20,18 +20,14 @@ package org.apache.lucene.search;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
/** Scorer for conjunctions, sets of queries, all of which are required. */
|
/** Scorer for conjunctions, sets of queries, all of which are required. */
|
||||||
class ConjunctionScorer extends Scorer {
|
class ConjunctionScorer extends Scorer {
|
||||||
|
|
||||||
protected int lastDoc = -1;
|
private final ConjunctionDISI disi;
|
||||||
protected final DocsAndFreqs[] docsAndFreqs;
|
|
||||||
private final DocsAndFreqs lead;
|
|
||||||
private final Scorer[] scorers;
|
private final Scorer[] scorers;
|
||||||
private final float coord;
|
private final float coord;
|
||||||
|
|
||||||
|
@ -44,68 +40,28 @@ class ConjunctionScorer extends Scorer {
|
||||||
super(weight);
|
super(weight);
|
||||||
assert required.containsAll(scorers);
|
assert required.containsAll(scorers);
|
||||||
this.coord = coord;
|
this.coord = coord;
|
||||||
this.docsAndFreqs = new DocsAndFreqs[required.size()];
|
this.disi = ConjunctionDISI.intersect(required);
|
||||||
for (int i = 0; i < required.size(); ++i) {
|
|
||||||
docsAndFreqs[i] = new DocsAndFreqs(required.get(i));
|
|
||||||
}
|
|
||||||
// Sort the array the first time to allow the least frequent DocsEnum to
|
|
||||||
// lead the matching.
|
|
||||||
ArrayUtil.timSort(docsAndFreqs, new Comparator<DocsAndFreqs>() {
|
|
||||||
@Override
|
|
||||||
public int compare(DocsAndFreqs o1, DocsAndFreqs o2) {
|
|
||||||
return Long.compare(o1.cost, o2.cost);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
lead = docsAndFreqs[0]; // least frequent DocsEnum leads the intersection
|
|
||||||
|
|
||||||
this.scorers = scorers.toArray(new Scorer[scorers.size()]);
|
this.scorers = scorers.toArray(new Scorer[scorers.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int doNext(int doc) throws IOException {
|
@Override
|
||||||
for(;;) {
|
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||||
// doc may already be NO_MORE_DOCS here, but we don't check explicitly
|
return disi.asTwoPhaseIterator();
|
||||||
// since all scorers should advance to NO_MORE_DOCS, match, then
|
|
||||||
// return that value.
|
|
||||||
advanceHead: for(;;) {
|
|
||||||
for (int i = 1; i < docsAndFreqs.length; i++) {
|
|
||||||
// invariant: docsAndFreqs[i].doc <= doc at this point.
|
|
||||||
|
|
||||||
// docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead"
|
|
||||||
// on the previous iteration and the advance on the lead scorer exactly matched.
|
|
||||||
if (docsAndFreqs[i].doc < doc) {
|
|
||||||
docsAndFreqs[i].doc = docsAndFreqs[i].iterator.advance(doc);
|
|
||||||
|
|
||||||
if (docsAndFreqs[i].doc > doc) {
|
|
||||||
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
|
|
||||||
doc = docsAndFreqs[i].doc;
|
|
||||||
break advanceHead;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// success - all DocsEnums are on the same doc
|
|
||||||
return doc;
|
|
||||||
}
|
|
||||||
// advance head for next iteration
|
|
||||||
doc = lead.doc = lead.iterator.advance(doc);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int advance(int target) throws IOException {
|
public int advance(int target) throws IOException {
|
||||||
lead.doc = lead.iterator.advance(target);
|
return disi.advance(target);
|
||||||
return lastDoc = doNext(lead.doc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int docID() {
|
public int docID() {
|
||||||
return lastDoc;
|
return disi.docID();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextDoc() throws IOException {
|
public int nextDoc() throws IOException {
|
||||||
lead.doc = lead.iterator.nextDoc();
|
return disi.nextDoc();
|
||||||
return lastDoc = doNext(lead.doc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -120,7 +76,7 @@ class ConjunctionScorer extends Scorer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int freq() {
|
public int freq() {
|
||||||
return docsAndFreqs.length;
|
return scorers.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -145,12 +101,12 @@ class ConjunctionScorer extends Scorer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long cost() {
|
public long cost() {
|
||||||
return lead.iterator.cost();
|
return disi.cost();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<ChildScorer> getChildren() {
|
public Collection<ChildScorer> getChildren() {
|
||||||
ArrayList<ChildScorer> children = new ArrayList<>(docsAndFreqs.length);
|
ArrayList<ChildScorer> children = new ArrayList<>();
|
||||||
for (Scorer scorer : scorers) {
|
for (Scorer scorer : scorers) {
|
||||||
children.add(new ChildScorer(scorer, "MUST"));
|
children.add(new ChildScorer(scorer, "MUST"));
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,9 +18,11 @@ package org.apache.lucene.search;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.index.*;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
import org.apache.lucene.search.similarities.Similarity;
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
|
@ -49,10 +51,11 @@ final class ExactPhraseScorer extends Scorer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final ConjunctionDISI conjunction;
|
||||||
|
|
||||||
private final ChunkState[] chunkStates;
|
private final ChunkState[] chunkStates;
|
||||||
private final PostingsEnum lead;
|
private final PostingsEnum lead;
|
||||||
|
|
||||||
private int docID = -1;
|
|
||||||
private int freq;
|
private int freq;
|
||||||
|
|
||||||
private final Similarity.SimScorer docScorer;
|
private final Similarity.SimScorer docScorer;
|
||||||
|
@ -72,49 +75,46 @@ final class ExactPhraseScorer extends Scorer {
|
||||||
// min(cost)
|
// min(cost)
|
||||||
cost = lead.cost();
|
cost = lead.cost();
|
||||||
|
|
||||||
|
List<DocIdSetIterator> iterators = new ArrayList<>();
|
||||||
for(int i=0;i<postings.length;i++) {
|
for(int i=0;i<postings.length;i++) {
|
||||||
chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
|
chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
|
||||||
|
iterators.add(postings[i].postings);
|
||||||
}
|
}
|
||||||
|
conjunction = ConjunctionDISI.intersect(iterators);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||||
|
return new TwoPhaseDocIdSetIterator() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean matches() throws IOException {
|
||||||
|
return phraseFreq() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocIdSetIterator approximation() {
|
||||||
|
return conjunction;
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private int doNext(int doc) throws IOException {
|
private int doNext(int doc) throws IOException {
|
||||||
for(;;) {
|
for (;; doc = conjunction.nextDoc()) {
|
||||||
// TODO: don't dup this logic from conjunctionscorer :)
|
if (doc == NO_MORE_DOCS || phraseFreq() > 0) {
|
||||||
advanceHead: for(;;) {
|
return doc;
|
||||||
for (int i = 1; i < chunkStates.length; i++) {
|
|
||||||
final PostingsEnum de = chunkStates[i].posEnum;
|
|
||||||
if (de.docID() < doc) {
|
|
||||||
int d = de.advance(doc);
|
|
||||||
|
|
||||||
if (d > doc) {
|
|
||||||
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
|
|
||||||
doc = d;
|
|
||||||
break advanceHead;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// all DocsEnums are on the same doc
|
|
||||||
if (doc == NO_MORE_DOCS) {
|
|
||||||
return doc;
|
|
||||||
} else if (phraseFreq() > 0) {
|
|
||||||
return doc; // success: matches phrase
|
|
||||||
} else {
|
|
||||||
doc = lead.nextDoc(); // doesn't match phrase
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// advance head for next iteration
|
|
||||||
doc = lead.advance(doc);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextDoc() throws IOException {
|
public int nextDoc() throws IOException {
|
||||||
return docID = doNext(lead.nextDoc());
|
return doNext(conjunction.nextDoc());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int advance(int target) throws IOException {
|
public int advance(int target) throws IOException {
|
||||||
return docID = doNext(lead.advance(target));
|
return doNext(conjunction.advance(target));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -149,12 +149,12 @@ final class ExactPhraseScorer extends Scorer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int docID() {
|
public int docID() {
|
||||||
return docID;
|
return conjunction.docID();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public float score() {
|
public float score() {
|
||||||
return docScorer.score(docID, freq);
|
return docScorer.score(docID(), freq);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int phraseFreq() throws IOException {
|
private int phraseFreq() throws IOException {
|
||||||
|
|
|
@ -99,4 +99,23 @@ public abstract class Scorer extends PostingsEnum {
|
||||||
this.relationship = relationship;
|
this.relationship = relationship;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optional method: Return a {@link TwoPhaseDocIdSetIterator} view of this
|
||||||
|
* {@link Scorer}. A return value of {@code null} indicates that
|
||||||
|
* two-phase iteration is not supported.
|
||||||
|
*
|
||||||
|
* Note that the returned {@link TwoPhaseDocIdSetIterator}'s
|
||||||
|
* {@link TwoPhaseDocIdSetIterator#approximation() approximation} must
|
||||||
|
* advance synchronously with this iterator: advancing the approximation must
|
||||||
|
* advance this iterator and vice-versa.
|
||||||
|
*
|
||||||
|
* Implementing this method is typically useful on {@link Scorer}s
|
||||||
|
* that have a high per-document overhead in order to confirm matches.
|
||||||
|
*
|
||||||
|
* The default implementation returns {@code null}.
|
||||||
|
*/
|
||||||
|
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,81 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An approximation of a {@link DocIdSetIterator}. When the {@link #approximation()}'s
|
||||||
|
* {@link DocIdSetIterator#nextDoc()} or {@link DocIdSetIterator#advance(int)}
|
||||||
|
* return, {@link #matches()} needs to be checked in order to know whether the
|
||||||
|
* returned doc ID actually matches.
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public abstract class TwoPhaseDocIdSetIterator {
|
||||||
|
|
||||||
|
/** Return a {@link DocIdSetIterator} view of the provided
|
||||||
|
* {@link TwoPhaseDocIdSetIterator}. */
|
||||||
|
public static DocIdSetIterator asDocIdSetIterator(TwoPhaseDocIdSetIterator twoPhaseIterator) {
|
||||||
|
final DocIdSetIterator approximation = twoPhaseIterator.approximation();
|
||||||
|
return new DocIdSetIterator() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return approximation.docID();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
return doNext(approximation.nextDoc());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
return doNext(approximation.advance(target));
|
||||||
|
}
|
||||||
|
|
||||||
|
private int doNext(int doc) throws IOException {
|
||||||
|
for (;; doc = approximation.nextDoc()) {
|
||||||
|
if (doc == NO_MORE_DOCS) {
|
||||||
|
return NO_MORE_DOCS;
|
||||||
|
} else if (twoPhaseIterator.matches()) {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return approximation.cost();
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return an approximation. The returned {@link DocIdSetIterator} is a
|
||||||
|
* superset of the matching documents, and each match needs to be confirmed
|
||||||
|
* with {@link #matches()} in order to know whether it matches or not. */
|
||||||
|
public abstract DocIdSetIterator approximation();
|
||||||
|
|
||||||
|
/** Return whether the current doc ID that the iterator is on matches. This
|
||||||
|
* method should only be called when the iterator is positionned, ie. not
|
||||||
|
* when {@link DocIdSetIterator#docID()} is {@code -1} or
|
||||||
|
* {@link DocIdSetIterator#NO_MORE_DOCS}. */
|
||||||
|
public abstract boolean matches() throws IOException;
|
||||||
|
|
||||||
|
}
|
|
@ -590,4 +590,33 @@ public class TestBooleanQuery extends LuceneTestCase {
|
||||||
w.close();
|
w.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testConjunctionSupportsApproximations() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
Field f = newTextField("field", "a b c", Field.Store.NO);
|
||||||
|
doc.add(f);
|
||||||
|
w.addDocument(doc);
|
||||||
|
w.commit();
|
||||||
|
|
||||||
|
DirectoryReader reader = w.getReader();
|
||||||
|
final IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
|
||||||
|
PhraseQuery pq = new PhraseQuery();
|
||||||
|
pq.add(new Term("field", "a"));
|
||||||
|
pq.add(new Term("field", "b"));
|
||||||
|
|
||||||
|
BooleanQuery q = new BooleanQuery();
|
||||||
|
q.add(pq, Occur.MUST);
|
||||||
|
q.add(new TermQuery(new Term("field", "c")), Occur.FILTER);
|
||||||
|
|
||||||
|
final Weight weight = searcher.createNormalizedWeight(q, random().nextBoolean());
|
||||||
|
final Scorer scorer = weight.scorer(reader.leaves().get(0), null);
|
||||||
|
assertNotNull(scorer.asTwoPhaseIterator());
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
w.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,273 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BitDocIdSet;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
public class TestConjunctionDISI extends LuceneTestCase {
|
||||||
|
|
||||||
|
private static TwoPhaseDocIdSetIterator approximation(final DocIdSetIterator iterator, final FixedBitSet confirmed) {
|
||||||
|
return new TwoPhaseDocIdSetIterator() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocIdSetIterator approximation() {
|
||||||
|
return iterator;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean matches() throws IOException {
|
||||||
|
return confirmed.get(iterator.docID());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Scorer scorer(TwoPhaseDocIdSetIterator twoPhaseIterator) {
|
||||||
|
return scorer(TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator), twoPhaseIterator);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a {@link Scorer} that wraps the given {@link DocIdSetIterator}. It
|
||||||
|
* also accepts a {@link TwoPhaseDocIdSetIterator} view, which is exposed in
|
||||||
|
* {@link Scorer#asTwoPhaseIterator()}. When the two-phase view is not null,
|
||||||
|
* then {@link Scorer#nextDoc()} and {@link Scorer#advance(int)} will raise
|
||||||
|
* an exception in order to make sure that {@link ConjunctionDISI} takes
|
||||||
|
* advantage of the {@link TwoPhaseDocIdSetIterator} view.
|
||||||
|
*/
|
||||||
|
private static Scorer scorer(DocIdSetIterator it, TwoPhaseDocIdSetIterator twoPhaseIterator) {
|
||||||
|
return new Scorer(null) {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||||
|
return twoPhaseIterator;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
if (twoPhaseIterator != null) {
|
||||||
|
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||||
|
}
|
||||||
|
return it.docID();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
if (twoPhaseIterator != null) {
|
||||||
|
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||||
|
}
|
||||||
|
return it.nextDoc();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
if (twoPhaseIterator != null) {
|
||||||
|
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||||
|
}
|
||||||
|
return it.advance(target);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
if (twoPhaseIterator != null) {
|
||||||
|
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||||
|
}
|
||||||
|
return it.cost();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float score() throws IOException {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int freq() throws IOException {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextPosition() throws IOException {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int startOffset() throws IOException {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int endOffset() throws IOException {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef getPayload() throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static FixedBitSet randomSet(int maxDoc) {
|
||||||
|
final int step = TestUtil.nextInt(random(), 1, 10);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
for (int doc = random().nextInt(step); doc < maxDoc; doc += TestUtil.nextInt(random(), 1, step)) {
|
||||||
|
set.set(doc);
|
||||||
|
}
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static FixedBitSet clearRandomBits(FixedBitSet other) {
|
||||||
|
final FixedBitSet set = new FixedBitSet(other.length());
|
||||||
|
set.or(other);
|
||||||
|
for (int i = 0; i < set.length(); ++i) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
set.clear(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static FixedBitSet intersect(FixedBitSet[] bitSets) {
|
||||||
|
final FixedBitSet intersection = new FixedBitSet(bitSets[0].length());
|
||||||
|
intersection.or(bitSets[0]);
|
||||||
|
for (int i = 1; i < bitSets.length; ++i) {
|
||||||
|
intersection.and(bitSets[i]);
|
||||||
|
}
|
||||||
|
return intersection;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static FixedBitSet toBitSet(int maxDoc, DocIdSetIterator iterator) throws IOException {
|
||||||
|
final FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
|
||||||
|
set.set(doc);
|
||||||
|
}
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that the conjunction iterator is correct
|
||||||
|
public void testConjunction() throws IOException {
|
||||||
|
final int iters = atLeast(100);
|
||||||
|
for (int iter = 0; iter < iters; ++iter) {
|
||||||
|
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
|
||||||
|
final int numIterators = TestUtil.nextInt(random(), 2, 5);
|
||||||
|
final FixedBitSet[] sets = new FixedBitSet[numIterators];
|
||||||
|
final DocIdSetIterator[] iterators = new DocIdSetIterator[numIterators];
|
||||||
|
for (int i = 0; i < iterators.length; ++i) {
|
||||||
|
final FixedBitSet set = randomSet(maxDoc);
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
// simple iterator
|
||||||
|
sets[i] = set;
|
||||||
|
iterators[i] = new BitDocIdSet(set).iterator();
|
||||||
|
} else {
|
||||||
|
// scorer with approximation
|
||||||
|
final FixedBitSet confirmed = clearRandomBits(set);
|
||||||
|
sets[i] = confirmed;
|
||||||
|
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
|
||||||
|
iterators[i] = scorer(approximation);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final ConjunctionDISI conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
|
||||||
|
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that the conjunction approximation is correct
|
||||||
|
public void testConjunctionApproximation() throws IOException {
|
||||||
|
final int iters = atLeast(100);
|
||||||
|
for (int iter = 0; iter < iters; ++iter) {
|
||||||
|
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
|
||||||
|
final int numIterators = TestUtil.nextInt(random(), 2, 5);
|
||||||
|
final FixedBitSet[] sets = new FixedBitSet[numIterators];
|
||||||
|
final DocIdSetIterator[] iterators = new DocIdSetIterator[numIterators];
|
||||||
|
boolean hasApproximation = false;
|
||||||
|
for (int i = 0; i < iterators.length; ++i) {
|
||||||
|
final FixedBitSet set = randomSet(maxDoc);
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
// simple iterator
|
||||||
|
sets[i] = set;
|
||||||
|
iterators[i] = new BitDocIdSet(set).iterator();
|
||||||
|
} else {
|
||||||
|
// scorer with approximation
|
||||||
|
final FixedBitSet confirmed = clearRandomBits(set);
|
||||||
|
sets[i] = confirmed;
|
||||||
|
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
|
||||||
|
iterators[i] = scorer(approximation);
|
||||||
|
hasApproximation = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final ConjunctionDISI conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
|
||||||
|
TwoPhaseDocIdSetIterator twoPhaseIterator = conjunction.asTwoPhaseIterator();
|
||||||
|
assertEquals(hasApproximation, twoPhaseIterator != null);
|
||||||
|
if (hasApproximation) {
|
||||||
|
assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This test makes sure that when nesting scorers with ConjunctionDISI, confirmations are pushed to the root.
|
||||||
|
public void testRecursiveConjunctionApproximation() throws IOException {
|
||||||
|
final int iters = atLeast(100);
|
||||||
|
for (int iter = 0; iter < iters; ++iter) {
|
||||||
|
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
|
||||||
|
final int numIterators = TestUtil.nextInt(random(), 2, 5);
|
||||||
|
final FixedBitSet[] sets = new FixedBitSet[numIterators];
|
||||||
|
DocIdSetIterator conjunction = null;
|
||||||
|
boolean hasApproximation = false;
|
||||||
|
for (int i = 0; i < numIterators; ++i) {
|
||||||
|
final FixedBitSet set = randomSet(maxDoc);
|
||||||
|
final DocIdSetIterator newIterator;
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
// simple iterator
|
||||||
|
sets[i] = set;
|
||||||
|
newIterator = new BitDocIdSet(set).iterator();
|
||||||
|
} else {
|
||||||
|
// scorer with approximation
|
||||||
|
final FixedBitSet confirmed = clearRandomBits(set);
|
||||||
|
sets[i] = confirmed;
|
||||||
|
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
|
||||||
|
newIterator = scorer(approximation);
|
||||||
|
hasApproximation = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (conjunction == null) {
|
||||||
|
conjunction = newIterator;
|
||||||
|
} else {
|
||||||
|
final ConjunctionDISI conj = ConjunctionDISI.intersect(Arrays.asList(conjunction, newIterator));
|
||||||
|
conjunction = scorer(conj, conj.asTwoPhaseIterator());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TwoPhaseDocIdSetIterator twoPhaseIterator = ((Scorer) conjunction).asTwoPhaseIterator();
|
||||||
|
assertEquals(hasApproximation, twoPhaseIterator != null);
|
||||||
|
if (hasApproximation) {
|
||||||
|
assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator)));
|
||||||
|
} else {
|
||||||
|
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue