LUCENE-6373: complete two phase doc id iteration support for Spans

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675776 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2015-04-24 05:12:20 +00:00
parent 558d85f892
commit 5105d036bd
15 changed files with 429 additions and 285 deletions

View File

@ -50,6 +50,10 @@ New Features
FilterSpans to just have an accept(Spans candidate) method for
subclasses. (Robert Muir)
* LUCENE-6373: SpanOrQuery shares disjunction logic with boolean
queries, and supports two-phased iterators to avoid loading
positions when possible. (Paul Elschot via Robert Muir)
* LUCENE-6352: Added a new query time join to the join module that uses
global ordinals, which is faster for subsequent joins between reopens.
(Martijn van Groningen, Adrien Grand)

View File

@ -23,7 +23,6 @@ import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.search.spans.Spans;
/** A conjunction of DocIdSetIterators.
* This iterates over the doc ids that are present in each given DocIdSetIterator.
@ -35,20 +34,16 @@ public class ConjunctionDISI extends DocIdSetIterator {
/** Create a conjunction over the provided iterators, taking advantage of
* {@link TwoPhaseIterator}. */
public static ConjunctionDISI intersect(List<? extends DocIdSetIterator> iterators) {
assert iterators.size() >= 2;
final List<DocIdSetIterator> allIterators = new ArrayList<>();
final List<TwoPhaseIterator> twoPhaseIterators = new ArrayList<>();
for (DocIdSetIterator iterator : iterators) {
TwoPhaseIterator twoPhaseIterator = null;
if (iterator instanceof Scorer) {
twoPhaseIterator = ((Scorer) iterator).asTwoPhaseIterator();
} else if (iterator instanceof Spans) {
twoPhaseIterator = ((Spans) iterator).asTwoPhaseIterator();
}
if (twoPhaseIterator != null) {
allIterators.add(twoPhaseIterator.approximation());
twoPhaseIterators.add(twoPhaseIterator);
for (DocIdSetIterator iter : iterators) {
TwoPhaseIterator twoPhaseIter = TwoPhaseIterator.asTwoPhaseIterator(iter);
if (twoPhaseIter != null) {
allIterators.add(twoPhaseIter.approximation());
twoPhaseIterators.add(twoPhaseIter);
} else { // no approximation support, use the iterator as-is
allIterators.add(iterator);
allIterators.add(iter);
}
}
@ -63,6 +58,7 @@ public class ConjunctionDISI extends DocIdSetIterator {
final DocIdSetIterator[] others;
ConjunctionDISI(List<? extends DocIdSetIterator> iterators) {
assert iterators.size() >= 2;
// Sort the array the first time to allow the least frequent DocsEnum to
// lead the matching.
CollectionUtil.timSort(iterators, new Comparator<DocIdSetIterator>() {

View File

@ -23,37 +23,13 @@ import java.util.Iterator;
import org.apache.lucene.util.PriorityQueue;
/**
* A priority queue of scorers that orders by current doc ID.
* A priority queue of DocIdSetIterators that orders by current doc ID.
* This specialization is needed over {@link PriorityQueue} because the
* pluggable comparison function makes the rebalancing quite slow.
* @lucene.internal
*/
final class ScorerPriorityQueue implements Iterable<org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper> {
static class ScorerWrapper {
final Scorer scorer;
final long cost;
int doc; // the current doc, used for comparison
ScorerWrapper next; // reference to a next element, see #topList
// An approximation of the scorer, or the scorer itself if it does not
// support two-phase iteration
final DocIdSetIterator approximation;
// A two-phase view of the scorer, or null if the scorer does not support
// two-phase iteration
final TwoPhaseIterator twoPhaseView;
ScorerWrapper(Scorer scorer) {
this.scorer = scorer;
this.cost = scorer.cost();
this.doc = -1;
this.twoPhaseView = scorer.asTwoPhaseIterator();
if (twoPhaseView != null) {
approximation = twoPhaseView.approximation();
} else {
approximation = scorer;
}
}
}
public final class DisiPriorityQueue<Iter extends DocIdSetIterator>
implements Iterable<DisiWrapper<Iter>> {
static int leftNode(int node) {
return ((node + 1) << 1) - 1;
@ -67,27 +43,27 @@ final class ScorerPriorityQueue implements Iterable<org.apache.lucene.search.Sco
return ((node + 1) >>> 1) - 1;
}
private final ScorerWrapper[] heap;
private final DisiWrapper<Iter>[] heap;
private int size;
ScorerPriorityQueue(int maxSize) {
heap = new ScorerWrapper[maxSize];
public DisiPriorityQueue(int maxSize) {
heap = new DisiWrapper[maxSize];
size = 0;
}
int size() {
public int size() {
return size;
}
ScorerWrapper top() {
public DisiWrapper<Iter> top() {
return heap[0];
}
/** Get the list of scorers which are on the current doc. */
ScorerWrapper topList() {
final ScorerWrapper[] heap = this.heap;
public DisiWrapper<Iter> topList() {
final DisiWrapper<Iter>[] heap = this.heap;
final int size = this.size;
ScorerWrapper list = heap[0];
DisiWrapper<Iter> list = heap[0];
list.next = null;
if (size >= 3) {
list = topList(list, heap, size, 1);
@ -98,14 +74,15 @@ final class ScorerPriorityQueue implements Iterable<org.apache.lucene.search.Sco
return list;
}
// prepend w1 (scorer) to w2 (list)
private static ScorerWrapper prepend(ScorerWrapper w1, ScorerWrapper w2) {
// prepend w1 (iterator) to w2 (list)
private DisiWrapper<Iter> prepend(DisiWrapper<Iter> w1, DisiWrapper<Iter> w2) {
w1.next = w2;
return w1;
}
private static ScorerWrapper topList(ScorerWrapper list, ScorerWrapper[] heap, int size, int i) {
final ScorerWrapper w = heap[i];
private DisiWrapper<Iter> topList(DisiWrapper<Iter> list, DisiWrapper<Iter>[] heap,
int size, int i) {
final DisiWrapper<Iter> w = heap[i];
if (w.doc == list.doc) {
list = prepend(w, list);
final int left = leftNode(i);
@ -120,37 +97,37 @@ final class ScorerPriorityQueue implements Iterable<org.apache.lucene.search.Sco
return list;
}
ScorerWrapper add(ScorerWrapper entry) {
final ScorerWrapper[] heap = this.heap;
public DisiWrapper<Iter> add(DisiWrapper<Iter> entry) {
final DisiWrapper<Iter>[] heap = this.heap;
final int size = this.size;
heap[size] = entry;
upHeap(heap, size);
upHeap(size);
this.size = size + 1;
return heap[0];
}
ScorerWrapper pop() {
final ScorerWrapper[] heap = this.heap;
final ScorerWrapper result = heap[0];
public DisiWrapper<Iter> pop() {
final DisiWrapper<Iter>[] heap = this.heap;
final DisiWrapper<Iter> result = heap[0];
final int i = --size;
heap[0] = heap[i];
heap[i] = null;
downHeap(heap, i);
downHeap(i);
return result;
}
ScorerWrapper updateTop() {
downHeap(heap, size);
public DisiWrapper<Iter> updateTop() {
downHeap(size);
return heap[0];
}
ScorerWrapper updateTop(ScorerWrapper topReplacement) {
DisiWrapper<Iter> updateTop(DisiWrapper<Iter> topReplacement) {
heap[0] = topReplacement;
return updateTop();
}
static void upHeap(ScorerWrapper[] heap, int i) {
final ScorerWrapper node = heap[i];
void upHeap(int i) {
final DisiWrapper<Iter> node = heap[i];
final int nodeDoc = node.doc;
int j = parentNode(i);
while (j >= 0 && nodeDoc < heap[j].doc) {
@ -161,9 +138,9 @@ final class ScorerPriorityQueue implements Iterable<org.apache.lucene.search.Sco
heap[i] = node;
}
static void downHeap(ScorerWrapper[] heap, int size) {
void downHeap(int size) {
int i = 0;
final ScorerWrapper node = heap[0];
final DisiWrapper<Iter> node = heap[0];
int j = leftNode(i);
if (j < size) {
int k = rightNode(j);
@ -186,8 +163,10 @@ final class ScorerPriorityQueue implements Iterable<org.apache.lucene.search.Sco
}
@Override
public Iterator<ScorerWrapper> iterator() {
public Iterator<DisiWrapper<Iter>> iterator() {
return Arrays.asList(heap).subList(0, size).iterator();
}
}

View File

@ -0,0 +1,55 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Wrapper used in {@link DisiPriorityQueue}.
* @lucene.internal
*/
public class DisiWrapper<Iter extends DocIdSetIterator> {
public final Iter iterator;
public final long cost;
public int doc; // the current doc, used for comparison
public DisiWrapper<Iter> next; // reference to a next element, see #topList
// An approximation of the iterator, or the iterator itself if it does not
// support two-phase iteration
public final DocIdSetIterator approximation;
// A two-phase view of the iterator, or null if the iterator does not support
// two-phase iteration
public final TwoPhaseIterator twoPhaseView;
public int lastApproxMatchDoc; // last doc of approximation that did match
public int lastApproxNonMatchDoc; // last doc of approximation that did not match
public DisiWrapper(Iter iterator) {
this.iterator = iterator;
this.cost = iterator.cost();
this.doc = -1;
this.twoPhaseView = TwoPhaseIterator.asTwoPhaseIterator(iterator);
if (twoPhaseView != null) {
approximation = twoPhaseView.approximation();
} else {
approximation = iterator;
}
this.lastApproxNonMatchDoc = -2;
this.lastApproxMatchDoc = -2;
}
}

View File

@ -0,0 +1,75 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/**
* A {@link DocIdSetIterator} which is a disjunction of the approximations of
* the provided iterators.
* @lucene.internal
*/
public class DisjunctionDISIApproximation<Iter extends DocIdSetIterator>
extends DocIdSetIterator {
final DisiPriorityQueue<Iter> subIterators;
final long cost;
public DisjunctionDISIApproximation(DisiPriorityQueue<Iter> subIterators) {
this.subIterators = subIterators;
long cost = 0;
for (DisiWrapper<Iter> w : subIterators) {
cost += w.cost;
}
this.cost = cost;
}
@Override
public long cost() {
return cost;
}
@Override
public int docID() {
return subIterators.top().doc;
}
@Override
public int nextDoc() throws IOException {
DisiWrapper<Iter> top = subIterators.top();
final int doc = top.doc;
do {
top.doc = top.approximation.nextDoc();
top = subIterators.updateTop();
} while (top.doc == doc);
return top.doc;
}
@Override
public int advance(int target) throws IOException {
DisiWrapper<Iter> top = subIterators.top();
do {
top.doc = top.approximation.advance(target);
top = subIterators.updateTop();
} while (top.doc < target);
return top.doc;
}
}

View File

@ -19,8 +19,6 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper;
/**
* The Scorer for DisjunctionMaxQuery. The union of all documents generated by the the subquery scorers
* is generated in document number order. The score for each document is the maximum of the scores computed
@ -48,11 +46,11 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
}
@Override
protected float score(ScorerWrapper topList) throws IOException {
protected float score(DisiWrapper<Scorer> topList) throws IOException {
float scoreSum = 0;
float scoreMax = 0;
for (ScorerWrapper w = topList; w != null; w = w.next) {
final float subScore = w.scorer.score();
for (DisiWrapper<Scorer> w = topList; w != null; w = w.next) {
final float subScore = w.iterator.score();
scoreSum += subScore;
if (subScore > scoreMax) {
scoreMax = subScore;

View File

@ -22,29 +22,27 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper;
/**
* Base class for Scorers that score disjunctions.
*/
abstract class DisjunctionScorer extends Scorer {
private final boolean needsScores;
private final ScorerPriorityQueue subScorers;
private final DisiPriorityQueue<Scorer> subScorers;
private final long cost;
/** Linked list of scorers which are on the current doc */
private ScorerWrapper topScorers;
private DisiWrapper<Scorer> topScorers;
protected DisjunctionScorer(Weight weight, List<Scorer> subScorers, boolean needsScores) {
super(weight);
if (subScorers.size() <= 1) {
throw new IllegalArgumentException("There must be at least 2 subScorers");
}
this.subScorers = new ScorerPriorityQueue(subScorers.size());
this.subScorers = new DisiPriorityQueue<Scorer>(subScorers.size());
long cost = 0;
for (Scorer scorer : subScorers) {
final ScorerWrapper w = new ScorerWrapper(scorer);
final DisiWrapper<Scorer> w = new DisiWrapper<>(scorer);
cost += w.cost;
this.subScorers.add(w);
}
@ -52,69 +50,17 @@ abstract class DisjunctionScorer extends Scorer {
this.needsScores = needsScores;
}
/**
* A {@link DocIdSetIterator} which is a disjunction of the approximations of
* the provided iterators.
*/
private static class DisjunctionDISIApproximation extends DocIdSetIterator {
final ScorerPriorityQueue subScorers;
final long cost;
DisjunctionDISIApproximation(ScorerPriorityQueue subScorers) {
this.subScorers = subScorers;
long cost = 0;
for (ScorerWrapper w : subScorers) {
cost += w.cost;
}
this.cost = cost;
}
@Override
public long cost() {
return cost;
}
@Override
public int docID() {
return subScorers.top().doc;
}
@Override
public int nextDoc() throws IOException {
ScorerWrapper top = subScorers.top();
final int doc = top.doc;
do {
top.doc = top.approximation.nextDoc();
top = subScorers.updateTop();
} while (top.doc == doc);
return top.doc;
}
@Override
public int advance(int target) throws IOException {
ScorerWrapper top = subScorers.top();
do {
top.doc = top.approximation.advance(target);
top = subScorers.updateTop();
} while (top.doc < target);
return top.doc;
}
}
@Override
public TwoPhaseIterator asTwoPhaseIterator() {
boolean hasApproximation = false;
for (ScorerWrapper w : subScorers) {
for (DisiWrapper<Scorer> w : subScorers) {
if (w.twoPhaseView != null) {
hasApproximation = true;
break;
}
}
if (hasApproximation == false) {
if (! hasApproximation) {
// none of the sub scorers supports approximations
return null;
}
@ -122,13 +68,13 @@ abstract class DisjunctionScorer extends Scorer {
// note it is important to share the same pq as this scorer so that
// rebalancing the pq through the approximation will also rebalance
// the pq in this scorer.
return new TwoPhaseIterator(new DisjunctionDISIApproximation(subScorers)) {
return new TwoPhaseIterator(new DisjunctionDISIApproximation<Scorer>(subScorers)) {
@Override
public boolean matches() throws IOException {
ScorerWrapper topScorers = subScorers.topList();
DisiWrapper<Scorer> topScorers = subScorers.topList();
// remove the head of the list as long as it does not match
while (topScorers.twoPhaseView != null && topScorers.twoPhaseView.matches() == false) {
while (topScorers.twoPhaseView != null && ! topScorers.twoPhaseView.matches()) {
topScorers = topScorers.next;
if (topScorers == null) {
return false;
@ -138,9 +84,9 @@ abstract class DisjunctionScorer extends Scorer {
if (needsScores) {
// if scores or freqs are needed, we also need to remove scorers
// from the top list that do not actually match
ScorerWrapper previous = topScorers;
for (ScorerWrapper w = topScorers.next; w != null; w = w.next) {
if (w.twoPhaseView != null && w.twoPhaseView.matches() == false) {
DisiWrapper<Scorer> previous = topScorers;
for (DisiWrapper<Scorer> w = topScorers.next; w != null; w = w.next) {
if (w.twoPhaseView != null && ! w.twoPhaseView.matches()) {
// w does not match, remove it
previous.next = w.next;
} else {
@ -175,10 +121,10 @@ abstract class DisjunctionScorer extends Scorer {
@Override
public final int nextDoc() throws IOException {
topScorers = null;
ScorerWrapper top = subScorers.top();
DisiWrapper<Scorer> top = subScorers.top();
final int doc = top.doc;
do {
top.doc = top.scorer.nextDoc();
top.doc = top.iterator.nextDoc();
top = subScorers.updateTop();
} while (top.doc == doc);
@ -188,9 +134,9 @@ abstract class DisjunctionScorer extends Scorer {
@Override
public final int advance(int target) throws IOException {
topScorers = null;
ScorerWrapper top = subScorers.top();
DisiWrapper<Scorer> top = subScorers.top();
do {
top.doc = top.scorer.advance(target);
top.doc = top.iterator.advance(target);
top = subScorers.updateTop();
} while (top.doc < target);
@ -203,7 +149,7 @@ abstract class DisjunctionScorer extends Scorer {
topScorers = subScorers.topList();
}
int freq = 1;
for (ScorerWrapper w = topScorers.next; w != null; w = w.next) {
for (DisiWrapper<Scorer> w = topScorers.next; w != null; w = w.next) {
freq += 1;
}
return freq;
@ -218,13 +164,13 @@ abstract class DisjunctionScorer extends Scorer {
}
/** Compute the score for the given linked list of scorers. */
protected abstract float score(ScorerWrapper topList) throws IOException;
protected abstract float score(DisiWrapper<Scorer> topList) throws IOException;
@Override
public final Collection<ChildScorer> getChildren() {
ArrayList<ChildScorer> children = new ArrayList<>();
for (ScorerWrapper scorer : subScorers) {
children.add(new ChildScorer(scorer.scorer, "SHOULD"));
for (DisiWrapper<Scorer> scorer : subScorers) {
children.add(new ChildScorer(scorer.iterator, "SHOULD"));
}
return children;
}

View File

@ -20,8 +20,6 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper;
/** A Scorer for OR like queries, counterpart of <code>ConjunctionScorer</code>.
* This Scorer implements {@link Scorer#advance(int)} and uses advance() on the given Scorers.
*/
@ -39,11 +37,11 @@ final class DisjunctionSumScorer extends DisjunctionScorer {
}
@Override
protected float score(ScorerWrapper topList) throws IOException {
protected float score(DisiWrapper<Scorer> topList) throws IOException {
double score = 0;
int freq = 0;
for (ScorerWrapper w = topList; w != null; w = w.next) {
score += w.scorer.score();
for (DisiWrapper<Scorer> w = topList; w != null; w = w.next) {
score += w.iterator.score();
freq += 1;
}
return (float)score * coord[freq];

View File

@ -19,6 +19,8 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.search.spans.Spans;
/**
* This abstract class defines methods to iterate over a set of non-decreasing
* doc ids. Note that this class assumes it iterates on doc Ids, and therefore
@ -175,4 +177,5 @@ public abstract class DocIdSetIterator {
* completely inaccurate.
*/
public abstract long cost();
}

View File

@ -23,12 +23,11 @@ import java.util.Collection;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper;
import org.apache.lucene.util.PriorityQueue;
import static org.apache.lucene.search.ScorerPriorityQueue.leftNode;
import static org.apache.lucene.search.ScorerPriorityQueue.parentNode;
import static org.apache.lucene.search.ScorerPriorityQueue.rightNode;
import static org.apache.lucene.search.DisiPriorityQueue.leftNode;
import static org.apache.lucene.search.DisiPriorityQueue.parentNode;
import static org.apache.lucene.search.DisiPriorityQueue.rightNode;
/**
* A {@link Scorer} for {@link BooleanQuery} when
@ -83,17 +82,17 @@ final class MinShouldMatchSumScorer extends Scorer {
// list of scorers which 'lead' the iteration and are currently
// positioned on 'doc'
ScorerWrapper lead;
DisiWrapper<Scorer> lead;
int doc; // current doc ID of the leads
int freq; // number of scorers on the desired doc ID
// priority queue of scorers that are too advanced compared to the current
// doc. Ordered by doc ID.
final ScorerPriorityQueue head;
final DisiPriorityQueue<Scorer> head;
// priority queue of scorers which are behind the current doc.
// Ordered by cost.
final ScorerWrapper[] tail;
final DisiWrapper<Scorer>[] tail;
int tailSize;
final Collection<ChildScorer> childScorers;
@ -113,13 +112,13 @@ final class MinShouldMatchSumScorer extends Scorer {
this.coord = coord;
this.doc = -1;
head = new ScorerPriorityQueue(scorers.size() - minShouldMatch + 1);
head = new DisiPriorityQueue<Scorer>(scorers.size() - minShouldMatch + 1);
// there can be at most minShouldMatch - 1 scorers beyond the current position
// otherwise we might be skipping over matching documents
tail = new ScorerWrapper[minShouldMatch - 1];
tail = new DisiWrapper[minShouldMatch - 1];
for (Scorer scorer : scorers) {
addLead(new ScorerWrapper(scorer));
addLead(new DisiWrapper<Scorer>(scorer));
}
List<ChildScorer> children = new ArrayList<>();
@ -145,13 +144,13 @@ final class MinShouldMatchSumScorer extends Scorer {
// We are moving to the next doc ID, so scorers in 'lead' need to go in
// 'tail'. If there is not enough space in 'tail', then we take the least
// costly scorers and advance them.
for (ScorerWrapper s = lead; s != null; s = s.next) {
final ScorerWrapper evicted = insertTailWithOverFlow(s);
for (DisiWrapper<Scorer> s = lead; s != null; s = s.next) {
final DisiWrapper<Scorer> evicted = insertTailWithOverFlow(s);
if (evicted != null) {
if (evicted.doc == doc) {
evicted.doc = evicted.scorer.nextDoc();
evicted.doc = evicted.iterator.nextDoc();
} else {
evicted.doc = evicted.scorer.advance(doc + 1);
evicted.doc = evicted.iterator.advance(doc + 1);
}
head.add(evicted);
}
@ -164,23 +163,23 @@ final class MinShouldMatchSumScorer extends Scorer {
@Override
public int advance(int target) throws IOException {
// Same logic as in nextDoc
for (ScorerWrapper s = lead; s != null; s = s.next) {
final ScorerWrapper evicted = insertTailWithOverFlow(s);
for (DisiWrapper<Scorer> s = lead; s != null; s = s.next) {
final DisiWrapper<Scorer> evicted = insertTailWithOverFlow(s);
if (evicted != null) {
evicted.doc = evicted.scorer.advance(target);
evicted.doc = evicted.iterator.advance(target);
head.add(evicted);
}
}
// But this time there might also be scorers in 'head' behind the desired
// target so we need to do the same thing that we did on 'lead' on 'head'
ScorerWrapper headTop = head.top();
DisiWrapper<Scorer> headTop = head.top();
while (headTop.doc < target) {
final ScorerWrapper evicted = insertTailWithOverFlow(headTop);
final DisiWrapper<Scorer> evicted = insertTailWithOverFlow(headTop);
// We know that the tail is full since it contains at most
// minShouldMatch - 1 entries and we just moved at least minShouldMatch
// entries to it, so evicted is not null
evicted.doc = evicted.scorer.advance(target);
evicted.doc = evicted.iterator.advance(target);
headTop = head.updateTop(evicted);
}
@ -188,20 +187,20 @@ final class MinShouldMatchSumScorer extends Scorer {
return doNext();
}
private void addLead(ScorerWrapper lead) {
private void addLead(DisiWrapper<Scorer> lead) {
lead.next = this.lead;
this.lead = lead;
freq += 1;
}
private void pushBackLeads() throws IOException {
for (ScorerWrapper s = lead; s != null; s = s.next) {
for (DisiWrapper<Scorer> s = lead; s != null; s = s.next) {
addTail(s);
}
}
private void advanceTail(ScorerWrapper top) throws IOException {
top.doc = top.scorer.advance(doc);
private void advanceTail(DisiWrapper<Scorer> top) throws IOException {
top.doc = top.iterator.advance(doc);
if (top.doc == doc) {
addLead(top);
} else {
@ -210,7 +209,7 @@ final class MinShouldMatchSumScorer extends Scorer {
}
private void advanceTail() throws IOException {
final ScorerWrapper top = popTail();
final DisiWrapper<Scorer> top = popTail();
advanceTail(top);
}
@ -276,8 +275,8 @@ final class MinShouldMatchSumScorer extends Scorer {
// we need to know about all matches
updateFreq();
double score = 0;
for (ScorerWrapper s = lead; s != null; s = s.next) {
score += s.scorer.score();
for (DisiWrapper<Scorer> s = lead; s != null; s = s.next) {
score += s.iterator.score();
}
return coord[freq] * (float) score;
}
@ -289,12 +288,12 @@ final class MinShouldMatchSumScorer extends Scorer {
}
/** Insert an entry in 'tail' and evict the least-costly scorer if full. */
private ScorerWrapper insertTailWithOverFlow(ScorerWrapper s) {
private DisiWrapper<Scorer> insertTailWithOverFlow(DisiWrapper<Scorer> s) {
if (tailSize < tail.length) {
addTail(s);
return null;
} else if (tail.length >= 1) {
final ScorerWrapper top = tail[0];
final DisiWrapper<Scorer> top = tail[0];
if (top.cost < s.cost) {
tail[0] = s;
downHeapCost(tail, tailSize);
@ -305,16 +304,16 @@ final class MinShouldMatchSumScorer extends Scorer {
}
/** Add an entry to 'tail'. Fails if over capacity. */
private void addTail(ScorerWrapper s) {
private void addTail(DisiWrapper<Scorer> s) {
tail[tailSize] = s;
upHeapCost(tail, tailSize);
tailSize += 1;
}
/** Pop the least-costly scorer from 'tail'. */
private ScorerWrapper popTail() {
private DisiWrapper<Scorer> popTail() {
assert tailSize > 0;
final ScorerWrapper result = tail[0];
final DisiWrapper<Scorer> result = tail[0];
tail[0] = tail[--tailSize];
downHeapCost(tail, tailSize);
return result;
@ -322,8 +321,8 @@ final class MinShouldMatchSumScorer extends Scorer {
/** Heap helpers */
private static void upHeapCost(ScorerWrapper[] heap, int i) {
final ScorerWrapper node = heap[i];
private static void upHeapCost(DisiWrapper<Scorer>[] heap, int i) {
final DisiWrapper<Scorer> node = heap[i];
final long nodeCost = node.cost;
int j = parentNode(i);
while (j >= 0 && nodeCost < heap[j].cost) {
@ -334,9 +333,9 @@ final class MinShouldMatchSumScorer extends Scorer {
heap[i] = node;
}
private static void downHeapCost(ScorerWrapper[] heap, int size) {
private static void downHeapCost(DisiWrapper<Scorer>[] heap, int size) {
int i = 0;
final ScorerWrapper node = heap[0];
final DisiWrapper<Scorer> node = heap[0];
int j = leftNode(i);
if (j < size) {
int k = rightNode(j);

View File

@ -20,9 +20,13 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.search.spans.Spans;
/**
* Returned by {@link Scorer#asTwoPhaseIterator()} to expose an approximation of
* a {@link DocIdSetIterator}. When the {@link #approximation()}'s
* Returned by {@link Scorer#asTwoPhaseIterator()}
* and {@link Spans#asTwoPhaseIterator()}
* to expose an approximation of a {@link DocIdSetIterator}.
* When the {@link #approximation()}'s
* {@link DocIdSetIterator#nextDoc()} or {@link DocIdSetIterator#advance(int)}
* return, {@link #matches()} needs to be checked in order to know whether the
* returned doc ID actually matches.
@ -89,4 +93,16 @@ public abstract class TwoPhaseIterator {
* {@link DocIdSetIterator#NO_MORE_DOCS} -- and at most once. */
public abstract boolean matches() throws IOException;
/**
* Returns a {@link TwoPhaseIterator} for this {@link DocIdSetIterator}
* when available * otherwise returns null.
*/
public static TwoPhaseIterator asTwoPhaseIterator(DocIdSetIterator iter) {
return (iter instanceof Scorer)
? ((Scorer) iter).asTwoPhaseIterator()
: (iter instanceof Spans)
? ((Spans) iter).asTwoPhaseIterator()
: null;
}
}

View File

@ -31,9 +31,13 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.DisiPriorityQueue;
import org.apache.lucene.search.DisiWrapper;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.DisjunctionDISIApproximation;
/** Matches the union of its clauses.
*/
@ -42,7 +46,7 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
private String field;
/** Construct a SpanOrQuery merging the provided clauses.
* All clauses must have the same field.
* All clauses must have the same field.
*/
public SpanOrQuery(SpanQuery... clauses) {
this.clauses = new ArrayList<>(clauses.length);
@ -146,35 +150,16 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
}
private class SpanQueue extends PriorityQueue<Spans> {
public SpanQueue(int size) {
super(size);
}
@Override
protected final boolean lessThan(Spans spans1, Spans spans2) {
if (spans1.docID() == spans2.docID()) {
if (spans1.startPosition() == spans2.startPosition()) {
return spans1.endPosition() < spans2.endPosition();
} else {
return spans1.startPosition() < spans2.startPosition();
}
} else {
return spans1.docID() < spans2.docID();
}
}
}
@Override
public Spans getSpans(final LeafReaderContext context, final Bits acceptDocs, final Map<Term,TermContext> termContexts)
throws IOException {
ArrayList<Spans> subSpans = new ArrayList<>(clauses.size());
for (SpanQuery seq : clauses) {
Spans subSpan = seq.getSpans(context, acceptDocs, termContexts);
if (subSpan != null) {
subSpans.add(subSpan);
for (SpanQuery sq : clauses) {
Spans spans = sq.getSpans(context, acceptDocs, termContexts);
if (spans != null) {
subSpans.add(spans);
}
}
@ -184,114 +169,168 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
return subSpans.get(0);
}
SpanQueue queue = new SpanQueue(clauses.size());
DisiPriorityQueue<Spans> byDocQueue = new DisiPriorityQueue<>(subSpans.size());
for (Spans spans : subSpans) {
queue.add(spans);
byDocQueue.add(new DisiWrapper<>(spans));
}
SpanPositionQueue byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1
return new Spans() {
Spans topPositionSpans = null;
@Override
public int nextDoc() throws IOException {
if (queue.size() == 0) { // all done
return NO_MORE_DOCS;
}
int currentDoc = top().docID();
if (currentDoc == -1) { // initially
return advance(0);
}
topPositionSpans = null;
DisiWrapper<Spans> topDocSpans = byDocQueue.top();
int currentDoc = topDocSpans.doc;
do {
if (top().nextDoc() != NO_MORE_DOCS) { // move top to next doc
queue.updateTop();
} else {
queue.pop(); // exhausted a clause
if (queue.size() == 0) {
return NO_MORE_DOCS;
}
}
// assert queue.size() > 0;
int doc = top().docID();
if (doc > currentDoc) {
return doc;
}
} while (true);
}
private Spans top() {
return queue.top();
topDocSpans.doc = topDocSpans.iterator.nextDoc();
topDocSpans = byDocQueue.updateTop();
} while (topDocSpans.doc == currentDoc);
return topDocSpans.doc;
}
@Override
public int advance(int target) throws IOException {
while ((queue.size() > 0) && (top().docID() < target)) {
if (top().advance(target) != NO_MORE_DOCS) {
queue.updateTop();
} else {
queue.pop();
}
}
return (queue.size() > 0) ? top().docID() : NO_MORE_DOCS;
topPositionSpans = null;
DisiWrapper<Spans> topDocSpans = byDocQueue.top();
do {
topDocSpans.doc = topDocSpans.iterator.advance(target);
topDocSpans = byDocQueue.updateTop();
} while (topDocSpans.doc < target);
return topDocSpans.doc;
}
@Override
public int docID() {
return (queue == null) ? -1
: (queue.size() > 0) ? top().docID()
: NO_MORE_DOCS;
DisiWrapper<Spans> topDocSpans = byDocQueue.top();
return topDocSpans.doc;
}
@Override
public int nextStartPosition() throws IOException {
top().nextStartPosition();
queue.updateTop();
int startPos = top().startPosition();
while (startPos == -1) { // initially at this doc
top().nextStartPosition();
queue.updateTop();
startPos = top().startPosition();
public TwoPhaseIterator asTwoPhaseIterator() {
boolean hasApproximation = false;
for (DisiWrapper<Spans> w : byDocQueue) {
if (w.twoPhaseView != null) {
hasApproximation = true;
break;
}
}
return startPos;
if (! hasApproximation) { // none of the sub spans supports approximations
return null;
}
return new TwoPhaseIterator(new DisjunctionDISIApproximation<Spans>(byDocQueue)) {
@Override
public boolean matches() throws IOException {
return twoPhaseCurrentDocMatches();
}
};
}
int lastDocTwoPhaseMatched = -1;
boolean twoPhaseCurrentDocMatches() throws IOException {
DisiWrapper<Spans> listAtCurrentDoc = byDocQueue.topList();
// remove the head of the list as long as it does not match
final int currentDoc = listAtCurrentDoc.doc;
while (listAtCurrentDoc.twoPhaseView != null) {
if (listAtCurrentDoc.twoPhaseView.matches()) {
// use this spans for positions at current doc:
listAtCurrentDoc.lastApproxMatchDoc = currentDoc;
break;
}
// do not use this spans for positions at current doc:
listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc;
listAtCurrentDoc = listAtCurrentDoc.next;
if (listAtCurrentDoc == null) {
return false;
}
}
lastDocTwoPhaseMatched = currentDoc;
topPositionSpans = null;
return true;
}
void fillPositionQueue() throws IOException { // called at first nextStartPosition
assert byPositionQueue.size() == 0;
// add all matching Spans at current doc to byPositionQueue
DisiWrapper<Spans> listAtCurrentDoc = byDocQueue.topList();
while (listAtCurrentDoc != null) {
Spans spansAtDoc = listAtCurrentDoc.iterator;
if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation
if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation
if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false
spansAtDoc = null;
} else {
if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) {
if (! listAtCurrentDoc.twoPhaseView.matches()) {
spansAtDoc = null;
}
}
}
}
}
if (spansAtDoc != null) {
assert spansAtDoc.docID() == listAtCurrentDoc.doc;
assert spansAtDoc.startPosition() == -1;
spansAtDoc.nextStartPosition();
assert spansAtDoc.startPosition() != NO_MORE_POSITIONS;
byPositionQueue.add(spansAtDoc);
}
listAtCurrentDoc = listAtCurrentDoc.next;
}
assert byPositionQueue.size() > 0;
}
@Override
public int nextStartPosition() throws IOException {
DisiWrapper<Spans> topDocSpans = byDocQueue.top();
assert topDocSpans.doc != NO_MORE_DOCS;
if (topPositionSpans == null) {
byPositionQueue.clear();
fillPositionQueue(); // fills byPositionQueue at first position
topPositionSpans = byPositionQueue.top();
} else {
topPositionSpans.nextStartPosition();
topPositionSpans = byPositionQueue.updateTop();
}
return topPositionSpans.startPosition();
}
@Override
public int startPosition() {
return top().startPosition();
return topPositionSpans == null ? -1 : topPositionSpans.startPosition();
}
@Override
public int endPosition() {
return top().endPosition();
return topPositionSpans == null ? -1 : topPositionSpans.endPosition();
}
@Override
public Collection<byte[]> getPayload() throws IOException {
ArrayList<byte[]> result = null;
Spans theTop = top();
if (theTop != null && theTop.isPayloadAvailable()) {
result = new ArrayList<>(theTop.getPayload());
}
return result;
return topPositionSpans == null
? null
: topPositionSpans.isPayloadAvailable()
? new ArrayList<>(topPositionSpans.getPayload())
: null;
}
@Override
public boolean isPayloadAvailable() throws IOException {
Spans top = top();
return top != null && top.isPayloadAvailable();
return (topPositionSpans != null) && topPositionSpans.isPayloadAvailable();
}
@Override
public String toString() {
return "spans("+SpanOrQuery.this+")@"+
((queue == null)?"START"
:(queue.size()>0?(docID()+": "+top().startPosition()+" - "+top().endPosition()):"END"));
return "spanOr("+SpanOrQuery.this+")@"+docID()+": "+startPosition()+" - "+endPosition();
}
private long cost = -1;
long cost = -1;
@Override
public long cost() {
@ -303,8 +342,8 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
}
return cost;
}
};
}
}

View File

@ -0,0 +1,35 @@
package org.apache.lucene.search.spans;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.PriorityQueue;
class SpanPositionQueue extends PriorityQueue<Spans> {
SpanPositionQueue(int maxSize) {
super(maxSize, false); // do not prepopulate
}
protected boolean lessThan(Spans s1, Spans s2) {
int start1 = s1.startPosition();
int start2 = s2.startPosition();
return (start1 < start2) ? true
: (start1 == start2) ? s1.endPosition() < s2.endPosition()
: false;
}
}

View File

@ -86,11 +86,12 @@ public abstract class Spans extends DocIdSetIterator {
*
* Note that the returned {@link TwoPhaseIterator}'s
* {@link TwoPhaseIterator#approximation() approximation} must
* advance synchronously with this iterator: advancing the approximation must
* advance documents synchronously with this iterator:
* advancing the approximation must
* advance this iterator and vice-versa.
*
* Implementing this method is typically useful on {@link Spans}s
* that have a high per-document overhead in order to confirm matches.
* Implementing this method is typically useful on a {@link Spans}
* that has a high per-document overhead for confirming matches.
*
* The default implementation returns {@code null}.
*/

View File

@ -89,7 +89,7 @@ public abstract class PriorityQueue<T> {
* value (i.e., {@link #lessThan} should always favor the
* non-sentinel values).<br>
*
* By default, this method returns false, which means the queue will not be
* By default, this method returns null, which means the queue will not be
* filled with sentinel values. Otherwise, the value returned will be used to
* pre-populate the queue. Adds sentinel values to the queue.<br>
*