From 5105d036bd3d7818df85342afea12dbdc1ca5b50 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 24 Apr 2015 05:12:20 +0000 Subject: [PATCH] LUCENE-6373: complete two phase doc id iteration support for Spans git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675776 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../apache/lucene/search/ConjunctionDISI.java | 20 +- ...orityQueue.java => DisiPriorityQueue.java} | 89 +++---- .../org/apache/lucene/search/DisiWrapper.java | 55 +++++ .../search/DisjunctionDISIApproximation.java | 75 ++++++ .../lucene/search/DisjunctionMaxScorer.java | 8 +- .../lucene/search/DisjunctionScorer.java | 94 ++------ .../lucene/search/DisjunctionSumScorer.java | 8 +- .../lucene/search/DocIdSetIterator.java | 3 + .../search/MinShouldMatchSumScorer.java | 71 +++--- .../lucene/search/TwoPhaseIterator.java | 20 +- .../lucene/search/spans/SpanOrQuery.java | 223 ++++++++++-------- .../search/spans/SpanPositionQueue.java | 35 +++ .../org/apache/lucene/search/spans/Spans.java | 7 +- .../org/apache/lucene/util/PriorityQueue.java | 2 +- 15 files changed, 429 insertions(+), 285 deletions(-) rename lucene/core/src/java/org/apache/lucene/search/{ScorerPriorityQueue.java => DisiPriorityQueue.java} (60%) create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7c6c143228b..87ca4349952 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -50,6 +50,10 @@ New Features FilterSpans to just have an accept(Spans candidate) method for subclasses. (Robert Muir) +* LUCENE-6373: SpanOrQuery shares disjunction logic with boolean + queries, and supports two-phased iterators to avoid loading + positions when possible. 
(Paul Elschot via Robert Muir) + * LUCENE-6352: Added a new query time join to the join module that uses global ordinals, which is faster for subsequent joins between reopens. (Martijn van Groningen, Adrien Grand) diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java index 53342b57870..2a03960fb72 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java @@ -23,7 +23,6 @@ import java.util.Comparator; import java.util.List; import org.apache.lucene.util.CollectionUtil; -import org.apache.lucene.search.spans.Spans; /** A conjunction of DocIdSetIterators. * This iterates over the doc ids that are present in each given DocIdSetIterator. @@ -35,20 +34,16 @@ public class ConjunctionDISI extends DocIdSetIterator { /** Create a conjunction over the provided iterators, taking advantage of * {@link TwoPhaseIterator}. 
*/ public static ConjunctionDISI intersect(List iterators) { + assert iterators.size() >= 2; final List allIterators = new ArrayList<>(); final List twoPhaseIterators = new ArrayList<>(); - for (DocIdSetIterator iterator : iterators) { - TwoPhaseIterator twoPhaseIterator = null; - if (iterator instanceof Scorer) { - twoPhaseIterator = ((Scorer) iterator).asTwoPhaseIterator(); - } else if (iterator instanceof Spans) { - twoPhaseIterator = ((Spans) iterator).asTwoPhaseIterator(); - } - if (twoPhaseIterator != null) { - allIterators.add(twoPhaseIterator.approximation()); - twoPhaseIterators.add(twoPhaseIterator); + for (DocIdSetIterator iter : iterators) { + TwoPhaseIterator twoPhaseIter = TwoPhaseIterator.asTwoPhaseIterator(iter); + if (twoPhaseIter != null) { + allIterators.add(twoPhaseIter.approximation()); + twoPhaseIterators.add(twoPhaseIter); } else { // no approximation support, use the iterator as-is - allIterators.add(iterator); + allIterators.add(iter); } } @@ -63,6 +58,7 @@ public class ConjunctionDISI extends DocIdSetIterator { final DocIdSetIterator[] others; ConjunctionDISI(List iterators) { + assert iterators.size() >= 2; // Sort the array the first time to allow the least frequent DocsEnum to // lead the matching. CollectionUtil.timSort(iterators, new Comparator() { diff --git a/lucene/core/src/java/org/apache/lucene/search/ScorerPriorityQueue.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java similarity index 60% rename from lucene/core/src/java/org/apache/lucene/search/ScorerPriorityQueue.java rename to lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java index ea0f1dec31b..55f61b5802e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScorerPriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java @@ -23,37 +23,13 @@ import java.util.Iterator; import org.apache.lucene.util.PriorityQueue; /** - * A priority queue of scorers that orders by current doc ID. 
+ * A priority queue of DocIdSetIterators that orders by current doc ID. * This specialization is needed over {@link PriorityQueue} because the * pluggable comparison function makes the rebalancing quite slow. + * @lucene.internal */ -final class ScorerPriorityQueue implements Iterable { - - static class ScorerWrapper { - final Scorer scorer; - final long cost; - int doc; // the current doc, used for comparison - ScorerWrapper next; // reference to a next element, see #topList - - // An approximation of the scorer, or the scorer itself if it does not - // support two-phase iteration - final DocIdSetIterator approximation; - // A two-phase view of the scorer, or null if the scorer does not support - // two-phase iteration - final TwoPhaseIterator twoPhaseView; - - ScorerWrapper(Scorer scorer) { - this.scorer = scorer; - this.cost = scorer.cost(); - this.doc = -1; - this.twoPhaseView = scorer.asTwoPhaseIterator(); - if (twoPhaseView != null) { - approximation = twoPhaseView.approximation(); - } else { - approximation = scorer; - } - } - } +public final class DisiPriorityQueue +implements Iterable> { static int leftNode(int node) { return ((node + 1) << 1) - 1; @@ -67,27 +43,27 @@ final class ScorerPriorityQueue implements Iterable>> 1) - 1; } - private final ScorerWrapper[] heap; + private final DisiWrapper[] heap; private int size; - ScorerPriorityQueue(int maxSize) { - heap = new ScorerWrapper[maxSize]; + public DisiPriorityQueue(int maxSize) { + heap = new DisiWrapper[maxSize]; size = 0; } - int size() { + public int size() { return size; } - ScorerWrapper top() { + public DisiWrapper top() { return heap[0]; } /** Get the list of scorers which are on the current doc. 
*/ - ScorerWrapper topList() { - final ScorerWrapper[] heap = this.heap; + public DisiWrapper topList() { + final DisiWrapper[] heap = this.heap; final int size = this.size; - ScorerWrapper list = heap[0]; + DisiWrapper list = heap[0]; list.next = null; if (size >= 3) { list = topList(list, heap, size, 1); @@ -98,14 +74,15 @@ final class ScorerPriorityQueue implements Iterable prepend(DisiWrapper w1, DisiWrapper w2) { w1.next = w2; return w1; } - private static ScorerWrapper topList(ScorerWrapper list, ScorerWrapper[] heap, int size, int i) { - final ScorerWrapper w = heap[i]; + private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, + int size, int i) { + final DisiWrapper w = heap[i]; if (w.doc == list.doc) { list = prepend(w, list); final int left = leftNode(i); @@ -120,37 +97,37 @@ final class ScorerPriorityQueue implements Iterable add(DisiWrapper entry) { + final DisiWrapper[] heap = this.heap; final int size = this.size; heap[size] = entry; - upHeap(heap, size); + upHeap(size); this.size = size + 1; return heap[0]; } - ScorerWrapper pop() { - final ScorerWrapper[] heap = this.heap; - final ScorerWrapper result = heap[0]; + public DisiWrapper pop() { + final DisiWrapper[] heap = this.heap; + final DisiWrapper result = heap[0]; final int i = --size; heap[0] = heap[i]; heap[i] = null; - downHeap(heap, i); + downHeap(i); return result; } - ScorerWrapper updateTop() { - downHeap(heap, size); + public DisiWrapper updateTop() { + downHeap(size); return heap[0]; } - ScorerWrapper updateTop(ScorerWrapper topReplacement) { + DisiWrapper updateTop(DisiWrapper topReplacement) { heap[0] = topReplacement; return updateTop(); } - static void upHeap(ScorerWrapper[] heap, int i) { - final ScorerWrapper node = heap[i]; + void upHeap(int i) { + final DisiWrapper node = heap[i]; final int nodeDoc = node.doc; int j = parentNode(i); while (j >= 0 && nodeDoc < heap[j].doc) { @@ -161,9 +138,9 @@ final class ScorerPriorityQueue implements Iterable node = heap[0]; int j = 
leftNode(i); if (j < size) { int k = rightNode(j); @@ -186,8 +163,10 @@ final class ScorerPriorityQueue implements Iterable iterator() { + public Iterator> iterator() { return Arrays.asList(heap).subList(0, size).iterator(); } } + + diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java new file mode 100644 index 00000000000..d3f52948b8e --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java @@ -0,0 +1,55 @@ +package org.apache.lucene.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Wrapper used in {@link DisiPriorityQueue}. 
+ * @lucene.internal + */ +public class DisiWrapper { + public final Iter iterator; + public final long cost; + public int doc; // the current doc, used for comparison + public DisiWrapper next; // reference to a next element, see #topList + + // An approximation of the iterator, or the iterator itself if it does not + // support two-phase iteration + public final DocIdSetIterator approximation; + // A two-phase view of the iterator, or null if the iterator does not support + // two-phase iteration + public final TwoPhaseIterator twoPhaseView; + + public int lastApproxMatchDoc; // last doc of approximation that did match + public int lastApproxNonMatchDoc; // last doc of approximation that did not match + + public DisiWrapper(Iter iterator) { + this.iterator = iterator; + this.cost = iterator.cost(); + this.doc = -1; + this.twoPhaseView = TwoPhaseIterator.asTwoPhaseIterator(iterator); + + if (twoPhaseView != null) { + approximation = twoPhaseView.approximation(); + } else { + approximation = iterator; + } + this.lastApproxNonMatchDoc = -2; + this.lastApproxMatchDoc = -2; + } +} + diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java new file mode 100644 index 00000000000..1672d79e938 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -0,0 +1,75 @@ +package org.apache.lucene.search; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** + * A {@link DocIdSetIterator} which is a disjunction of the approximations of + * the provided iterators. + * @lucene.internal + */ +public class DisjunctionDISIApproximation +extends DocIdSetIterator { + + final DisiPriorityQueue subIterators; + final long cost; + + public DisjunctionDISIApproximation(DisiPriorityQueue subIterators) { + this.subIterators = subIterators; + long cost = 0; + for (DisiWrapper w : subIterators) { + cost += w.cost; + } + this.cost = cost; + } + + @Override + public long cost() { + return cost; + } + + @Override + public int docID() { + return subIterators.top().doc; + } + + @Override + public int nextDoc() throws IOException { + DisiWrapper top = subIterators.top(); + final int doc = top.doc; + do { + top.doc = top.approximation.nextDoc(); + top = subIterators.updateTop(); + } while (top.doc == doc); + + return top.doc; + } + + @Override + public int advance(int target) throws IOException { + DisiWrapper top = subIterators.top(); + do { + top.doc = top.approximation.advance(target); + top = subIterators.updateTop(); + } while (top.doc < target); + + return top.doc; + } +} + + diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java index e3e7ed74f03..8f3048e0fdc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java @@ -19,8 +19,6 @@ package org.apache.lucene.search; import 
java.io.IOException; import java.util.List; -import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper; - /** * The Scorer for DisjunctionMaxQuery. The union of all documents generated by the the subquery scorers * is generated in document number order. The score for each document is the maximum of the scores computed @@ -48,11 +46,11 @@ final class DisjunctionMaxScorer extends DisjunctionScorer { } @Override - protected float score(ScorerWrapper topList) throws IOException { + protected float score(DisiWrapper topList) throws IOException { float scoreSum = 0; float scoreMax = 0; - for (ScorerWrapper w = topList; w != null; w = w.next) { - final float subScore = w.scorer.score(); + for (DisiWrapper w = topList; w != null; w = w.next) { + final float subScore = w.iterator.score(); scoreSum += subScore; if (subScore > scoreMax) { scoreMax = subScore; diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java index 2d2132633c4..7dbb733718a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java @@ -22,29 +22,27 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; -import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper; - /** * Base class for Scorers that score disjunctions. 
*/ abstract class DisjunctionScorer extends Scorer { private final boolean needsScores; - private final ScorerPriorityQueue subScorers; + private final DisiPriorityQueue subScorers; private final long cost; /** Linked list of scorers which are on the current doc */ - private ScorerWrapper topScorers; + private DisiWrapper topScorers; protected DisjunctionScorer(Weight weight, List subScorers, boolean needsScores) { super(weight); if (subScorers.size() <= 1) { throw new IllegalArgumentException("There must be at least 2 subScorers"); } - this.subScorers = new ScorerPriorityQueue(subScorers.size()); + this.subScorers = new DisiPriorityQueue(subScorers.size()); long cost = 0; for (Scorer scorer : subScorers) { - final ScorerWrapper w = new ScorerWrapper(scorer); + final DisiWrapper w = new DisiWrapper<>(scorer); cost += w.cost; this.subScorers.add(w); } @@ -52,69 +50,17 @@ abstract class DisjunctionScorer extends Scorer { this.needsScores = needsScores; } - /** - * A {@link DocIdSetIterator} which is a disjunction of the approximations of - * the provided iterators. 
- */ - private static class DisjunctionDISIApproximation extends DocIdSetIterator { - - final ScorerPriorityQueue subScorers; - final long cost; - - DisjunctionDISIApproximation(ScorerPriorityQueue subScorers) { - this.subScorers = subScorers; - long cost = 0; - for (ScorerWrapper w : subScorers) { - cost += w.cost; - } - this.cost = cost; - } - - @Override - public long cost() { - return cost; - } - - @Override - public int docID() { - return subScorers.top().doc; - } - - @Override - public int nextDoc() throws IOException { - ScorerWrapper top = subScorers.top(); - final int doc = top.doc; - do { - top.doc = top.approximation.nextDoc(); - top = subScorers.updateTop(); - } while (top.doc == doc); - - return top.doc; - } - - @Override - public int advance(int target) throws IOException { - ScorerWrapper top = subScorers.top(); - do { - top.doc = top.approximation.advance(target); - top = subScorers.updateTop(); - } while (top.doc < target); - - return top.doc; - } - } - @Override public TwoPhaseIterator asTwoPhaseIterator() { boolean hasApproximation = false; - for (ScorerWrapper w : subScorers) { + for (DisiWrapper w : subScorers) { if (w.twoPhaseView != null) { hasApproximation = true; break; } } - if (hasApproximation == false) { + if (! hasApproximation) { // none of the sub scorers supports approximations return null; } @@ -122,13 +68,13 @@ abstract class DisjunctionScorer extends Scorer { // note it is important to share the same pq as this scorer so that // rebalancing the pq through the approximation will also rebalance // the pq in this scorer. 
- return new TwoPhaseIterator(new DisjunctionDISIApproximation(subScorers)) { + return new TwoPhaseIterator(new DisjunctionDISIApproximation(subScorers)) { @Override public boolean matches() throws IOException { - ScorerWrapper topScorers = subScorers.topList(); + DisiWrapper topScorers = subScorers.topList(); // remove the head of the list as long as it does not match - while (topScorers.twoPhaseView != null && topScorers.twoPhaseView.matches() == false) { + while (topScorers.twoPhaseView != null && ! topScorers.twoPhaseView.matches()) { topScorers = topScorers.next; if (topScorers == null) { return false; @@ -138,9 +84,9 @@ abstract class DisjunctionScorer extends Scorer { if (needsScores) { // if scores or freqs are needed, we also need to remove scorers // from the top list that do not actually match - ScorerWrapper previous = topScorers; - for (ScorerWrapper w = topScorers.next; w != null; w = w.next) { - if (w.twoPhaseView != null && w.twoPhaseView.matches() == false) { + DisiWrapper previous = topScorers; + for (DisiWrapper w = topScorers.next; w != null; w = w.next) { + if (w.twoPhaseView != null && ! 
w.twoPhaseView.matches()) { // w does not match, remove it previous.next = w.next; } else { @@ -175,10 +121,10 @@ abstract class DisjunctionScorer extends Scorer { @Override public final int nextDoc() throws IOException { topScorers = null; - ScorerWrapper top = subScorers.top(); + DisiWrapper top = subScorers.top(); final int doc = top.doc; do { - top.doc = top.scorer.nextDoc(); + top.doc = top.iterator.nextDoc(); top = subScorers.updateTop(); } while (top.doc == doc); @@ -188,9 +134,9 @@ abstract class DisjunctionScorer extends Scorer { @Override public final int advance(int target) throws IOException { topScorers = null; - ScorerWrapper top = subScorers.top(); + DisiWrapper top = subScorers.top(); do { - top.doc = top.scorer.advance(target); + top.doc = top.iterator.advance(target); top = subScorers.updateTop(); } while (top.doc < target); @@ -203,7 +149,7 @@ abstract class DisjunctionScorer extends Scorer { topScorers = subScorers.topList(); } int freq = 1; - for (ScorerWrapper w = topScorers.next; w != null; w = w.next) { + for (DisiWrapper w = topScorers.next; w != null; w = w.next) { freq += 1; } return freq; @@ -218,13 +164,13 @@ abstract class DisjunctionScorer extends Scorer { } /** Compute the score for the given linked list of scorers. 
*/ - protected abstract float score(ScorerWrapper topList) throws IOException; + protected abstract float score(DisiWrapper topList) throws IOException; @Override public final Collection getChildren() { ArrayList children = new ArrayList<>(); - for (ScorerWrapper scorer : subScorers) { - children.add(new ChildScorer(scorer.scorer, "SHOULD")); + for (DisiWrapper scorer : subScorers) { + children.add(new ChildScorer(scorer.iterator, "SHOULD")); } return children; } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java index d6b25b15568..6cf167b1876 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java @@ -20,8 +20,6 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.List; -import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper; - /** A Scorer for OR like queries, counterpart of ConjunctionScorer. * This Scorer implements {@link Scorer#advance(int)} and uses advance() on the given Scorers. 
*/ @@ -39,11 +37,11 @@ final class DisjunctionSumScorer extends DisjunctionScorer { } @Override - protected float score(ScorerWrapper topList) throws IOException { + protected float score(DisiWrapper topList) throws IOException { double score = 0; int freq = 0; - for (ScorerWrapper w = topList; w != null; w = w.next) { - score += w.scorer.score(); + for (DisiWrapper w = topList; w != null; w = w.next) { + score += w.iterator.score(); freq += 1; } return (float)score * coord[freq]; diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java index fe26e52382a..bb5d49870da 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -19,6 +19,8 @@ package org.apache.lucene.search; import java.io.IOException; +import org.apache.lucene.search.spans.Spans; + /** * This abstract class defines methods to iterate over a set of non-decreasing * doc ids. Note that this class assumes it iterates on doc Ids, and therefore @@ -175,4 +177,5 @@ public abstract class DocIdSetIterator { * completely inaccurate. 
*/ public abstract long cost(); + } diff --git a/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java b/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java index d989ff41c7a..36714ea081b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MinShouldMatchSumScorer.java @@ -23,12 +23,11 @@ import java.util.Collection; import java.util.Collections; import java.util.List; -import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper; import org.apache.lucene.util.PriorityQueue; -import static org.apache.lucene.search.ScorerPriorityQueue.leftNode; -import static org.apache.lucene.search.ScorerPriorityQueue.parentNode; -import static org.apache.lucene.search.ScorerPriorityQueue.rightNode; +import static org.apache.lucene.search.DisiPriorityQueue.leftNode; +import static org.apache.lucene.search.DisiPriorityQueue.parentNode; +import static org.apache.lucene.search.DisiPriorityQueue.rightNode; /** * A {@link Scorer} for {@link BooleanQuery} when @@ -83,17 +82,17 @@ final class MinShouldMatchSumScorer extends Scorer { // list of scorers which 'lead' the iteration and are currently // positioned on 'doc' - ScorerWrapper lead; + DisiWrapper lead; int doc; // current doc ID of the leads int freq; // number of scorers on the desired doc ID // priority queue of scorers that are too advanced compared to the current // doc. Ordered by doc ID. - final ScorerPriorityQueue head; + final DisiPriorityQueue head; // priority queue of scorers which are behind the current doc. // Ordered by cost. 
- final ScorerWrapper[] tail; + final DisiWrapper[] tail; int tailSize; final Collection childScorers; @@ -113,13 +112,13 @@ final class MinShouldMatchSumScorer extends Scorer { this.coord = coord; this.doc = -1; - head = new ScorerPriorityQueue(scorers.size() - minShouldMatch + 1); + head = new DisiPriorityQueue(scorers.size() - minShouldMatch + 1); // there can be at most minShouldMatch - 1 scorers beyond the current position // otherwise we might be skipping over matching documents - tail = new ScorerWrapper[minShouldMatch - 1]; + tail = new DisiWrapper[minShouldMatch - 1]; for (Scorer scorer : scorers) { - addLead(new ScorerWrapper(scorer)); + addLead(new DisiWrapper(scorer)); } List children = new ArrayList<>(); @@ -145,13 +144,13 @@ final class MinShouldMatchSumScorer extends Scorer { // We are moving to the next doc ID, so scorers in 'lead' need to go in // 'tail'. If there is not enough space in 'tail', then we take the least // costly scorers and advance them. - for (ScorerWrapper s = lead; s != null; s = s.next) { - final ScorerWrapper evicted = insertTailWithOverFlow(s); + for (DisiWrapper s = lead; s != null; s = s.next) { + final DisiWrapper evicted = insertTailWithOverFlow(s); if (evicted != null) { if (evicted.doc == doc) { - evicted.doc = evicted.scorer.nextDoc(); + evicted.doc = evicted.iterator.nextDoc(); } else { - evicted.doc = evicted.scorer.advance(doc + 1); + evicted.doc = evicted.iterator.advance(doc + 1); } head.add(evicted); } @@ -164,23 +163,23 @@ final class MinShouldMatchSumScorer extends Scorer { @Override public int advance(int target) throws IOException { // Same logic as in nextDoc - for (ScorerWrapper s = lead; s != null; s = s.next) { - final ScorerWrapper evicted = insertTailWithOverFlow(s); + for (DisiWrapper s = lead; s != null; s = s.next) { + final DisiWrapper evicted = insertTailWithOverFlow(s); if (evicted != null) { - evicted.doc = evicted.scorer.advance(target); + evicted.doc = evicted.iterator.advance(target); 
head.add(evicted); } } // But this time there might also be scorers in 'head' behind the desired // target so we need to do the same thing that we did on 'lead' on 'head' - ScorerWrapper headTop = head.top(); + DisiWrapper headTop = head.top(); while (headTop.doc < target) { - final ScorerWrapper evicted = insertTailWithOverFlow(headTop); + final DisiWrapper evicted = insertTailWithOverFlow(headTop); // We know that the tail is full since it contains at most // minShouldMatch - 1 entries and we just moved at least minShouldMatch // entries to it, so evicted is not null - evicted.doc = evicted.scorer.advance(target); + evicted.doc = evicted.iterator.advance(target); headTop = head.updateTop(evicted); } @@ -188,20 +187,20 @@ final class MinShouldMatchSumScorer extends Scorer { return doNext(); } - private void addLead(ScorerWrapper lead) { + private void addLead(DisiWrapper lead) { lead.next = this.lead; this.lead = lead; freq += 1; } private void pushBackLeads() throws IOException { - for (ScorerWrapper s = lead; s != null; s = s.next) { + for (DisiWrapper s = lead; s != null; s = s.next) { addTail(s); } } - private void advanceTail(ScorerWrapper top) throws IOException { - top.doc = top.scorer.advance(doc); + private void advanceTail(DisiWrapper top) throws IOException { + top.doc = top.iterator.advance(doc); if (top.doc == doc) { addLead(top); } else { @@ -210,7 +209,7 @@ final class MinShouldMatchSumScorer extends Scorer { } private void advanceTail() throws IOException { - final ScorerWrapper top = popTail(); + final DisiWrapper top = popTail(); advanceTail(top); } @@ -276,8 +275,8 @@ final class MinShouldMatchSumScorer extends Scorer { // we need to know about all matches updateFreq(); double score = 0; - for (ScorerWrapper s = lead; s != null; s = s.next) { - score += s.scorer.score(); + for (DisiWrapper s = lead; s != null; s = s.next) { + score += s.iterator.score(); } return coord[freq] * (float) score; } @@ -289,12 +288,12 @@ final class 
MinShouldMatchSumScorer extends Scorer { } /** Insert an entry in 'tail' and evict the least-costly scorer if full. */ - private ScorerWrapper insertTailWithOverFlow(ScorerWrapper s) { + private DisiWrapper insertTailWithOverFlow(DisiWrapper s) { if (tailSize < tail.length) { addTail(s); return null; } else if (tail.length >= 1) { - final ScorerWrapper top = tail[0]; + final DisiWrapper top = tail[0]; if (top.cost < s.cost) { tail[0] = s; downHeapCost(tail, tailSize); @@ -305,16 +304,16 @@ final class MinShouldMatchSumScorer extends Scorer { } /** Add an entry to 'tail'. Fails if over capacity. */ - private void addTail(ScorerWrapper s) { + private void addTail(DisiWrapper s) { tail[tailSize] = s; upHeapCost(tail, tailSize); tailSize += 1; } /** Pop the least-costly scorer from 'tail'. */ - private ScorerWrapper popTail() { + private DisiWrapper popTail() { assert tailSize > 0; - final ScorerWrapper result = tail[0]; + final DisiWrapper result = tail[0]; tail[0] = tail[--tailSize]; downHeapCost(tail, tailSize); return result; @@ -322,8 +321,8 @@ final class MinShouldMatchSumScorer extends Scorer { /** Heap helpers */ - private static void upHeapCost(ScorerWrapper[] heap, int i) { - final ScorerWrapper node = heap[i]; + private static void upHeapCost(DisiWrapper[] heap, int i) { + final DisiWrapper node = heap[i]; final long nodeCost = node.cost; int j = parentNode(i); while (j >= 0 && nodeCost < heap[j].cost) { @@ -334,9 +333,9 @@ final class MinShouldMatchSumScorer extends Scorer { heap[i] = node; } - private static void downHeapCost(ScorerWrapper[] heap, int size) { + private static void downHeapCost(DisiWrapper[] heap, int size) { int i = 0; - final ScorerWrapper node = heap[0]; + final DisiWrapper node = heap[0]; int j = leftNode(i); if (j < size) { int k = rightNode(j); diff --git a/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java b/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java index 0da7b0e241a..3df07a81b4f 100644 --- 
a/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java @@ -20,9 +20,13 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Objects; +import org.apache.lucene.search.spans.Spans; + /** - * Returned by {@link Scorer#asTwoPhaseIterator()} to expose an approximation of - * a {@link DocIdSetIterator}. When the {@link #approximation()}'s + * Returned by {@link Scorer#asTwoPhaseIterator()} + * and {@link Spans#asTwoPhaseIterator()} + * to expose an approximation of a {@link DocIdSetIterator}. + * When the {@link #approximation()}'s * {@link DocIdSetIterator#nextDoc()} or {@link DocIdSetIterator#advance(int)} * return, {@link #matches()} needs to be checked in order to know whether the * returned doc ID actually matches. @@ -89,4 +93,16 @@ public abstract class TwoPhaseIterator { * {@link DocIdSetIterator#NO_MORE_DOCS} -- and at most once. */ public abstract boolean matches() throws IOException; + /** + * Returns a {@link TwoPhaseIterator} for this {@link DocIdSetIterator} + * when available, otherwise returns null. + */ + public static TwoPhaseIterator asTwoPhaseIterator(DocIdSetIterator iter) { + return (iter instanceof Scorer) + ? ((Scorer) iter).asTwoPhaseIterator() + : (iter instanceof Spans) + ? 
((Spans) iter).asTwoPhaseIterator() + : null; + } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java index 71215d063cb..eca3635ecb5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java @@ -31,9 +31,13 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.search.Query; +import org.apache.lucene.search.DisiPriorityQueue; +import org.apache.lucene.search.DisiWrapper; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.DisjunctionDISIApproximation; + /** Matches the union of its clauses. */ @@ -42,7 +46,7 @@ public class SpanOrQuery extends SpanQuery implements Cloneable { private String field; /** Construct a SpanOrQuery merging the provided clauses. - * All clauses must have the same field. + * All clauses must have the same field. */ public SpanOrQuery(SpanQuery... 
clauses) { this.clauses = new ArrayList<>(clauses.length); @@ -146,35 +150,16 @@ public class SpanOrQuery extends SpanQuery implements Cloneable { } - private class SpanQueue extends PriorityQueue { - public SpanQueue(int size) { - super(size); - } - - @Override - protected final boolean lessThan(Spans spans1, Spans spans2) { - if (spans1.docID() == spans2.docID()) { - if (spans1.startPosition() == spans2.startPosition()) { - return spans1.endPosition() < spans2.endPosition(); - } else { - return spans1.startPosition() < spans2.startPosition(); - } - } else { - return spans1.docID() < spans2.docID(); - } - } - } - @Override public Spans getSpans(final LeafReaderContext context, final Bits acceptDocs, final Map termContexts) throws IOException { ArrayList subSpans = new ArrayList<>(clauses.size()); - for (SpanQuery seq : clauses) { - Spans subSpan = seq.getSpans(context, acceptDocs, termContexts); - if (subSpan != null) { - subSpans.add(subSpan); + for (SpanQuery sq : clauses) { + Spans spans = sq.getSpans(context, acceptDocs, termContexts); + if (spans != null) { + subSpans.add(spans); } } @@ -184,114 +169,168 @@ public class SpanOrQuery extends SpanQuery implements Cloneable { return subSpans.get(0); } - SpanQueue queue = new SpanQueue(clauses.size()); + DisiPriorityQueue byDocQueue = new DisiPriorityQueue<>(subSpans.size()); for (Spans spans : subSpans) { - queue.add(spans); + byDocQueue.add(new DisiWrapper<>(spans)); } + SpanPositionQueue byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1 + return new Spans() { + Spans topPositionSpans = null; @Override public int nextDoc() throws IOException { - if (queue.size() == 0) { // all done - return NO_MORE_DOCS; - } - - int currentDoc = top().docID(); - - if (currentDoc == -1) { // initially - return advance(0); - } - + topPositionSpans = null; + DisiWrapper topDocSpans = byDocQueue.top(); + int currentDoc = topDocSpans.doc; do { - if (top().nextDoc() != NO_MORE_DOCS) { // move top to next 
doc - queue.updateTop(); - } else { - queue.pop(); // exhausted a clause - if (queue.size() == 0) { - return NO_MORE_DOCS; - } - } - // assert queue.size() > 0; - int doc = top().docID(); - if (doc > currentDoc) { - return doc; - } - } while (true); - } - - private Spans top() { - return queue.top(); + topDocSpans.doc = topDocSpans.iterator.nextDoc(); + topDocSpans = byDocQueue.updateTop(); + } while (topDocSpans.doc == currentDoc); + return topDocSpans.doc; } @Override public int advance(int target) throws IOException { - - while ((queue.size() > 0) && (top().docID() < target)) { - if (top().advance(target) != NO_MORE_DOCS) { - queue.updateTop(); - } else { - queue.pop(); - } - } - - return (queue.size() > 0) ? top().docID() : NO_MORE_DOCS; + topPositionSpans = null; + DisiWrapper topDocSpans = byDocQueue.top(); + do { + topDocSpans.doc = topDocSpans.iterator.advance(target); + topDocSpans = byDocQueue.updateTop(); + } while (topDocSpans.doc < target); + return topDocSpans.doc; } @Override public int docID() { - return (queue == null) ? -1 - : (queue.size() > 0) ? top().docID() - : NO_MORE_DOCS; + DisiWrapper topDocSpans = byDocQueue.top(); + return topDocSpans.doc; } @Override - public int nextStartPosition() throws IOException { - top().nextStartPosition(); - queue.updateTop(); - int startPos = top().startPosition(); - while (startPos == -1) { // initially at this doc - top().nextStartPosition(); - queue.updateTop(); - startPos = top().startPosition(); + public TwoPhaseIterator asTwoPhaseIterator() { + boolean hasApproximation = false; + for (DisiWrapper w : byDocQueue) { + if (w.twoPhaseView != null) { + hasApproximation = true; + break; + } } - return startPos; + + if (! 
hasApproximation) { // none of the sub spans supports approximations + return null; + } + + return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) { + @Override + public boolean matches() throws IOException { + return twoPhaseCurrentDocMatches(); + } + }; + } + + int lastDocTwoPhaseMatched = -1; + + boolean twoPhaseCurrentDocMatches() throws IOException { + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); + // remove the head of the list as long as it does not match + final int currentDoc = listAtCurrentDoc.doc; + while (listAtCurrentDoc.twoPhaseView != null) { + if (listAtCurrentDoc.twoPhaseView.matches()) { + // use this spans for positions at current doc: + listAtCurrentDoc.lastApproxMatchDoc = currentDoc; + break; + } + // do not use this spans for positions at current doc: + listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc; + listAtCurrentDoc = listAtCurrentDoc.next; + if (listAtCurrentDoc == null) { + return false; + } + } + lastDocTwoPhaseMatched = currentDoc; + topPositionSpans = null; + return true; + } + + void fillPositionQueue() throws IOException { // called at first nextStartPosition + assert byPositionQueue.size() == 0; + // add all matching Spans at current doc to byPositionQueue + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); + while (listAtCurrentDoc != null) { + Spans spansAtDoc = listAtCurrentDoc.iterator; + if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation + if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation + if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false + spansAtDoc = null; + } else { + if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) { + if (! 
listAtCurrentDoc.twoPhaseView.matches()) { + spansAtDoc = null; + } + } + } + } + } + + if (spansAtDoc != null) { + assert spansAtDoc.docID() == listAtCurrentDoc.doc; + assert spansAtDoc.startPosition() == -1; + spansAtDoc.nextStartPosition(); + assert spansAtDoc.startPosition() != NO_MORE_POSITIONS; + byPositionQueue.add(spansAtDoc); + } + listAtCurrentDoc = listAtCurrentDoc.next; + } + assert byPositionQueue.size() > 0; + } + + @Override + public int nextStartPosition() throws IOException { + DisiWrapper topDocSpans = byDocQueue.top(); + assert topDocSpans.doc != NO_MORE_DOCS; + if (topPositionSpans == null) { + byPositionQueue.clear(); + fillPositionQueue(); // fills byPositionQueue at first position + topPositionSpans = byPositionQueue.top(); + } else { + topPositionSpans.nextStartPosition(); + topPositionSpans = byPositionQueue.updateTop(); + } + return topPositionSpans.startPosition(); } @Override public int startPosition() { - return top().startPosition(); + return topPositionSpans == null ? -1 : topPositionSpans.startPosition(); } @Override public int endPosition() { - return top().endPosition(); + return topPositionSpans == null ? -1 : topPositionSpans.endPosition(); } @Override public Collection getPayload() throws IOException { - ArrayList result = null; - Spans theTop = top(); - if (theTop != null && theTop.isPayloadAvailable()) { - result = new ArrayList<>(theTop.getPayload()); - } - return result; + return topPositionSpans == null + ? null + : topPositionSpans.isPayloadAvailable() + ? 
new ArrayList<>(topPositionSpans.getPayload()) + : null; } @Override public boolean isPayloadAvailable() throws IOException { - Spans top = top(); - return top != null && top.isPayloadAvailable(); + return (topPositionSpans != null) && topPositionSpans.isPayloadAvailable(); } @Override public String toString() { - return "spans("+SpanOrQuery.this+")@"+ - ((queue == null)?"START" - :(queue.size()>0?(docID()+": "+top().startPosition()+" - "+top().endPosition()):"END")); + return "spanOr("+SpanOrQuery.this+")@"+docID()+": "+startPosition()+" - "+endPosition(); } - private long cost = -1; + long cost = -1; @Override public long cost() { @@ -303,8 +342,8 @@ public class SpanOrQuery extends SpanQuery implements Cloneable { } return cost; } - }; } } + diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java new file mode 100644 index 00000000000..cf83d1384a9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java @@ -0,0 +1,35 @@ +package org.apache.lucene.search.spans; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.PriorityQueue; + +class SpanPositionQueue extends PriorityQueue { + SpanPositionQueue(int maxSize) { + super(maxSize, false); // do not prepopulate + } + + protected boolean lessThan(Spans s1, Spans s2) { + int start1 = s1.startPosition(); + int start2 = s2.startPosition(); + return (start1 < start2) ? true + : (start1 == start2) ? s1.endPosition() < s2.endPosition() + : false; + } +} + diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java index 1a799727e62..7bf112365e5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java @@ -86,11 +86,12 @@ public abstract class Spans extends DocIdSetIterator { * * Note that the returned {@link TwoPhaseIterator}'s * {@link TwoPhaseIterator#approximation() approximation} must - * advance synchronously with this iterator: advancing the approximation must + * advance documents synchronously with this iterator: + * advancing the approximation must * advance this iterator and vice-versa. * - * Implementing this method is typically useful on {@link Spans}s - * that have a high per-document overhead in order to confirm matches. + * Implementing this method is typically useful on a {@link Spans} + * that has a high per-document overhead for confirming matches. * * The default implementation returns {@code null}. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java index a973461e783..66b71bb8f21 100644 --- a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java @@ -89,7 +89,7 @@ public abstract class PriorityQueue { * value (i.e., {@link #lessThan} should always favor the * non-sentinel values).
* - * By default, this method returns false, which means the queue will not be + * By default, this method returns null, which means the queue will not be * filled with sentinel values. Otherwise, the value returned will be used to * pre-populate the queue. Adds sentinel values to the queue.
*