From 0a1de2c4a57d2f67ed7751c3d5cc7cb0250230b2 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 25 May 2018 08:47:47 +0200 Subject: [PATCH] LUCENE-8312: Leverage impacts to speed up SynonymQuery. --- .../apache/lucene/document/FeatureQuery.java | 18 +- .../org/apache/lucene/index/ImpactsEnum.java | 29 +- .../apache/lucene/index/ImpactsSource.java | 54 ++++ .../org/apache/lucene/search/ImpactsDISI.java | 149 +++++++++ .../apache/lucene/search/MaxScoreCache.java | 52 ++-- .../apache/lucene/search/SynonymQuery.java | 292 ++++++++++++++++-- .../org/apache/lucene/search/TermQuery.java | 6 +- .../org/apache/lucene/search/TermScorer.java | 114 ++----- .../lucene/search/TestSynonymQuery.java | 235 +++++++++++++- .../apache/lucene/search/TestTermScorer.java | 21 +- 10 files changed, 756 insertions(+), 214 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/index/ImpactsSource.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java index 91b98ce1e2b..efc2d00e5c0 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java @@ -30,8 +30,8 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.ImpactsDISI; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MaxScoreCache; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; @@ -133,9 +133,9 @@ final class FeatureQuery extends Query { return null; } - SimScorer scorer = function.scorer(boost); - ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS); - MaxScoreCache maxScoreCache = new 
MaxScoreCache(impacts, scorer); + final SimScorer scorer = function.scorer(boost); + final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS); + final ImpactsDISI impactsDisi = new ImpactsDISI(impacts, impacts, scorer); return new Scorer(this) { @@ -151,19 +151,23 @@ final class FeatureQuery extends Query { @Override public DocIdSetIterator iterator() { - return impacts; + return impactsDisi; } @Override public int advanceShallow(int target) throws IOException { - return maxScoreCache.advanceShallow(target); + return impactsDisi.advanceShallow(target); } @Override public float getMaxScore(int upTo) throws IOException { - return maxScoreCache.getMaxScore(upTo); + return impactsDisi.getMaxScore(upTo); } + @Override + public void setMinCompetitiveScore(float minScore) { + impactsDisi.setMinCompetitiveScore(minScore); + } }; } diff --git a/lucene/core/src/java/org/apache/lucene/index/ImpactsEnum.java b/lucene/core/src/java/org/apache/lucene/index/ImpactsEnum.java index 5a354770d0d..af25a12d898 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ImpactsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/ImpactsEnum.java @@ -16,41 +16,14 @@ */ package org.apache.lucene.index; -import java.io.IOException; - /** * Extension of {@link PostingsEnum} which also provides information about * upcoming impacts. * @lucene.experimental */ -public abstract class ImpactsEnum extends PostingsEnum { +public abstract class ImpactsEnum extends PostingsEnum implements ImpactsSource { /** Sole constructor. */ protected ImpactsEnum() {} - /** - * Shallow-advance to {@code target}. This is cheaper than calling - * {@link #advance(int)} and allows further calls to {@link #getImpacts()} - * to ignore doc IDs that are less than {@code target} in order to get more - * precise information about impacts. - * This method may not be called on targets that are less than the current - * {@link #docID()}. 
- * After this method has been called, {@link #nextDoc()} may not be called - * if the current doc ID is less than {@code target - 1} and - * {@link #advance(int)} may not be called on targets that are less than - * {@code target}. - */ - public abstract void advanceShallow(int target) throws IOException; - - /** - * Get information about upcoming impacts for doc ids that are greater than - * or equal to the maximum of {@link #docID()} and the last target that was - * passed to {@link #advanceShallow(int)}. - * This method may not be called on an unpositioned iterator on which - * {@link #advanceShallow(int)} has never been called. - * NOTE: advancing this iterator may invalidate the returned impacts, so they - * should not be used after the iterator has been advanced. - */ - public abstract Impacts getImpacts() throws IOException; - } diff --git a/lucene/core/src/java/org/apache/lucene/index/ImpactsSource.java b/lucene/core/src/java/org/apache/lucene/index/ImpactsSource.java new file mode 100644 index 00000000000..209b4dcf1bd --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/ImpactsSource.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.index; + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; + +/** + * Source of {@link Impacts}. + * @lucene.internal + */ +public interface ImpactsSource { + + /** + * Shallow-advance to {@code target}. This is cheaper than calling + * {@link DocIdSetIterator#advance(int)} and allows further calls to + * {@link #getImpacts()} to ignore doc IDs that are less than {@code target} + * in order to get more precise information about impacts. + * This method may not be called on targets that are less than the current + * {@link DocIdSetIterator#docID()}. + * After this method has been called, {@link DocIdSetIterator#nextDoc()} may + * not be called if the current doc ID is less than {@code target - 1} and + * {@link DocIdSetIterator#advance(int)} may not be called on targets that + * are less than {@code target}. + */ + void advanceShallow(int target) throws IOException; + + /** + * Get information about upcoming impacts for doc ids that are greater than + * or equal to the maximum of {@link DocIdSetIterator#docID()} and the last + * target that was passed to {@link #advanceShallow(int)}. + * This method may not be called on an unpositioned iterator on which + * {@link #advanceShallow(int)} has never been called. + * NOTE: advancing this iterator may invalidate the returned impacts, so they + * should not be used after the iterator has been advanced. + */ + Impacts getImpacts() throws IOException; + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java new file mode 100644 index 00000000000..0f978edd94b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsDISI.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; +import org.apache.lucene.search.similarities.Similarity.SimScorer; + +/** + * {@link DocIdSetIterator} that skips non-competitive docs thanks to the + * indexed impacts. Call {@link #setMinCompetitiveScore(float)} in order to + * give this iterator the ability to skip low-scoring documents. + * @lucene.internal + */ +public final class ImpactsDISI extends DocIdSetIterator { + + private final DocIdSetIterator in; + private final ImpactsSource impactsSource; + private final MaxScoreCache maxScoreCache; + private final float globalMaxScore; + private float minCompetitiveScore = 0; + private int upTo = DocIdSetIterator.NO_MORE_DOCS; + private float maxScore = Float.MAX_VALUE; + + /** + * Sole constructor. + * @param in wrapped iterator + * @param impactsSource source of impacts + * @param scorer scorer + */ + public ImpactsDISI(DocIdSetIterator in, ImpactsSource impactsSource, SimScorer scorer) { + this.in = in; + this.impactsSource = impactsSource; + this.maxScoreCache = new MaxScoreCache(impactsSource, scorer); + this.globalMaxScore = scorer.score(Float.MAX_VALUE, 1L); + } + + /** + * Set the minimum competitive score. 
+ * @see Scorer#setMinCompetitiveScore(float) + */ + public void setMinCompetitiveScore(float minCompetitiveScore) { + assert minCompetitiveScore >= this.minCompetitiveScore; + if (minCompetitiveScore > this.minCompetitiveScore) { + this.minCompetitiveScore = minCompetitiveScore; + // force upTo and maxScore to be recomputed so that we will skip documents + // if the current block of documents is not competitive - only if the min + // competitive score actually increased + upTo = -1; + } + } + + /** + * Implement the contract of {@link Scorer#advanceShallow(int)} based on the + * wrapped {@link ImpactsEnum}. + * @see Scorer#advanceShallow(int) + */ + public int advanceShallow(int target) throws IOException { + impactsSource.advanceShallow(target); + Impacts impacts = impactsSource.getImpacts(); + return impacts.getDocIdUpTo(0); + } + + /** + * Implement the contract of {@link Scorer#getMaxScore(int)} based on the + * wrapped {@link ImpactsEnum} and {@link Scorer}. + * @see Scorer#getMaxScore(int) + */ + public float getMaxScore(int upTo) throws IOException { + final int level = maxScoreCache.getLevel(upTo); + if (level == -1) { + return globalMaxScore; + } else { + return maxScoreCache.getMaxScoreForLevel(level); + } + } + + private int advanceTarget(int target) throws IOException { + if (target <= upTo) { + // we are still in the current block, which is considered competitive + // according to impacts, no skipping + return target; + } + + upTo = advanceShallow(target); + maxScore = maxScoreCache.getMaxScoreForLevel(0); + + while (true) { + assert upTo >= target; + + if (maxScore >= minCompetitiveScore) { + return target; + } + + if (upTo == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + + final int skipUpTo = maxScoreCache.getSkipUpTo(minCompetitiveScore); + if (skipUpTo == -1) { // no further skipping + target = upTo + 1; + } else if (skipUpTo == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } else { + target = skipUpTo + 1; + } + upTo = advanceShallow(target); + 
maxScore = maxScoreCache.getMaxScoreForLevel(0); + } + } + + @Override + public int advance(int target) throws IOException { + return in.advance(advanceTarget(target)); + } + + @Override + public int nextDoc() throws IOException { + return advance(in.docID() + 1); + } + + @Override + public int docID() { + return in.docID(); + } + + @Override + public long cost() { + return in.cost(); + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java index 719bc146f8b..17e4efc0c16 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java @@ -22,7 +22,7 @@ import java.util.List; import org.apache.lucene.index.Impact; import org.apache.lucene.index.Impacts; -import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.ArrayUtil; @@ -30,22 +30,21 @@ import org.apache.lucene.util.ArrayUtil; * Compute maximum scores based on {@link Impacts} and keep them in a cache in * order not to run expensive similarity score computations multiple times on * the same data. + * @lucene.internal */ -public final class MaxScoreCache { +final class MaxScoreCache { - private final ImpactsEnum impactsEnum; + private final ImpactsSource impactsSource; private final SimScorer scorer; - private final float globalMaxScore; private float[] maxScoreCache; private int[] maxScoreCacheUpTo; /** * Sole constructor. 
*/ - public MaxScoreCache(ImpactsEnum impactsEnum, SimScorer scorer) { - this.impactsEnum = impactsEnum; + public MaxScoreCache(ImpactsSource impactsSource, SimScorer scorer) { + this.impactsSource = impactsSource; this.scorer = scorer; - globalMaxScore = scorer.score(Integer.MAX_VALUE, 1L); maxScoreCache = new float[0]; maxScoreCacheUpTo = new int[0]; } @@ -71,8 +70,8 @@ public final class MaxScoreCache { * Return the first level that includes all doc IDs up to {@code upTo}, * or -1 if there is no such level. */ - private int getLevel(int upTo) throws IOException { - final Impacts impacts = impactsEnum.getImpacts(); + int getLevel(int upTo) throws IOException { + final Impacts impacts = impactsSource.getImpacts(); for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { final int impactsUpTo = impacts.getDocIdUpTo(level); if (upTo <= impactsUpTo) { @@ -86,7 +85,7 @@ public final class MaxScoreCache { * Return the maximum score for the given {@code level}. */ float getMaxScoreForLevel(int level) throws IOException { - final Impacts impacts = impactsEnum.getImpacts(); + final Impacts impacts = impactsSource.getImpacts(); ensureCacheSize(level + 1); final int levelUpTo = impacts.getDocIdUpTo(level); if (maxScoreCacheUpTo[level] < levelUpTo) { @@ -100,8 +99,7 @@ public final class MaxScoreCache { * Return the maximum level at which scores are all less than {@code minScore}, * or -1 if none. */ - int getSkipLevel(float minScore) throws IOException { - final Impacts impacts = impactsEnum.getImpacts(); + private int getSkipLevel(Impacts impacts, float minScore) throws IOException { final int numLevels = impacts.numLevels(); for (int level = 0; level < numLevels; ++level) { if (getMaxScoreForLevel(level) >= minScore) { @@ -112,27 +110,17 @@ public final class MaxScoreCache { } /** - * Implement the contract of {@link Scorer#advanceShallow(int)} based on the - * wrapped {@link ImpactsEnum}. 
- * @see Scorer#advanceShallow(int) + * Return the an inclusive upper bound of documents that all have a score that + * is less than {@code minScore}, or {@code -1} if the current document may + * be competitive. */ - public int advanceShallow(int target) throws IOException { - impactsEnum.advanceShallow(target); - Impacts impacts = impactsEnum.getImpacts(); - return impacts.getDocIdUpTo(0); + int getSkipUpTo(float minScore) throws IOException { + final Impacts impacts = impactsSource.getImpacts(); + final int level = getSkipLevel(impacts, minScore); + if (level == -1) { + return -1; + } + return impacts.getDocIdUpTo(level); } - /** - * Implement the contract of {@link Scorer#getMaxScore(int)} based on the - * wrapped {@link ImpactsEnum} and {@link Scorer}. - * @see Scorer#getMaxScore(int) - */ - public float getMaxScore(int upTo) throws IOException { - final int level = getLevel(upTo); - if (level == -1) { - return globalMaxScore; - } else { - return getMaxScoreForLevel(level); - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 84ceab0ebd7..68453ac014d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -21,12 +21,19 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Objects; import java.util.Set; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SlowImpactsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermState; 
import org.apache.lucene.index.TermStates; @@ -34,6 +41,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; /** * A query that treats multiple terms as synonyms. @@ -112,7 +120,7 @@ public final class SynonymQuery extends Query { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { if (scoreMode.needsScores()) { - return new SynonymWeight(this, searcher, boost); + return new SynonymWeight(this, searcher, scoreMode, boost); } else { // if scores are not needed, let BooleanWeight deal with optimizing that case. BooleanQuery.Builder bq = new BooleanQuery.Builder(); @@ -127,9 +135,12 @@ public final class SynonymQuery extends Query { private final TermStates termStates[]; private final Similarity similarity; private final Similarity.SimScorer simWeight; + private final ScoreMode scoreMode; - SynonymWeight(Query query, IndexSearcher searcher, float boost) throws IOException { + SynonymWeight(Query query, IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { super(query); + assert scoreMode.needsScores(); + this.scoreMode = scoreMode; CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].field()); long docFreq = 0; long totalTermFreq = 0; @@ -176,8 +187,7 @@ public final class SynonymQuery extends Query { if (newDoc == doc) { final float freq; if (scorer instanceof SynonymScorer) { - SynonymScorer synScorer = (SynonymScorer) scorer; - freq = synScorer.tf(synScorer.getSubMatches()); + freq = ((SynonymScorer) scorer).freq(); } else { assert scorer instanceof TermScorer; freq = ((TermScorer)scorer).freq(); @@ -197,61 +207,277 @@ public final class SynonymQuery extends Query { @Override public Scorer scorer(LeafReaderContext context) throws IOException { - // we use termscorers + disjunction 
as an impl detail - List subScorers = new ArrayList<>(); + List iterators = new ArrayList<>(); + List impacts = new ArrayList<>(); for (int i = 0; i < terms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator(); termsEnum.seekExact(terms[i].bytes(), state); - LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].field(), true); - subScorers.add(new TermScorer(this, termsEnum, ScoreMode.COMPLETE, simScorer)); + if (scoreMode == ScoreMode.TOP_SCORES) { + ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); + iterators.add(impactsEnum); + impacts.add(impactsEnum); + } else { + PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); + iterators.add(postingsEnum); + impacts.add(new SlowImpactsEnum(postingsEnum)); + } } } - if (subScorers.isEmpty()) { + + if (iterators.isEmpty()) { return null; - } else if (subScorers.size() == 1) { - // we must optimize this case (term not in segment), disjunctionscorer requires >= 2 subs - return subScorers.get(0); - } else { - LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].field(), true); - return new SynonymScorer(simScorer, this, subScorers); } + + LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].field(), true); + + // we must optimize this case (term not in segment), disjunctions require >= 2 subs + if (iterators.size() == 1) { + if (scoreMode == ScoreMode.TOP_SCORES) { + return new TermScorer(this, impacts.get(0), simScorer); + } else { + return new TermScorer(this, iterators.get(0), simScorer); + } + } + + // we use termscorers + disjunction as an impl detail + DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); + for (PostingsEnum postings : iterators) { + queue.add(new DisiWrapper(new TermScorer(this, postings, simScorer))); + } + // Even though it is called approximation, it is accurate 
since none of + // the sub iterators are two-phase iterators. + DocIdSetIterator iterator = new DisjunctionDISIApproximation(queue); + + ImpactsSource impactsSource = mergeImpacts(impacts.toArray(new ImpactsEnum[0])); + ImpactsDISI impactsDisi = new ImpactsDISI(iterator, impactsSource, simScorer.getSimScorer()); + + if (scoreMode == ScoreMode.TOP_SCORES) { + iterator = impactsDisi; + } + + return new SynonymScorer(this, queue, iterator, impactsDisi, simScorer); } @Override public boolean isCacheable(LeafReaderContext ctx) { return true; } - } - static class SynonymScorer extends DisjunctionScorer { - private final LeafSimScorer similarity; - - SynonymScorer(LeafSimScorer similarity, Weight weight, List subScorers) { - super(weight, subScorers, true); - this.similarity = similarity; + /** + * Merge impacts for multiple synonyms. + */ + static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) { + return new ImpactsSource() { + + class SubIterator { + final Iterator iterator; + int previousFreq; + Impact current; + + SubIterator(Iterator iterator) { + this.iterator = iterator; + this.current = iterator.next(); + } + + void next() { + previousFreq = current.freq; + if (iterator.hasNext() == false) { + current = null; + } else { + current = iterator.next(); + } + } + + } + + @Override + public Impacts getImpacts() throws IOException { + final Impacts[] impacts = new Impacts[impactsEnums.length]; + // Use the impacts that have the lower next boundary as a lead. + // It will decide on the number of levels and the block boundaries. 
+ Impacts tmpLead = null; + for (int i = 0; i < impactsEnums.length; ++i) { + impacts[i] = impactsEnums[i].getImpacts(); + if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts[i]; + } + } + final Impacts lead = tmpLead; + return new Impacts() { + + @Override + public int numLevels() { + // Delegate to the lead + return lead.numLevels(); + } + + @Override + public int getDocIdUpTo(int level) { + // Delegate to the lead + return lead.getDocIdUpTo(level); + } + + /** + * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, + * or {@code -1} if there is no such level. + */ + private int getLevel(Impacts impacts, int docIdUpTo) { + for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { + if (impacts.getDocIdUpTo(level) >= docIdUpTo) { + return level; + } + } + return -1; + } + + @Override + public List getImpacts(int level) { + final int docIdUpTo = getDocIdUpTo(level); + + List> toMerge = new ArrayList<>(); + + for (int i = 0; i < impactsEnums.length; ++i) { + if (impactsEnums[i].docID() <= docIdUpTo) { + int impactsLevel = getLevel(impacts[i], docIdUpTo); + if (impactsLevel == -1) { + // One instance doesn't have impacts that cover up to docIdUpTo + // Return impacts that trigger the maximum score + return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + } + toMerge.add(impacts[i].getImpacts(impactsLevel)); + } + } + assert toMerge.size() > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong + + if (toMerge.size() == 1) { + // common if one synonym is common and the other one is rare + return toMerge.get(0); + } + + PriorityQueue pq = new PriorityQueue(impacts.length) { + @Override + protected boolean lessThan(SubIterator a, SubIterator b) { + if (a.current == null) { // means iteration is finished + return false; + } + if (b.current == null) { + return true; + } + return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; + } + }; + 
for (List impacts : toMerge) { + pq.add(new SubIterator(impacts.iterator())); + } + + List mergedImpacts = new ArrayList<>(); + + // Idea: merge impacts by norm. The tricky thing is that we need to + // consider norm values that are not in the impacts too. For + // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], + // there might well be a document that has a freq of 2 and a length of 11, + // which was just not added to the list of impacts because {freq=2,norm=10} + // is more competitive. So the way it works is that we track the sum of + // the term freqs that we have seen so far in order to account for these + // implicit impacts. + + long sumTf = 0; + SubIterator top = pq.top(); + do { + final long norm = top.current.norm; + do { + sumTf += top.current.freq - top.previousFreq; + top.next(); + top = pq.updateTop(); + } while (top.current != null && top.current.norm == norm); + + final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); + if (mergedImpacts.isEmpty()) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } else { + Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); + assert Long.compareUnsigned(prevImpact.norm, norm) < 0; + if (freqUpperBound > prevImpact.freq) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } // otherwise the previous impact is already more competitive + } + } while (top.current != null); + + return mergedImpacts; + } + }; + } + + @Override + public void advanceShallow(int target) throws IOException { + for (ImpactsEnum impactsEnum : impactsEnums) { + if (impactsEnum.docID() < target) { + impactsEnum.advanceShallow(target); + } + } + } + }; + } + + private static class SynonymScorer extends Scorer { + + private final DisiPriorityQueue queue; + private final DocIdSetIterator iterator; + private final ImpactsDISI impactsDisi; + private final LeafSimScorer simScorer; + + SynonymScorer(Weight weight, DisiPriorityQueue queue, DocIdSetIterator iterator, + ImpactsDISI 
impactsDisi, LeafSimScorer simScorer) { + super(weight); + this.queue = queue; + this.iterator = iterator; + this.impactsDisi = impactsDisi; + this.simScorer = simScorer; } @Override - protected float score(DisiWrapper topList) throws IOException { - return similarity.score(topList.doc, tf(topList)); + public int docID() { + return iterator.docID(); + } + + int freq() throws IOException { + DisiWrapper w = queue.topList(); + int freq = ((PostingsEnum) w.iterator).freq(); + for (w = w.next; w != null; w = w.next) { + freq += ((PostingsEnum) w.iterator).freq(); + if (freq < 0) { // overflow + return Integer.MAX_VALUE; + } + } + return freq; + } + + @Override + public float score() throws IOException { + return simScorer.score(iterator.docID(), freq()); + } + + @Override + public DocIdSetIterator iterator() { + return iterator; } @Override public float getMaxScore(int upTo) throws IOException { - // TODO: merge impacts to get better score upper bounds - return similarity.getSimScorer().score(Float.MAX_VALUE, 1L); + return impactsDisi.getMaxScore(upTo); } - /** combines TF of all subs. 
*/ - final int tf(DisiWrapper topList) throws IOException { - int tf = 0; - for (DisiWrapper w = topList; w != null; w = w.next) { - tf += ((TermScorer)w.scorer).freq(); - } - return tf; + @Override + public int advanceShallow(int target) throws IOException { + return impactsDisi.advanceShallow(target); + } + + @Override + public void setMinCompetitiveScore(float minScore) { + impactsDisi.setMinCompetitiveScore(minScore); } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index b7b25cab6e0..022dc994838 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -111,7 +111,11 @@ public class TermQuery extends Query { return null; } LeafSimScorer scorer = new LeafSimScorer(simScorer, context.reader(), term.field(), scoreMode.needsScores()); - return new TermScorer(this, termsEnum, scoreMode, scorer); + if (scoreMode == ScoreMode.TOP_SCORES) { + return new TermScorer(this, termsEnum.impacts(PostingsEnum.FREQS), scorer); + } else { + return new TermScorer(this, termsEnum.postings(null, PostingsEnum.FREQS), scorer); + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java index 66cf8332eae..7d3f6e3d1f2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java @@ -19,11 +19,9 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.index.Impacts; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SlowImpactsEnum; -import org.apache.lucene.index.TermsEnum; /** Expert: A Scorer for documents matching a Term. 
*/ @@ -32,101 +30,29 @@ final class TermScorer extends Scorer { private final ImpactsEnum impactsEnum; private final DocIdSetIterator iterator; private final LeafSimScorer docScorer; - private final MaxScoreCache maxScoreCache; - private float minCompetitiveScore; + private final ImpactsDISI impactsDisi; /** - * Construct a TermScorer. - * - * @param weight - * The weight of the Term in the query. - * @param te - * A {@link TermsEnum} positioned on the expected term. - * @param docScorer - * A {@link LeafSimScorer} for the appropriate field. + * Construct a {@link TermScorer} that will iterate all documents. */ - TermScorer(Weight weight, TermsEnum te, ScoreMode scoreMode, LeafSimScorer docScorer) throws IOException { + TermScorer(Weight weight, PostingsEnum postingsEnum, LeafSimScorer docScorer) { super(weight); + iterator = this.postingsEnum = postingsEnum; + impactsEnum = new SlowImpactsEnum(postingsEnum); + impactsDisi = new ImpactsDISI(impactsEnum, impactsEnum, docScorer.getSimScorer()); this.docScorer = docScorer; - if (scoreMode == ScoreMode.TOP_SCORES) { - impactsEnum = te.impacts(PostingsEnum.FREQS); - maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer()); - postingsEnum = impactsEnum; - iterator = new DocIdSetIterator() { + } - int upTo = -1; - float maxScore; - - private int advanceTarget(int target) throws IOException { - if (minCompetitiveScore == 0) { - // no potential for skipping - return target; - } - - if (target > upTo) { - impactsEnum.advanceShallow(target); - Impacts impacts = impactsEnum.getImpacts(); - upTo = impacts.getDocIdUpTo(0); - maxScore = maxScoreCache.getMaxScoreForLevel(0); - } - - while (true) { - assert upTo >= target; - - if (maxScore >= minCompetitiveScore) { - return target; - } - - if (upTo == NO_MORE_DOCS) { - return NO_MORE_DOCS; - } - - impactsEnum.advanceShallow(upTo + 1); - Impacts impacts = impactsEnum.getImpacts(); - final int level = maxScoreCache.getSkipLevel(minCompetitiveScore); - if (level >= 0) 
{ - // we can skip more docs - int newUpTo = impacts.getDocIdUpTo(level); - if (newUpTo == NO_MORE_DOCS) { - return NO_MORE_DOCS; - } - target = newUpTo + 1; - impactsEnum.advanceShallow(target); - impacts = impactsEnum.getImpacts(); - } else { - target = upTo + 1; - } - upTo = impacts.getDocIdUpTo(0); - maxScore = maxScoreCache.getMaxScoreForLevel(0); - } - } - - @Override - public int advance(int target) throws IOException { - return impactsEnum.advance(advanceTarget(target)); - } - - @Override - public int nextDoc() throws IOException { - return advance(impactsEnum.docID() + 1); - } - - @Override - public int docID() { - return impactsEnum.docID(); - } - - @Override - public long cost() { - return impactsEnum.cost(); - } - }; - } else { - postingsEnum = te.postings(null, scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE); - impactsEnum = new SlowImpactsEnum(postingsEnum); - maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer()); - iterator = postingsEnum; - } + /** + * Construct a {@link TermScorer} that will use impacts to skip blocks of + * non-competitive documents. 
+ */ + TermScorer(Weight weight, ImpactsEnum impactsEnum, LeafSimScorer docScorer) { + super(weight); + postingsEnum = this.impactsEnum = impactsEnum; + impactsDisi = new ImpactsDISI(impactsEnum, impactsEnum, docScorer.getSimScorer()); + iterator = impactsDisi; + this.docScorer = docScorer; } @Override @@ -151,17 +77,17 @@ final class TermScorer extends Scorer { @Override public int advanceShallow(int target) throws IOException { - return maxScoreCache.advanceShallow(target); + return impactsDisi.advanceShallow(target); } @Override public float getMaxScore(int upTo) throws IOException { - return maxScoreCache.getMaxScore(upTo); + return impactsDisi.getMaxScore(upTo); } @Override public void setMinCompetitiveScore(float minScore) { - this.minCompetitiveScore = minScore; + impactsDisi.setMinCompetitiveScore(minScore); } /** Returns a string representation of this TermScorer. */ diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java index bc0a2d071dd..44f915e0c49 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSynonymQuery.java @@ -18,27 +18,39 @@ package org.apache.lucene.search; import java.io.IOException; +import java.util.Arrays; +import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; 
import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; public class TestSynonymQuery extends LuceneTestCase { public void testEquals() { QueryUtils.checkEqual(new SynonymQuery(), new SynonymQuery()); - QueryUtils.checkEqual(new SynonymQuery(new Term("foo", "bar")), + QueryUtils.checkEqual(new SynonymQuery(new Term("foo", "bar")), new SynonymQuery(new Term("foo", "bar"))); - - QueryUtils.checkEqual(new SynonymQuery(new Term("a", "a"), new Term("a", "b")), + + QueryUtils.checkEqual(new SynonymQuery(new Term("a", "a"), new Term("a", "b")), new SynonymQuery(new Term("a", "b"), new Term("a", "a"))); } - + public void testBogusParams() { expectThrows(IllegalArgumentException.class, () -> { new SynonymQuery(new Term("field1", "a"), new Term("field2", "b")); @@ -54,6 +66,11 @@ public class TestSynonymQuery extends LuceneTestCase { } public void testScores() throws IOException { + doTestScores(false); + doTestScores(true); + } + + private void doTestScores(boolean trackTotalHits) throws IOException { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); @@ -71,8 +88,12 @@ public class TestSynonymQuery extends LuceneTestCase { IndexSearcher searcher = newSearcher(reader); SynonymQuery query = new SynonymQuery(new Term("f", "a"), new Term("f", "b")); - TopDocs topDocs = searcher.search(query, 20); - assertEquals(11, topDocs.totalHits); + TopScoreDocCollector collector = TopScoreDocCollector.create(20, null, trackTotalHits); + searcher.search(query, collector); + TopDocs topDocs = collector.topDocs(); + if (trackTotalHits) { + assertEquals(11, topDocs.totalHits); + } // All docs must have the same score for (int i = 0; i < topDocs.scoreDocs.length; ++i) { assertEquals(topDocs.scoreDocs[0].score, topDocs.scoreDocs[i].score, 0.0f); @@ -83,4 +104,206 @@ public class TestSynonymQuery extends LuceneTestCase { dir.close(); } + 
public void testMergeImpacts() throws IOException { + DummyImpactsEnum impacts1 = new DummyImpactsEnum(); + impacts1.reset(42, + new Impact[][] { + new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) }, + new Impact[] { new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) } + }, + new int[] { + 110, + 945 + }); + DummyImpactsEnum impacts2 = new DummyImpactsEnum(); + impacts2.reset(45, + new Impact[][] { + new Impact[] { new Impact(2, 10), new Impact(6, 13) }, + new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) } + }, + new int[] { + 90, + 1000 + }); + + ImpactsSource mergedImpacts = SynonymQuery.mergeImpacts(new ImpactsEnum[] { impacts1, impacts2 }); + assertEquals( + new Impact[][] { + new Impact[] { new Impact(5, 10), new Impact(7, 12), new Impact(14, 13) }, + new Impact[] { new Impact(Integer.MAX_VALUE, 1) } + }, + new int[] { + 90, + 1000 + }, + mergedImpacts.getImpacts()); + + // docID is > the first docIdUpTo of impacts1 + impacts2.reset(112, + new Impact[][] { + new Impact[] { new Impact(2, 10), new Impact(6, 13) }, + new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) } + }, + new int[] { + 150, + 1000 + }); + assertEquals( + new Impact[][] { + new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) }, // same as impacts1 + new Impact[] { new Impact(3, 9), new Impact(10, 11), new Impact(15, 13), new Impact(19, 14) } + }, + new int[] { + 110, + 945 + }, + mergedImpacts.getImpacts()); + } + + private static void assertEquals(Impact[][] impacts, int[] docIdUpTo, Impacts actual) { + assertEquals(impacts.length, actual.numLevels()); + for (int i = 0; i < impacts.length; ++i) { + assertEquals(docIdUpTo[i], actual.getDocIdUpTo(i)); + assertEquals(Arrays.asList(impacts[i]), actual.getImpacts(i)); + } + } + + private static class DummyImpactsEnum extends ImpactsEnum { + + private int docID; + private Impact[][] impacts; + private int[] docIdUpTo; + + void reset(int docID, Impact[][] impacts, 
int[] docIdUpTo) { + this.docID = docID; + this.impacts = impacts; + this.docIdUpTo = docIdUpTo; + } + + @Override + public void advanceShallow(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Impacts getImpacts() throws IOException { + return new Impacts() { + + @Override + public int numLevels() { + return impacts.length; + } + + @Override + public int getDocIdUpTo(int level) { + return docIdUpTo[level]; + } + + @Override + public List<Impact> getImpacts(int level) { + return Arrays.asList(impacts[level]); + } + + }; + } + + @Override + public int freq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int nextPosition() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int startOffset() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int endOffset() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef getPayload() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + + } + + public void testRandomTopDocs() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + int numDocs = atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + int numValues = random().nextInt(1 << random().nextInt(5)); + int start = random().nextInt(10); + for (int j = 0; j < numValues; ++j) { + int freq = TestUtil.nextInt(random(), 1, 1 << 
random().nextInt(3)); + for (int k = 0; k < freq; ++k) { + doc.add(new TextField("foo", Integer.toString(start + j), Store.NO)); + } + } + w.addDocument(doc); + } + IndexReader reader = DirectoryReader.open(w); + w.close(); + IndexSearcher searcher = newSearcher(reader); + + for (int term1 = 0; term1 < 15; ++term1) { + int term2; + do { + term2 = random().nextInt(15); + } while (term1 == term2); + Query query = new SynonymQuery( + new Term[] { + new Term("foo", Integer.toString(term1)), + new Term("foo", Integer.toString(term2))}); + + TopScoreDocCollector collector1 = TopScoreDocCollector.create(10, null, true); // COMPLETE + TopScoreDocCollector collector2 = TopScoreDocCollector.create(10, null, false); // TOP_SCORES + + searcher.search(query, collector1); + searcher.search(query, collector2); + CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs); + + int filterTerm = random().nextInt(15); + Query filteredQuery = new BooleanQuery.Builder() + .add(query, Occur.MUST) + .add(new TermQuery(new Term("foo", Integer.toString(filterTerm))), Occur.FILTER) + .build(); + + collector1 = TopScoreDocCollector.create(10, null, true); // COMPLETE + collector2 = TopScoreDocCollector.create(10, null, false); // TOP_SCORES + searcher.search(filteredQuery, collector1); + searcher.search(filteredQuery, collector2); + CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs); + } + reader.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java index 89fe2a84941..b98ae4109c1 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java @@ -24,7 +24,7 @@ import java.util.List; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; 
-import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FilterLeafReader; @@ -39,6 +39,7 @@ import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; public class TestTermScorer extends LuceneTestCase { protected Directory directory; @@ -218,7 +219,10 @@ public class TestTermScorer extends LuceneTestCase { int numValues = random().nextInt(1 << random().nextInt(5)); int start = random().nextInt(10); for (int j = 0; j < numValues; ++j) { - doc.add(new StringField("foo", Integer.toString(start + j), Store.NO)); + int freq = TestUtil.nextInt(random(), 1, 1 << random().nextInt(3)); + for (int k = 0; k < freq; ++k) { + doc.add(new TextField("foo", Integer.toString(start + j), Store.NO)); + } } w.addDocument(doc); } @@ -234,7 +238,7 @@ public class TestTermScorer extends LuceneTestCase { searcher.search(query, collector1); searcher.search(query, collector2); - assertTopDocsEquals(collector1.topDocs(), collector2.topDocs()); + CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs); int filterTerm = random().nextInt(15); Query filteredQuery = new BooleanQuery.Builder() @@ -246,19 +250,10 @@ public class TestTermScorer extends LuceneTestCase { collector2 = TopScoreDocCollector.create(10, null, false); // TOP_SCORES searcher.search(filteredQuery, collector1); searcher.search(filteredQuery, collector2); - assertTopDocsEquals(collector1.topDocs(), collector2.topDocs()); + CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs); } reader.close(); dir.close(); } - private static void assertTopDocsEquals(TopDocs td1, TopDocs td2) { - 
assertEquals(td1.scoreDocs.length, td2.scoreDocs.length); - for (int i = 0; i < td1.scoreDocs.length; ++i) { - ScoreDoc sd1 = td1.scoreDocs[i]; - ScoreDoc sd2 = td2.scoreDocs[i]; - assertEquals(sd1.doc, sd2.doc); - assertEquals(sd1.score, sd2.score, 0f); - } - } }