LUCENE-8312: Leverage impacts to speed up SynonymQuery.

This commit is contained in:
Adrien Grand 2018-05-25 08:47:47 +02:00
parent 24d79de796
commit 0a1de2c4a5
10 changed files with 756 additions and 214 deletions

View File

@ -30,8 +30,8 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ImpactsDISI;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MaxScoreCache;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
@ -133,9 +133,9 @@ final class FeatureQuery extends Query {
return null;
}
SimScorer scorer = function.scorer(boost);
ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS);
MaxScoreCache maxScoreCache = new MaxScoreCache(impacts, scorer);
final SimScorer scorer = function.scorer(boost);
final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS);
final ImpactsDISI impactsDisi = new ImpactsDISI(impacts, impacts, scorer);
return new Scorer(this) {
@ -151,19 +151,23 @@ final class FeatureQuery extends Query {
@Override
public DocIdSetIterator iterator() {
return impacts;
return impactsDisi;
}
@Override
public int advanceShallow(int target) throws IOException {
return maxScoreCache.advanceShallow(target);
return impactsDisi.advanceShallow(target);
}
@Override
public float getMaxScore(int upTo) throws IOException {
return maxScoreCache.getMaxScore(upTo);
return impactsDisi.getMaxScore(upTo);
}
@Override
public void setMinCompetitiveScore(float minScore) {
impactsDisi.setMinCompetitiveScore(minScore);
}
};
}

View File

@ -16,41 +16,14 @@
*/
package org.apache.lucene.index;
import java.io.IOException;
/**
* Extension of {@link PostingsEnum} which also provides information about
* upcoming impacts.
* @lucene.experimental
*/
public abstract class ImpactsEnum extends PostingsEnum {
public abstract class ImpactsEnum extends PostingsEnum implements ImpactsSource {
/** Sole constructor. */
protected ImpactsEnum() {}
/**
* Shallow-advance to {@code target}. This is cheaper than calling
* {@link #advance(int)} and allows further calls to {@link #getImpacts()}
* to ignore doc IDs that are less than {@code target} in order to get more
* precise information about impacts.
* This method may not be called on targets that are less than the current
* {@link #docID()}.
* After this method has been called, {@link #nextDoc()} may not be called
* if the current doc ID is less than {@code target - 1} and
* {@link #advance(int)} may not be called on targets that are less than
* {@code target}.
*/
public abstract void advanceShallow(int target) throws IOException;
/**
* Get information about upcoming impacts for doc ids that are greater than
* or equal to the maximum of {@link #docID()} and the last target that was
* passed to {@link #advanceShallow(int)}.
* This method may not be called on an unpositioned iterator on which
* {@link #advanceShallow(int)} has never been called.
* NOTE: advancing this iterator may invalidate the returned impacts, so they
* should not be used after the iterator has been advanced.
*/
public abstract Impacts getImpacts() throws IOException;
}

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
/**
 * Source of {@link Impacts}: exposes block-level score impact metadata
 * (maximum term frequencies and norms per range of doc IDs) decoupled from
 * the postings iterator itself, so that impacts can be produced by a single
 * {@link ImpactsEnum} or merged across several of them.
 * @lucene.internal
 */
public interface ImpactsSource {
/**
* Shallow-advance to {@code target}. This is cheaper than calling
* {@link DocIdSetIterator#advance(int)} and allows further calls to
* {@link #getImpacts()} to ignore doc IDs that are less than {@code target}
* in order to get more precise information about impacts.
* This method may not be called on targets that are less than the current
* {@link DocIdSetIterator#docID()}.
* After this method has been called, {@link DocIdSetIterator#nextDoc()} may
* not be called if the current doc ID is less than {@code target - 1} and
* {@link DocIdSetIterator#advance(int)} may not be called on targets that
* are less than {@code target}.
*/
void advanceShallow(int target) throws IOException;
/**
* Get information about upcoming impacts for doc ids that are greater than
* or equal to the maximum of {@link DocIdSetIterator#docID()} and the last
* target that was passed to {@link #advanceShallow(int)}.
* This method may not be called on an unpositioned iterator on which
* {@link #advanceShallow(int)} has never been called.
* NOTE: advancing this iterator may invalidate the returned impacts, so they
* should not be used after the iterator has been advanced.
*/
Impacts getImpacts() throws IOException;
}

View File

@ -0,0 +1,149 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
/**
 * {@link DocIdSetIterator} wrapper that leverages indexed impacts in order to
 * jump over blocks of documents that cannot possibly be competitive. Skipping
 * only starts once {@link #setMinCompetitiveScore(float)} has been called with
 * a score greater than zero; until then this iterates exactly like the
 * wrapped iterator.
 * @lucene.internal
 */
public final class ImpactsDISI extends DocIdSetIterator {

  private final DocIdSetIterator delegate;      // the actual doc ID iterator
  private final ImpactsSource source;           // provider of block-level impacts
  private final MaxScoreCache scoreCache;       // caches per-level max scores
  private final float scoreCeiling;             // score upper bound over all docs
  private float requiredScore = 0;              // current minimum competitive score
  // Inclusive upper bound of the current block that is known competitive.
  // Starts at NO_MORE_DOCS so that no shallow-advancing happens before a
  // minimum competitive score has been set.
  private int blockUpTo = DocIdSetIterator.NO_MORE_DOCS;
  private float blockMaxScore = Float.MAX_VALUE;

  /**
   * Sole constructor.
   * @param in wrapped iterator
   * @param impactsSource source of impacts
   * @param scorer scorer
   */
  public ImpactsDISI(DocIdSetIterator in, ImpactsSource impactsSource, SimScorer scorer) {
    this.delegate = in;
    this.source = impactsSource;
    this.scoreCache = new MaxScoreCache(impactsSource, scorer);
    this.scoreCeiling = scorer.score(Float.MAX_VALUE, 1L);
  }

  /**
   * Set the minimum competitive score.
   * @see Scorer#setMinCompetitiveScore(float)
   */
  public void setMinCompetitiveScore(float minCompetitiveScore) {
    assert minCompetitiveScore >= requiredScore;
    if (minCompetitiveScore > requiredScore) {
      requiredScore = minCompetitiveScore;
      // Invalidate the cached block bounds so that advanceTarget recomputes
      // them: the current block may no longer be competitive now that the
      // bar has been raised. Only done when the score actually increased.
      blockUpTo = -1;
    }
  }

  /**
   * Implement the contract of {@link Scorer#advanceShallow(int)} based on the
   * wrapped {@link ImpactsEnum}.
   * @see Scorer#advanceShallow(int)
   */
  public int advanceShallow(int target) throws IOException {
    source.advanceShallow(target);
    // Level 0 boundary = last doc ID of the current block.
    return source.getImpacts().getDocIdUpTo(0);
  }

  /**
   * Implement the contract of {@link Scorer#getMaxScore(int)} based on the
   * wrapped {@link ImpactsEnum} and {@link Scorer}.
   * @see Scorer#getMaxScore(int)
   */
  public float getMaxScore(int upTo) throws IOException {
    final int level = scoreCache.getLevel(upTo);
    // No level covers upTo: fall back to the global upper bound.
    return level == -1 ? scoreCeiling : scoreCache.getMaxScoreForLevel(level);
  }

  /**
   * Move {@code target} forward past blocks whose maximum score is below the
   * minimum competitive score, returning the first potentially competitive
   * target (or NO_MORE_DOCS).
   */
  private int advanceTarget(int target) throws IOException {
    if (target <= blockUpTo) {
      // Still within the current block, which impacts deem competitive.
      return target;
    }
    for (;;) {
      blockUpTo = advanceShallow(target);
      blockMaxScore = scoreCache.getMaxScoreForLevel(0);
      assert blockUpTo >= target;
      if (blockMaxScore >= requiredScore) {
        return target;
      }
      if (blockUpTo == NO_MORE_DOCS) {
        return NO_MORE_DOCS;
      }
      final int skipUpTo = scoreCache.getSkipUpTo(requiredScore);
      if (skipUpTo == NO_MORE_DOCS) { // nothing left can be competitive
        return NO_MORE_DOCS;
      }
      // skipUpTo == -1 means no skipping beyond the current block.
      target = (skipUpTo == -1 ? blockUpTo : skipUpTo) + 1;
    }
  }

  @Override
  public int advance(int target) throws IOException {
    return delegate.advance(advanceTarget(target));
  }

  @Override
  public int nextDoc() throws IOException {
    return advance(delegate.docID() + 1);
  }

  @Override
  public int docID() {
    return delegate.docID();
  }

  @Override
  public long cost() {
    return delegate.cost();
  }
}

View File

@ -22,7 +22,7 @@ import java.util.List;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
@ -30,22 +30,21 @@ import org.apache.lucene.util.ArrayUtil;
* Compute maximum scores based on {@link Impacts} and keep them in a cache in
* order not to run expensive similarity score computations multiple times on
* the same data.
* @lucene.internal
*/
public final class MaxScoreCache {
final class MaxScoreCache {
private final ImpactsEnum impactsEnum;
private final ImpactsSource impactsSource;
private final SimScorer scorer;
private final float globalMaxScore;
private float[] maxScoreCache;
private int[] maxScoreCacheUpTo;
/**
* Sole constructor.
*/
public MaxScoreCache(ImpactsEnum impactsEnum, SimScorer scorer) {
this.impactsEnum = impactsEnum;
public MaxScoreCache(ImpactsSource impactsSource, SimScorer scorer) {
this.impactsSource = impactsSource;
this.scorer = scorer;
globalMaxScore = scorer.score(Integer.MAX_VALUE, 1L);
maxScoreCache = new float[0];
maxScoreCacheUpTo = new int[0];
}
@ -71,8 +70,8 @@ public final class MaxScoreCache {
* Return the first level that includes all doc IDs up to {@code upTo},
* or -1 if there is no such level.
*/
private int getLevel(int upTo) throws IOException {
final Impacts impacts = impactsEnum.getImpacts();
int getLevel(int upTo) throws IOException {
final Impacts impacts = impactsSource.getImpacts();
for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
final int impactsUpTo = impacts.getDocIdUpTo(level);
if (upTo <= impactsUpTo) {
@ -86,7 +85,7 @@ public final class MaxScoreCache {
* Return the maximum score for the given {@code level}.
*/
float getMaxScoreForLevel(int level) throws IOException {
final Impacts impacts = impactsEnum.getImpacts();
final Impacts impacts = impactsSource.getImpacts();
ensureCacheSize(level + 1);
final int levelUpTo = impacts.getDocIdUpTo(level);
if (maxScoreCacheUpTo[level] < levelUpTo) {
@ -100,8 +99,7 @@ public final class MaxScoreCache {
* Return the maximum level at which scores are all less than {@code minScore},
* or -1 if none.
*/
int getSkipLevel(float minScore) throws IOException {
final Impacts impacts = impactsEnum.getImpacts();
private int getSkipLevel(Impacts impacts, float minScore) throws IOException {
final int numLevels = impacts.numLevels();
for (int level = 0; level < numLevels; ++level) {
if (getMaxScoreForLevel(level) >= minScore) {
@ -112,27 +110,17 @@ public final class MaxScoreCache {
}
/**
* Implement the contract of {@link Scorer#advanceShallow(int)} based on the
* wrapped {@link ImpactsEnum}.
* @see Scorer#advanceShallow(int)
* Return the an inclusive upper bound of documents that all have a score that
* is less than {@code minScore}, or {@code -1} if the current document may
* be competitive.
*/
public int advanceShallow(int target) throws IOException {
impactsEnum.advanceShallow(target);
Impacts impacts = impactsEnum.getImpacts();
return impacts.getDocIdUpTo(0);
int getSkipUpTo(float minScore) throws IOException {
final Impacts impacts = impactsSource.getImpacts();
final int level = getSkipLevel(impacts, minScore);
if (level == -1) {
return -1;
}
return impacts.getDocIdUpTo(level);
}
/**
* Implement the contract of {@link Scorer#getMaxScore(int)} based on the
* wrapped {@link ImpactsEnum} and {@link Scorer}.
* @see Scorer#getMaxScore(int)
*/
public float getMaxScore(int upTo) throws IOException {
final int level = getLevel(upTo);
if (level == -1) {
return globalMaxScore;
} else {
return getMaxScoreForLevel(level);
}
}
}

View File

@ -21,12 +21,19 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
@ -34,6 +41,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/**
* A query that treats multiple terms as synonyms.
@ -112,7 +120,7 @@ public final class SynonymQuery extends Query {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
if (scoreMode.needsScores()) {
return new SynonymWeight(this, searcher, boost);
return new SynonymWeight(this, searcher, scoreMode, boost);
} else {
// if scores are not needed, let BooleanWeight deal with optimizing that case.
BooleanQuery.Builder bq = new BooleanQuery.Builder();
@ -127,9 +135,12 @@ public final class SynonymQuery extends Query {
private final TermStates termStates[];
private final Similarity similarity;
private final Similarity.SimScorer simWeight;
private final ScoreMode scoreMode;
SynonymWeight(Query query, IndexSearcher searcher, float boost) throws IOException {
SynonymWeight(Query query, IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
super(query);
assert scoreMode.needsScores();
this.scoreMode = scoreMode;
CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].field());
long docFreq = 0;
long totalTermFreq = 0;
@ -176,8 +187,7 @@ public final class SynonymQuery extends Query {
if (newDoc == doc) {
final float freq;
if (scorer instanceof SynonymScorer) {
SynonymScorer synScorer = (SynonymScorer) scorer;
freq = synScorer.tf(synScorer.getSubMatches());
freq = ((SynonymScorer) scorer).freq();
} else {
assert scorer instanceof TermScorer;
freq = ((TermScorer)scorer).freq();
@ -197,61 +207,277 @@ public final class SynonymQuery extends Query {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
// we use termscorers + disjunction as an impl detail
List<Scorer> subScorers = new ArrayList<>();
List<PostingsEnum> iterators = new ArrayList<>();
List<ImpactsEnum> impacts = new ArrayList<>();
for (int i = 0; i < terms.length; i++) {
TermState state = termStates[i].get(context);
if (state != null) {
TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator();
termsEnum.seekExact(terms[i].bytes(), state);
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].field(), true);
subScorers.add(new TermScorer(this, termsEnum, ScoreMode.COMPLETE, simScorer));
if (scoreMode == ScoreMode.TOP_SCORES) {
ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
iterators.add(impactsEnum);
impacts.add(impactsEnum);
} else {
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS);
iterators.add(postingsEnum);
impacts.add(new SlowImpactsEnum(postingsEnum));
}
}
}
if (subScorers.isEmpty()) {
if (iterators.isEmpty()) {
return null;
} else if (subScorers.size() == 1) {
// we must optimize this case (term not in segment), disjunctionscorer requires >= 2 subs
return subScorers.get(0);
} else {
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].field(), true);
return new SynonymScorer(simScorer, this, subScorers);
}
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), terms[0].field(), true);
// we must optimize this case (term not in segment), disjunctions require >= 2 subs
if (iterators.size() == 1) {
if (scoreMode == ScoreMode.TOP_SCORES) {
return new TermScorer(this, impacts.get(0), simScorer);
} else {
return new TermScorer(this, iterators.get(0), simScorer);
}
}
// we use termscorers + disjunction as an impl detail
DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size());
for (PostingsEnum postings : iterators) {
queue.add(new DisiWrapper(new TermScorer(this, postings, simScorer)));
}
// Even though it is called approximation, it is accurate since none of
// the sub iterators are two-phase iterators.
DocIdSetIterator iterator = new DisjunctionDISIApproximation(queue);
ImpactsSource impactsSource = mergeImpacts(impacts.toArray(new ImpactsEnum[0]));
ImpactsDISI impactsDisi = new ImpactsDISI(iterator, impactsSource, simScorer.getSimScorer());
if (scoreMode == ScoreMode.TOP_SCORES) {
iterator = impactsDisi;
}
return new SynonymScorer(this, queue, iterator, impactsDisi, simScorer);
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;
}
}
static class SynonymScorer extends DisjunctionScorer {
private final LeafSimScorer similarity;
SynonymScorer(LeafSimScorer similarity, Weight weight, List<Scorer> subScorers) {
super(weight, subScorers, true);
this.similarity = similarity;
/**
 * Merge impacts for multiple synonyms into a single {@link ImpactsSource}.
 * The merged view uses the enum with the lowest first block boundary as the
 * "lead" (it defines levels and boundaries) and, per norm value, sums the
 * term frequencies of all synonyms to obtain a frequency upper bound.
 */
static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) {
return new ImpactsSource() {
// Cursor over one synonym's impact list. previousFreq remembers the freq
// that was last consumed so the merge loop can add freq deltas rather than
// absolute freqs (see the "Idea" comment below).
class SubIterator {
final Iterator<Impact> iterator;
int previousFreq;
Impact current;
SubIterator(Iterator<Impact> iterator) {
this.iterator = iterator;
this.current = iterator.next();
}
// Advance to the next impact; current becomes null when exhausted.
void next() {
previousFreq = current.freq;
if (iterator.hasNext() == false) {
current = null;
} else {
current = iterator.next();
}
}
}
@Override
public Impacts getImpacts() throws IOException {
final Impacts[] impacts = new Impacts[impactsEnums.length];
// Use the impacts that have the lower next boundary as a lead.
// It will decide on the number of levels and the block boundaries.
Impacts tmpLead = null;
for (int i = 0; i < impactsEnums.length; ++i) {
impacts[i] = impactsEnums[i].getImpacts();
if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) {
tmpLead = impacts[i];
}
}
final Impacts lead = tmpLead;
return new Impacts() {
@Override
public int numLevels() {
// Delegate to the lead
return lead.numLevels();
}
@Override
public int getDocIdUpTo(int level) {
// Delegate to the lead
return lead.getDocIdUpTo(level);
}
/**
* Return the minimum level whose impacts are valid up to {@code docIdUpTo},
* or {@code -1} if there is no such level.
*/
private int getLevel(Impacts impacts, int docIdUpTo) {
for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
return level;
}
}
return -1;
}
@Override
public List<Impact> getImpacts(int level) {
final int docIdUpTo = getDocIdUpTo(level);
List<List<Impact>> toMerge = new ArrayList<>();
for (int i = 0; i < impactsEnums.length; ++i) {
// Enums that are already positioned past docIdUpTo cannot contribute
// matches in this range, so they are excluded from the merge.
if (impactsEnums[i].docID() <= docIdUpTo) {
int impactsLevel = getLevel(impacts[i], docIdUpTo);
if (impactsLevel == -1) {
// One instance doesn't have impacts that cover up to docIdUpTo
// Return impacts that trigger the maximum score
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
}
toMerge.add(impacts[i].getImpacts(impactsLevel));
}
}
assert toMerge.size() > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong
if (toMerge.size() == 1) {
// common if one synonym is common and the other one is rare
return toMerge.get(0);
}
// Min-heap ordered by unsigned norm; exhausted sub iterators sort last.
PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) {
@Override
protected boolean lessThan(SubIterator a, SubIterator b) {
if (a.current == null) { // means iteration is finished
return false;
}
if (b.current == null) {
return true;
}
return Long.compareUnsigned(a.current.norm, b.current.norm) < 0;
}
};
for (List<Impact> impacts : toMerge) {
pq.add(new SubIterator(impacts.iterator()));
}
List<Impact> mergedImpacts = new ArrayList<>();
// Idea: merge impacts by norm. The tricky thing is that we need to
// consider norm values that are not in the impacts too. For
// instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
// there might well be a document that has a freq of 2 and a length of 11,
// which was just not added to the list of impacts because {freq=2,norm=10}
// is more competitive. So the way it works is that we track the sum of
// the term freqs that we have seen so far in order to account for these
// implicit impacts.
long sumTf = 0;
SubIterator top = pq.top();
do {
final long norm = top.current.norm;
do {
// Add only the freq increase over this sub iterator's previous impact.
sumTf += top.current.freq - top.previousFreq;
top.next();
top = pq.updateTop();
} while (top.current != null && top.current.norm == norm);
// sumTf is a long to avoid overflow while accumulating; saturate to int.
final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
if (mergedImpacts.isEmpty()) {
mergedImpacts.add(new Impact(freqUpperBound, norm));
} else {
Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1);
assert Long.compareUnsigned(prevImpact.norm, norm) < 0;
if (freqUpperBound > prevImpact.freq) {
mergedImpacts.add(new Impact(freqUpperBound, norm));
} // otherwise the previous impact is already more competitive
}
} while (top.current != null);
return mergedImpacts;
}
};
}
@Override
public void advanceShallow(int target) throws IOException {
for (ImpactsEnum impactsEnum : impactsEnums) {
// Only advance enums that are behind target: the ImpactsSource contract
// forbids shallow-advancing to targets before the current docID.
if (impactsEnum.docID() < target) {
impactsEnum.advanceShallow(target);
}
}
}
};
}
private static class SynonymScorer extends Scorer {
private final DisiPriorityQueue queue;
private final DocIdSetIterator iterator;
private final ImpactsDISI impactsDisi;
private final LeafSimScorer simScorer;
SynonymScorer(Weight weight, DisiPriorityQueue queue, DocIdSetIterator iterator,
ImpactsDISI impactsDisi, LeafSimScorer simScorer) {
super(weight);
this.queue = queue;
this.iterator = iterator;
this.impactsDisi = impactsDisi;
this.simScorer = simScorer;
}
@Override
protected float score(DisiWrapper topList) throws IOException {
return similarity.score(topList.doc, tf(topList));
public int docID() {
return iterator.docID();
}
int freq() throws IOException {
DisiWrapper w = queue.topList();
int freq = ((PostingsEnum) w.iterator).freq();
for (w = w.next; w != null; w = w.next) {
freq += ((PostingsEnum) w.iterator).freq();
if (freq < 0) { // overflow
return Integer.MAX_VALUE;
}
}
return freq;
}
@Override
public float score() throws IOException {
return simScorer.score(iterator.docID(), freq());
}
@Override
public DocIdSetIterator iterator() {
return iterator;
}
@Override
public float getMaxScore(int upTo) throws IOException {
// TODO: merge impacts to get better score upper bounds
return similarity.getSimScorer().score(Float.MAX_VALUE, 1L);
return impactsDisi.getMaxScore(upTo);
}
/** combines TF of all subs. */
final int tf(DisiWrapper topList) throws IOException {
int tf = 0;
for (DisiWrapper w = topList; w != null; w = w.next) {
tf += ((TermScorer)w.scorer).freq();
}
return tf;
@Override
public int advanceShallow(int target) throws IOException {
return impactsDisi.advanceShallow(target);
}
@Override
public void setMinCompetitiveScore(float minScore) {
impactsDisi.setMinCompetitiveScore(minScore);
}
}
}

View File

@ -111,7 +111,11 @@ public class TermQuery extends Query {
return null;
}
LeafSimScorer scorer = new LeafSimScorer(simScorer, context.reader(), term.field(), scoreMode.needsScores());
return new TermScorer(this, termsEnum, scoreMode, scorer);
if (scoreMode == ScoreMode.TOP_SCORES) {
return new TermScorer(this, termsEnum.impacts(PostingsEnum.FREQS), scorer);
} else {
return new TermScorer(this, termsEnum.postings(null, PostingsEnum.FREQS), scorer);
}
}
@Override

View File

@ -19,11 +19,9 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.TermsEnum;
/** Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
*/
@ -32,101 +30,29 @@ final class TermScorer extends Scorer {
private final ImpactsEnum impactsEnum;
private final DocIdSetIterator iterator;
private final LeafSimScorer docScorer;
private final MaxScoreCache maxScoreCache;
private float minCompetitiveScore;
private final ImpactsDISI impactsDisi;
/**
* Construct a <code>TermScorer</code>.
*
* @param weight
* The weight of the <code>Term</code> in the query.
* @param te
* A {@link TermsEnum} positioned on the expected term.
* @param docScorer
* A {@link LeafSimScorer} for the appropriate field.
* Construct a {@link TermScorer} that will iterate all documents.
*/
TermScorer(Weight weight, TermsEnum te, ScoreMode scoreMode, LeafSimScorer docScorer) throws IOException {
TermScorer(Weight weight, PostingsEnum postingsEnum, LeafSimScorer docScorer) {
super(weight);
iterator = this.postingsEnum = postingsEnum;
impactsEnum = new SlowImpactsEnum(postingsEnum);
impactsDisi = new ImpactsDISI(impactsEnum, impactsEnum, docScorer.getSimScorer());
this.docScorer = docScorer;
if (scoreMode == ScoreMode.TOP_SCORES) {
impactsEnum = te.impacts(PostingsEnum.FREQS);
maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer());
postingsEnum = impactsEnum;
iterator = new DocIdSetIterator() {
}
int upTo = -1;
float maxScore;
private int advanceTarget(int target) throws IOException {
if (minCompetitiveScore == 0) {
// no potential for skipping
return target;
}
if (target > upTo) {
impactsEnum.advanceShallow(target);
Impacts impacts = impactsEnum.getImpacts();
upTo = impacts.getDocIdUpTo(0);
maxScore = maxScoreCache.getMaxScoreForLevel(0);
}
while (true) {
assert upTo >= target;
if (maxScore >= minCompetitiveScore) {
return target;
}
if (upTo == NO_MORE_DOCS) {
return NO_MORE_DOCS;
}
impactsEnum.advanceShallow(upTo + 1);
Impacts impacts = impactsEnum.getImpacts();
final int level = maxScoreCache.getSkipLevel(minCompetitiveScore);
if (level >= 0) {
// we can skip more docs
int newUpTo = impacts.getDocIdUpTo(level);
if (newUpTo == NO_MORE_DOCS) {
return NO_MORE_DOCS;
}
target = newUpTo + 1;
impactsEnum.advanceShallow(target);
impacts = impactsEnum.getImpacts();
} else {
target = upTo + 1;
}
upTo = impacts.getDocIdUpTo(0);
maxScore = maxScoreCache.getMaxScoreForLevel(0);
}
}
@Override
public int advance(int target) throws IOException {
return impactsEnum.advance(advanceTarget(target));
}
@Override
public int nextDoc() throws IOException {
return advance(impactsEnum.docID() + 1);
}
@Override
public int docID() {
return impactsEnum.docID();
}
@Override
public long cost() {
return impactsEnum.cost();
}
};
} else {
postingsEnum = te.postings(null, scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE);
impactsEnum = new SlowImpactsEnum(postingsEnum);
maxScoreCache = new MaxScoreCache(impactsEnum, docScorer.getSimScorer());
iterator = postingsEnum;
}
/**
* Construct a {@link TermScorer} that will use impacts to skip blocks of
* non-competitive documents.
*/
TermScorer(Weight weight, ImpactsEnum impactsEnum, LeafSimScorer docScorer) {
super(weight);
postingsEnum = this.impactsEnum = impactsEnum;
impactsDisi = new ImpactsDISI(impactsEnum, impactsEnum, docScorer.getSimScorer());
iterator = impactsDisi;
this.docScorer = docScorer;
}
@Override
@ -151,17 +77,17 @@ final class TermScorer extends Scorer {
@Override
public int advanceShallow(int target) throws IOException {
return maxScoreCache.advanceShallow(target);
return impactsDisi.advanceShallow(target);
}
@Override
public float getMaxScore(int upTo) throws IOException {
return maxScoreCache.getMaxScore(upTo);
return impactsDisi.getMaxScore(upTo);
}
@Override
public void setMinCompetitiveScore(float minScore) {
this.minCompetitiveScore = minScore;
impactsDisi.setMinCompetitiveScore(minScore);
}
/** Returns a string representation of this <code>TermScorer</code>. */

View File

@ -18,27 +18,39 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestSynonymQuery extends LuceneTestCase {
public void testEquals() {
QueryUtils.checkEqual(new SynonymQuery(), new SynonymQuery());
QueryUtils.checkEqual(new SynonymQuery(new Term("foo", "bar")),
QueryUtils.checkEqual(new SynonymQuery(new Term("foo", "bar")),
new SynonymQuery(new Term("foo", "bar")));
QueryUtils.checkEqual(new SynonymQuery(new Term("a", "a"), new Term("a", "b")),
QueryUtils.checkEqual(new SynonymQuery(new Term("a", "a"), new Term("a", "b")),
new SynonymQuery(new Term("a", "b"), new Term("a", "a")));
}
public void testBogusParams() {
expectThrows(IllegalArgumentException.class, () -> {
new SynonymQuery(new Term("field1", "a"), new Term("field2", "b"));
@ -54,6 +66,11 @@ public class TestSynonymQuery extends LuceneTestCase {
}
public void testScores() throws IOException {
doTestScores(false);
doTestScores(true);
}
private void doTestScores(boolean trackTotalHits) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
@ -71,8 +88,12 @@ public class TestSynonymQuery extends LuceneTestCase {
IndexSearcher searcher = newSearcher(reader);
SynonymQuery query = new SynonymQuery(new Term("f", "a"), new Term("f", "b"));
TopDocs topDocs = searcher.search(query, 20);
assertEquals(11, topDocs.totalHits);
TopScoreDocCollector collector = TopScoreDocCollector.create(20, null, trackTotalHits);
searcher.search(query, collector);
TopDocs topDocs = collector.topDocs();
if (trackTotalHits) {
assertEquals(11, topDocs.totalHits);
}
// All docs must have the same score
for (int i = 0; i < topDocs.scoreDocs.length; ++i) {
assertEquals(topDocs.scoreDocs[0].score, topDocs.scoreDocs[i].score, 0.0f);
@@ -83,4 +104,206 @@
dir.close();
}
// Unit test for SynonymQuery.mergeImpacts: merged impacts must be upper bounds
// of the union of the per-term impacts, so that scores computed from them are
// never lower than the true maximum score.
public void testMergeImpacts() throws IOException {
// First term: positioned on doc 42, two levels of impacts.
DummyImpactsEnum impacts1 = new DummyImpactsEnum();
impacts1.reset(42,
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
});
// Second term: positioned on doc 45, overlapping doc-id range with impacts1.
DummyImpactsEnum impacts2 = new DummyImpactsEnum();
impacts2.reset(45,
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 13) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
90,
1000
});
ImpactsSource mergedImpacts = SynonymQuery.mergeImpacts(new ImpactsEnum[] { impacts1, impacts2 });
// Level 0 combines both terms' competitive impacts (note freqs add up, e.g.
// 3+2=5, 5+2=7, 8+6=14); its docIdUpTo (90) is presumably the minimum of the
// two inputs. Level 1 collapses to the trivial "no information" upper bound
// Impact(MAX_VALUE, 1) — TODO confirm why no tighter bound is computed here.
assertEquals(
new Impact[][] {
new Impact[] { new Impact(5, 10), new Impact(7, 12), new Impact(14, 13) },
new Impact[] { new Impact(Integer.MAX_VALUE, 1) }
},
new int[] {
90,
1000
},
mergedImpacts.getImpacts());
// docID is > the first docIdUpTo of impacts1
impacts2.reset(112,
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 13) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
150,
1000
});
// Since impacts2 is beyond impacts1's first level, level 0 is taken from
// impacts1 unchanged, and only the second level merges both terms.
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) }, // same as impacts1
new Impact[] { new Impact(3, 9), new Impact(10, 11), new Impact(15, 13), new Impact(19, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
}
/**
 * Asserts that {@code actual} exposes exactly the given impacts and doc-id
 * upper bounds, level by level.
 */
private static void assertEquals(Impact[][] impacts, int[] docIdUpTo, Impacts actual) {
  final int numLevels = impacts.length;
  assertEquals(numLevels, actual.numLevels());
  for (int level = 0; level < numLevels; ++level) {
    assertEquals(docIdUpTo[level], actual.getDocIdUpTo(level));
    assertEquals(Arrays.asList(impacts[level]), actual.getImpacts(level));
  }
}
/**
 * Minimal {@link ImpactsEnum} stub for the merging tests: it only reports a
 * fixed current doc ID and canned per-level impacts; every navigation and
 * postings accessor is unsupported.
 */
private static class DummyImpactsEnum extends ImpactsEnum {

  private int doc;
  private Impact[][] perLevelImpacts;
  private int[] perLevelDocIdUpTo;

  /** Positions this enum on {@code docID} and installs the given impacts. */
  void reset(int docID, Impact[][] impacts, int[] docIdUpTo) {
    this.doc = docID;
    this.perLevelImpacts = impacts;
    this.perLevelDocIdUpTo = docIdUpTo;
  }

  @Override
  public void advanceShallow(int target) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public Impacts getImpacts() throws IOException {
    // View over the arrays installed by the last reset() call.
    return new Impacts() {

      @Override
      public int numLevels() {
        return perLevelImpacts.length;
      }

      @Override
      public int getDocIdUpTo(int level) {
        return perLevelDocIdUpTo[level];
      }

      @Override
      public List<Impact> getImpacts(int level) {
        return Arrays.asList(perLevelImpacts[level]);
      }
    };
  }

  // Postings accessors — never called by the merging tests.

  @Override
  public int freq() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int nextPosition() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int startOffset() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int endOffset() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public BytesRef getPayload() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int docID() {
    return doc;
  }

  @Override
  public int nextDoc() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int advance(int target) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public long cost() {
    throw new UnsupportedOperationException();
  }
}
/**
 * Indexes random documents and checks that a SynonymQuery produces the same
 * top hits whether the collector counts all hits (COMPLETE) or allows dynamic
 * pruning via impacts (TOP_SCORES), both alone and intersected with a filter.
 */
public void testRandomTopDocs() throws IOException {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
  int numDocs = atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data
  for (int i = 0; i < numDocs; ++i) {
    Document doc = new Document();
    int numValues = random().nextInt(1 << random().nextInt(5));
    int start = random().nextInt(10);
    for (int j = 0; j < numValues; ++j) {
      // Index each value with a random frequency so impacts vary across docs.
      int freq = TestUtil.nextInt(random(), 1, 1 << random().nextInt(3));
      for (int k = 0; k < freq; ++k) {
        doc.add(new TextField("foo", Integer.toString(start + j), Store.NO));
      }
    }
    w.addDocument(doc);
  }
  IndexReader reader = DirectoryReader.open(w);
  w.close();
  IndexSearcher searcher = newSearcher(reader);
  for (int term1 = 0; term1 < 15; ++term1) {
    // Pick a second, distinct synonym term.
    int term2;
    do {
      term2 = random().nextInt(15);
    } while (term1 == term2);
    Query query = new SynonymQuery(
        new Term[] {
            new Term("foo", Integer.toString(term1)),
            new Term("foo", Integer.toString(term2))});
    TopScoreDocCollector collector1 = TopScoreDocCollector.create(10, null, true); // COMPLETE
    TopScoreDocCollector collector2 = TopScoreDocCollector.create(10, null, false); // TOP_SCORES
    searcher.search(query, collector1);
    searcher.search(query, collector2);
    CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
    int filterTerm = random().nextInt(15);
    Query filteredQuery = new BooleanQuery.Builder()
        .add(query, Occur.MUST)
        .add(new TermQuery(new Term("foo", Integer.toString(filterTerm))), Occur.FILTER)
        .build();
    collector1 = TopScoreDocCollector.create(10, null, true); // COMPLETE
    collector2 = TopScoreDocCollector.create(10, null, false); // TOP_SCORES
    searcher.search(filteredQuery, collector1);
    searcher.search(filteredQuery, collector2);
    // Fix: report the filtered query on failure, not the unfiltered one —
    // these hits were collected for filteredQuery.
    CheckHits.checkEqual(filteredQuery, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
  }
  reader.close();
  dir.close();
}
}

View File

@@ -24,7 +24,7 @@ import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
@@ -39,6 +39,7 @@ import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestTermScorer extends LuceneTestCase {
protected Directory directory;
@@ -218,7 +219,10 @@ public class TestTermScorer extends LuceneTestCase {
int numValues = random().nextInt(1 << random().nextInt(5));
int start = random().nextInt(10);
for (int j = 0; j < numValues; ++j) {
doc.add(new StringField("foo", Integer.toString(start + j), Store.NO));
int freq = TestUtil.nextInt(random(), 1, 1 << random().nextInt(3));
for (int k = 0; k < freq; ++k) {
doc.add(new TextField("foo", Integer.toString(start + j), Store.NO));
}
}
w.addDocument(doc);
}
@@ -234,7 +238,7 @@
searcher.search(query, collector1);
searcher.search(query, collector2);
assertTopDocsEquals(collector1.topDocs(), collector2.topDocs());
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
int filterTerm = random().nextInt(15);
Query filteredQuery = new BooleanQuery.Builder()
@@ -246,19 +250,10 @@
collector2 = TopScoreDocCollector.create(10, null, false); // TOP_SCORES
searcher.search(filteredQuery, collector1);
searcher.search(filteredQuery, collector2);
assertTopDocsEquals(collector1.topDocs(), collector2.topDocs());
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
}
reader.close();
dir.close();
}
private static void assertTopDocsEquals(TopDocs td1, TopDocs td2) {
assertEquals(td1.scoreDocs.length, td2.scoreDocs.length);
for (int i = 0; i < td1.scoreDocs.length; ++i) {
ScoreDoc sd1 = td1.scoreDocs[i];
ScoreDoc sd2 = td2.scoreDocs[i];
assertEquals(sd1.doc, sd2.doc);
assertEquals(sd1.score, sd2.score, 0f);
}
}
}