Add #scorerSupplier support to DocValuesRewriteMethod along with singleton doc value optimization (#1020)

This commit is contained in:
Greg Miller 2022-07-28 11:12:21 -07:00 committed by GitHub
parent bb752c774c
commit 4ebc249dbc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 94 additions and 40 deletions

View File

@ -100,6 +100,8 @@ Optimizations
* LUCENE-10661: Reduce memory copy in BytesStore. (luyuncheng)
* GITHUB#1020: Support #scorerSupplier and small optimizations to DocValuesRewriteMethod. (Greg Miller)
Bug Fixes
---------------------
* LUCENE-10663: Fix KnnVectorQuery explain with multiple segments. (Shiming Li)

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -83,21 +84,25 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
@Override
public Matches matches(LeafReaderContext context, int doc) throws IOException {
final SortedSetDocValues fcsi = DocValues.getSortedSet(context.reader(), query.field);
final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), query.field);
return MatchesUtils.forField(
query.field,
() ->
DisjunctionMatchesIterator.fromTermsEnum(
context, doc, query, query.field, getTermsEnum(fcsi)));
context, doc, query, query.field, getTermsEnum(values)));
}
private TermsEnum getTermsEnum(SortedSetDocValues fcsi) throws IOException {
/**
* Create a TermsEnum that provides the intersection of the query terms with the terms
* present in the doc values.
*/
private TermsEnum getTermsEnum(SortedSetDocValues values) throws IOException {
return query.getTermsEnum(
new Terms() {
@Override
public TermsEnum iterator() throws IOException {
return fcsi.termsEnum();
return values.termsEnum();
}
@Override
@ -142,46 +147,93 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
});
}
/**
 * Builds a lazily-evaluated scorer for the given segment.
 *
 * <p>Returns {@code null} when the field's doc values report a value count of zero, since no
 * document can match in that case. Otherwise the returned supplier defers the expensive work
 * (intersecting the query terms with the doc values and filling the ordinal bit set) until
 * {@code get} is called, while {@code cost} is answered from the doc values alone.
 */
@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), query.field);
if (values.getValueCount() == 0) {
return null; // no values/docs so nothing can match
}
// Capture the enclosing Weight: inside the anonymous ScorerSupplier below, `this` refers to
// the supplier rather than the Weight.
final Weight weight = this;
return new ScorerSupplier() {
@Override
public Scorer get(long leadCost) throws IOException {
// Create a TermsEnum that will provide the intersection of the terms specified in the
// query with the values present in the doc values:
TermsEnum termsEnum = getTermsEnum(values);
assert termsEnum != null;
if (termsEnum.next() == null) {
// no matching terms
return new ConstantScoreScorer(
weight, score(), scoreMode, DocIdSetIterator.empty());
}
// Create a bit set for the "term set" ordinals (these are the terms provided by the
// query that are actually present in the doc values field). Cannot use FixedBitSet
// because we require long index (ord):
final LongBitSet termSet = new LongBitSet(values.getValueCount());
// The first term was already consumed by the next() call above, hence do/while.
do {
long ord = termsEnum.ord();
// Only non-negative ordinals are recorded.
// NOTE(review): presumably a negative ord marks a term without a doc values ordinal - confirm.
if (ord >= 0) {
termSet.set(ord);
}
} while (termsEnum.next() != null);
// If the sorted-set view unwraps to a single-valued SortedDocValues, use it directly so each
// candidate doc needs just one ordinal lookup instead of a loop over its ordinals.
final SortedDocValues singleton = DocValues.unwrapSingleton(values);
final TwoPhaseIterator iterator;
if (singleton != null) {
iterator =
new TwoPhaseIterator(singleton) {
@Override
public boolean matches() throws IOException {
// The doc matches iff its one ordinal is in the term set.
return termSet.get(singleton.ordValue());
}
@Override
public float matchCost() {
return 3; // lookup in a bitset
}
};
} else {
iterator =
new TwoPhaseIterator(values) {
@Override
public boolean matches() throws IOException {
// Walk the current doc's ordinals and stop at the first one found in the term set.
for (int i = 0; i < values.docValueCount(); i++) {
if (termSet.get(values.nextOrd())) {
return true;
}
}
return false;
}
@Override
public float matchCost() {
return 3; // lookup in a bitset
}
};
}
return new ConstantScoreScorer(weight, score(), scoreMode, iterator);
}
@Override
public long cost() {
// We have no prior knowledge of how many docs might match for any given query term,
// so we assume that all docs with a value could be a match:
return values.cost();
}
};
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final SortedSetDocValues fcsi = DocValues.getSortedSet(context.reader(), query.field);
TermsEnum termsEnum = getTermsEnum(fcsi);
assert termsEnum != null;
if (termsEnum.next() == null) {
// no matching terms
final ScorerSupplier scorerSupplier = scorerSupplier(context);
if (scorerSupplier == null) {
return null;
}
// fill into a bitset
// Cannot use FixedBitSet because we require long index (ord):
final LongBitSet termSet = new LongBitSet(fcsi.getValueCount());
do {
long ord = termsEnum.ord();
if (ord >= 0) {
termSet.set(ord);
}
} while (termsEnum.next() != null);
return new ConstantScoreScorer(
this,
score(),
scoreMode,
new TwoPhaseIterator(fcsi) {
@Override
public boolean matches() throws IOException {
for (int i = 0; i < fcsi.docValueCount(); i++) {
if (termSet.get(fcsi.nextOrd())) {
return true;
}
}
return false;
}
@Override
public float matchCost() {
return 3; // lookup in a bitset
}
});
return scorerSupplier.get(Long.MAX_VALUE);
}
@Override