mirror of https://github.com/apache/lucene.git
LUCENE-9620 Add Weight#count(LeafReaderContext) (#242)
Add a default implementation in Weight.java and add sample faster implementations in MatchAllDocsQuery, MatchNoDocsQuery, TermQuery Add tests for BooleanQuery and TermQuery Co-authored-by: Gautam Worah <gauworah@amazon.com> Co-authored-by: Adrien Grand <jpountz@gmail.com>
This commit is contained in:
parent
059d06cec7
commit
44e9f5de53
|
@ -143,6 +143,10 @@ API Changes
|
||||||
* LUCENE-10027: Directory reader open API from indexCommit and leafSorter has been modified
|
* LUCENE-10027: Directory reader open API from indexCommit and leafSorter has been modified
|
||||||
to add an extra parameter - minSupportedMajorVersion. (Mayya Sharipova)
|
to add an extra parameter - minSupportedMajorVersion. (Mayya Sharipova)
|
||||||
|
|
||||||
|
* LUCENE-9620: Added a (sometimes) faster implementation for IndexSearcher#count that relies on the new Weight#count API.
|
||||||
|
The Weight#count API represents a cleaner way for Query classes to optimize their counting method.
|
||||||
|
(Gautam Worah, Adrien Grand)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)
|
* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)
|
||||||
|
|
|
@ -168,6 +168,11 @@ public final class ConstantScoreQuery extends Query {
|
||||||
public boolean isCacheable(LeafReaderContext ctx) {
|
public boolean isCacheable(LeafReaderContext ctx) {
|
||||||
return innerWeight.isCacheable(ctx);
|
return innerWeight.isCacheable(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int count(LeafReaderContext context) throws IOException {
|
||||||
|
return innerWeight.count(context);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
return innerWeight;
|
return innerWeight;
|
||||||
|
|
|
@ -402,49 +402,57 @@ public class IndexSearcher {
|
||||||
return similarity;
|
return similarity;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class ShortcutHitCountCollector implements Collector {
|
||||||
|
private final Weight weight;
|
||||||
|
private final TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
|
||||||
|
private int weightCount;
|
||||||
|
|
||||||
|
ShortcutHitCountCollector(Weight weight) {
|
||||||
|
this.weight = weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
|
||||||
|
int count = weight.count(context);
|
||||||
|
// check if the number of hits can be computed in constant time
|
||||||
|
if (count == -1) {
|
||||||
|
// use a TotalHitCountCollector to calculate the number of hits in the usual way
|
||||||
|
return totalHitCountCollector.getLeafCollector(context);
|
||||||
|
} else {
|
||||||
|
weightCount += count;
|
||||||
|
throw new CollectionTerminatedException();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ScoreMode scoreMode() {
|
||||||
|
return ScoreMode.COMPLETE_NO_SCORES;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Count how many documents match the given query. */
|
/** Count how many documents match the given query. */
|
||||||
public int count(Query query) throws IOException {
|
public int count(Query query) throws IOException {
|
||||||
query = rewrite(query);
|
query = rewrite(query);
|
||||||
while (true) {
|
final Weight weight = createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||||
// remove wrappers that don't matter for counts
|
|
||||||
if (query instanceof ConstantScoreQuery) {
|
|
||||||
query = ((ConstantScoreQuery) query).getQuery();
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// some counts can be computed in constant time
|
|
||||||
if (query instanceof MatchAllDocsQuery) {
|
|
||||||
return reader.numDocs();
|
|
||||||
} else if (query instanceof TermQuery && reader.hasDeletions() == false) {
|
|
||||||
Term term = ((TermQuery) query).getTerm();
|
|
||||||
int count = 0;
|
|
||||||
for (LeafReaderContext leaf : reader.leaves()) {
|
|
||||||
count += leaf.reader().docFreq(term);
|
|
||||||
}
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
// general case: create a collector and count matches
|
|
||||||
final CollectorManager<TotalHitCountCollector, Integer> collectorManager =
|
|
||||||
new CollectorManager<TotalHitCountCollector, Integer>() {
|
|
||||||
|
|
||||||
|
final CollectorManager<ShortcutHitCountCollector, Integer> shortcutCollectorManager =
|
||||||
|
new CollectorManager<ShortcutHitCountCollector, Integer>() {
|
||||||
@Override
|
@Override
|
||||||
public TotalHitCountCollector newCollector() throws IOException {
|
public ShortcutHitCountCollector newCollector() throws IOException {
|
||||||
return new TotalHitCountCollector();
|
return new ShortcutHitCountCollector(weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Integer reduce(Collection<TotalHitCountCollector> collectors) throws IOException {
|
public Integer reduce(Collection<ShortcutHitCountCollector> collectors)
|
||||||
int total = 0;
|
throws IOException {
|
||||||
for (TotalHitCountCollector collector : collectors) {
|
int totalHitCount = 0;
|
||||||
total += collector.getTotalHits();
|
for (ShortcutHitCountCollector c : collectors) {
|
||||||
|
totalHitCount += c.weightCount + c.totalHitCountCollector.getTotalHits();
|
||||||
}
|
}
|
||||||
return total;
|
return totalHitCount;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
return search(query, collectorManager);
|
return search(weight, shortcutCollectorManager, new ShortcutHitCountCollector(weight));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -659,29 +667,29 @@ public class IndexSearcher {
|
||||||
*/
|
*/
|
||||||
public <C extends Collector, T> T search(Query query, CollectorManager<C, T> collectorManager)
|
public <C extends Collector, T> T search(Query query, CollectorManager<C, T> collectorManager)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
final C firstCollector = collectorManager.newCollector();
|
||||||
|
query = rewrite(query);
|
||||||
|
final Weight weight = createWeight(query, firstCollector.scoreMode(), 1);
|
||||||
|
return search(weight, collectorManager, firstCollector);
|
||||||
|
}
|
||||||
|
|
||||||
|
private <C extends Collector, T> T search(
|
||||||
|
Weight weight, CollectorManager<C, T> collectorManager, C firstCollector) throws IOException {
|
||||||
if (executor == null || leafSlices.length <= 1) {
|
if (executor == null || leafSlices.length <= 1) {
|
||||||
final C collector = collectorManager.newCollector();
|
search(leafContexts, weight, firstCollector);
|
||||||
search(query, collector);
|
return collectorManager.reduce(Collections.singletonList(firstCollector));
|
||||||
return collectorManager.reduce(Collections.singletonList(collector));
|
|
||||||
} else {
|
} else {
|
||||||
final List<C> collectors = new ArrayList<>(leafSlices.length);
|
final List<C> collectors = new ArrayList<>(leafSlices.length);
|
||||||
ScoreMode scoreMode = null;
|
collectors.add(firstCollector);
|
||||||
for (int i = 0; i < leafSlices.length; ++i) {
|
final ScoreMode scoreMode = firstCollector.scoreMode();
|
||||||
|
for (int i = 1; i < leafSlices.length; ++i) {
|
||||||
final C collector = collectorManager.newCollector();
|
final C collector = collectorManager.newCollector();
|
||||||
collectors.add(collector);
|
collectors.add(collector);
|
||||||
if (scoreMode == null) {
|
if (scoreMode != collector.scoreMode()) {
|
||||||
scoreMode = collector.scoreMode();
|
|
||||||
} else if (scoreMode != collector.scoreMode()) {
|
|
||||||
throw new IllegalStateException(
|
throw new IllegalStateException(
|
||||||
"CollectorManager does not always produce collectors with the same score mode");
|
"CollectorManager does not always produce collectors with the same score mode");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (scoreMode == null) {
|
|
||||||
// no segments
|
|
||||||
scoreMode = ScoreMode.COMPLETE;
|
|
||||||
}
|
|
||||||
query = rewrite(query);
|
|
||||||
final Weight weight = createWeight(query, scoreMode, 1);
|
|
||||||
final List<FutureTask<C>> listTasks = new ArrayList<>();
|
final List<FutureTask<C>> listTasks = new ArrayList<>();
|
||||||
for (int i = 0; i < leafSlices.length; ++i) {
|
for (int i = 0; i < leafSlices.length; ++i) {
|
||||||
final LeafReaderContext[] leaves = leafSlices[i].leaves;
|
final LeafReaderContext[] leaves = leafSlices[i].leaves;
|
||||||
|
|
|
@ -72,6 +72,11 @@ public final class MatchAllDocsQuery extends Query {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int count(LeafReaderContext context) {
|
||||||
|
return context.reader().numDocs();
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -52,6 +52,11 @@ public class MatchNoDocsQuery extends Query {
|
||||||
public boolean isCacheable(LeafReaderContext ctx) {
|
public boolean isCacheable(LeafReaderContext ctx) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int count(LeafReaderContext context) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -179,6 +179,22 @@ public class TermQuery extends Query {
|
||||||
}
|
}
|
||||||
return Explanation.noMatch("no matching term");
|
return Explanation.noMatch("no matching term");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int count(LeafReaderContext context) throws IOException {
|
||||||
|
if (context.reader().hasDeletions() == false) {
|
||||||
|
TermsEnum termsEnum = getTermsEnum(context);
|
||||||
|
// termsEnum is not null if term state is available
|
||||||
|
if (termsEnum != null) {
|
||||||
|
return termsEnum.docFreq();
|
||||||
|
} else {
|
||||||
|
// the term cannot be found in the dictionary so the count is 0
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return super.count(context);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Constructs a query for the term <code>t</code>. */
|
/** Constructs a query for the term <code>t</code>. */
|
||||||
|
|
|
@ -174,6 +174,27 @@ public abstract class Weight implements SegmentCacheable {
|
||||||
return new DefaultBulkScorer(scorer);
|
return new DefaultBulkScorer(scorer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts the number of live documents that match a given {@link Weight#parentQuery} in a leaf.
|
||||||
|
*
|
||||||
|
* <p>The default implementation returns -1 for every query. This indicates that the count could
|
||||||
|
* not be computed in O(1) time.
|
||||||
|
*
|
||||||
|
* <p>Specific query classes should override it to provide other accurate O(1) implementations
|
||||||
|
* (that actually return the count). Look at {@link MatchAllDocsQuery#createWeight(IndexSearcher,
|
||||||
|
* ScoreMode, float)} for an example
|
||||||
|
*
|
||||||
|
* <p>We use this property of the function to to count hits in {@link IndexSearcher#count(Query)}.
|
||||||
|
*
|
||||||
|
* @param context the {@link org.apache.lucene.index.LeafReaderContext} for which to return the
|
||||||
|
* count.
|
||||||
|
* @return integer count of the number of matches
|
||||||
|
* @throws IOException if there is a low-level I/O error
|
||||||
|
*/
|
||||||
|
public int count(LeafReaderContext context) throws IOException {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Just wraps a Scorer and performs top scoring using it.
|
* Just wraps a Scorer and performs top scoring using it.
|
||||||
*
|
*
|
||||||
|
|
|
@ -41,6 +41,7 @@ import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.BooleanClause.Occur;
|
import org.apache.lucene.search.BooleanClause.Occur;
|
||||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.NamedThreadFactory;
|
import org.apache.lucene.util.NamedThreadFactory;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
@ -736,6 +737,45 @@ public class TestBooleanQuery extends LuceneTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LUCENE-9620 Add Weight#count(LeafReaderContext)
|
||||||
|
public void testQueryMatchesCount() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
|
||||||
|
int randomNumDocs = random().nextInt(500);
|
||||||
|
int numMatchingDocs = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < randomNumDocs; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
Field f;
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
f = newTextField("field", "a b c " + random().nextInt(), Field.Store.NO);
|
||||||
|
numMatchingDocs++;
|
||||||
|
} else {
|
||||||
|
f = newTextField("field", String.valueOf(random().nextInt()), Field.Store.NO);
|
||||||
|
}
|
||||||
|
doc.add(f);
|
||||||
|
w.addDocument(doc);
|
||||||
|
}
|
||||||
|
w.commit();
|
||||||
|
|
||||||
|
DirectoryReader reader = w.getReader();
|
||||||
|
final IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
|
||||||
|
BooleanQuery.Builder q = new BooleanQuery.Builder();
|
||||||
|
q.add(new PhraseQuery("field", "a", "b"), Occur.SHOULD);
|
||||||
|
q.add(new TermQuery(new Term("field", "c")), Occur.SHOULD);
|
||||||
|
|
||||||
|
Query builtQuery = q.build();
|
||||||
|
|
||||||
|
assertEquals(searcher.count(builtQuery), numMatchingDocs);
|
||||||
|
final Weight weight = searcher.createWeight(builtQuery, ScoreMode.COMPLETE, 1);
|
||||||
|
// tests that the Weight#count API returns -1 instead of returning the total number of matches
|
||||||
|
assertEquals(weight.count(reader.leaves().get(0)), -1);
|
||||||
|
|
||||||
|
IOUtils.close(reader, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
public void testToString() {
|
public void testToString() {
|
||||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||||
bq.add(new TermQuery(new Term("field", "a")), Occur.SHOULD);
|
bq.add(new TermQuery(new Term("field", "a")), Occur.SHOULD);
|
||||||
|
|
|
@ -36,7 +36,8 @@ public class TestFilterWeight extends LuceneTestCase {
|
||||||
final int modifiers = superClassMethod.getModifiers();
|
final int modifiers = superClassMethod.getModifiers();
|
||||||
if (Modifier.isFinal(modifiers)) continue;
|
if (Modifier.isFinal(modifiers)) continue;
|
||||||
if (Modifier.isStatic(modifiers)) continue;
|
if (Modifier.isStatic(modifiers)) continue;
|
||||||
if (Arrays.asList("bulkScorer", "scorerSupplier").contains(superClassMethod.getName())) {
|
if (Arrays.asList("bulkScorer", "scorerSupplier", "count")
|
||||||
|
.contains(superClassMethod.getName())) {
|
||||||
try {
|
try {
|
||||||
final Method subClassMethod =
|
final Method subClassMethod =
|
||||||
subClass.getDeclaredMethod(
|
subClass.getDeclaredMethod(
|
||||||
|
|
|
@ -103,6 +103,35 @@ public class TestTermQuery extends LuceneTestCase {
|
||||||
IOUtils.close(reader, w, dir);
|
IOUtils.close(reader, w, dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LUCENE-9620 Add Weight#count(LeafReaderContext)
|
||||||
|
public void testQueryMatchesCount() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
|
||||||
|
int randomNumDocs = random().nextInt(500);
|
||||||
|
int numMatchingDocs = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < randomNumDocs; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
doc.add(new StringField("foo", "bar", Store.NO));
|
||||||
|
numMatchingDocs++;
|
||||||
|
}
|
||||||
|
w.addDocument(doc);
|
||||||
|
}
|
||||||
|
w.commit();
|
||||||
|
|
||||||
|
DirectoryReader reader = w.getReader();
|
||||||
|
final IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
|
||||||
|
Query testQuery = new TermQuery(new Term("foo", "bar"));
|
||||||
|
assertEquals(searcher.count(testQuery), numMatchingDocs);
|
||||||
|
final Weight weight = searcher.createWeight(testQuery, ScoreMode.COMPLETE, 1);
|
||||||
|
assertEquals(weight.count(reader.leaves().get(0)), numMatchingDocs);
|
||||||
|
|
||||||
|
IOUtils.close(reader, w, dir);
|
||||||
|
}
|
||||||
|
|
||||||
public void testGetTermStates() throws Exception {
|
public void testGetTermStates() throws Exception {
|
||||||
|
|
||||||
// no term states:
|
// no term states:
|
||||||
|
|
Loading…
Reference in New Issue