mirror of https://github.com/apache/lucene.git
LUCENE-10206 Implement O(1) count on query cache (#415)
When we load a query into the query cache, we always calculate the count of matching documents. This commit uses that count to power the new O(1) `Weight#count` method.
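At its core the change is a small data-structure swap: cache entries hold a `CacheAndCount` (a doc-id set plus the number of documents it contains, recorded while the entry is built) instead of a bare `DocIdSet`. Here is a minimal, self-contained sketch of the idea using plain-Java stand-ins rather than Lucene's actual classes (all names below are illustrative):

import java.util.BitSet;
import java.util.stream.IntStream;

// Illustrative sketch only: pairs a cached match set with its cardinality,
// computed once while the cache entry is built, so count() is O(1) later.
final class CountedMatches {
  private final BitSet matches; // stands in for Lucene's DocIdSet
  private final int count; // cardinality, recorded while the entry is built

  CountedMatches(BitSet matches, int count) {
    this.matches = matches;
    this.count = count;
  }

  // Count each hit as it is collected, so no second pass is ever needed.
  static CountedMatches collect(IntStream matchingDocs, int maxDoc) {
    BitSet bits = new BitSet(maxDoc);
    int[] count = new int[1];
    matchingDocs.forEach(
        doc -> {
          bits.set(doc);
          count[0]++;
        });
    return new CountedMatches(bits, count[0]);
  }

  int count() {
    return count; // O(1): the work was done when the entry was cached
  }

  public static void main(String[] args) {
    CountedMatches cached = CountedMatches.collect(IntStream.of(2, 5, 7), 10);
    System.out.println(cached.count()); // 3, without iterating the set
  }
}

Because the cardinality is fixed when the entry is created, a later count never has to iterate the set.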
commit 941df98c3f
parent 1613355149
lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java

@@ -43,6 +43,7 @@ import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.BitDocIdSet;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.RoaringDocIdSet;
 
 /**
@@ -266,7 +267,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
     }
   }
 
-  DocIdSet get(Query key, IndexReader.CacheHelper cacheHelper) {
+  CacheAndCount get(Query key, IndexReader.CacheHelper cacheHelper) {
     assert lock.isHeldByCurrentThread();
     assert key instanceof BoostQuery == false;
     assert key instanceof ConstantScoreQuery == false;
@@ -282,7 +283,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
       onMiss(readerKey, key);
       return null;
     }
-    final DocIdSet cached = leafCache.get(singleton);
+    final CacheAndCount cached = leafCache.get(singleton);
     if (cached == null) {
       onMiss(readerKey, singleton);
     } else {
@@ -291,7 +292,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
     return cached;
   }
 
-  private void putIfAbsent(Query query, DocIdSet set, IndexReader.CacheHelper cacheHelper) {
+  private void putIfAbsent(Query query, CacheAndCount cached, IndexReader.CacheHelper cacheHelper) {
     assert query instanceof BoostQuery == false;
     assert query instanceof ConstantScoreQuery == false;
     // under a lock to make sure that mostRecentlyUsedQueries and cache remain sync'ed
@@ -318,7 +319,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
         // we just created a new leaf cache, need to register a close listener
         cacheHelper.addClosedListener(this::clearCoreCacheKey);
       }
-      leafCache.putIfAbsent(query, set);
+      leafCache.putIfAbsent(query, cached);
       evictIfNecessary();
     } finally {
       lock.unlock();
@@ -437,8 +438,8 @@ public class LRUQueryCache implements QueryCache, Accountable {
     recomputedRamBytesUsed += mostRecentlyUsedQueries.size() * QUERY_DEFAULT_RAM_BYTES_USED;
     for (LeafCache leafCache : cache.values()) {
       recomputedRamBytesUsed += HASHTABLE_RAM_BYTES_PER_ENTRY * leafCache.cache.size();
-      for (DocIdSet set : leafCache.cache.values()) {
-        recomputedRamBytesUsed += set.ramBytesUsed();
+      for (CacheAndCount cached : leafCache.cache.values()) {
+        recomputedRamBytesUsed += cached.ramBytesUsed();
       }
     }
     if (recomputedRamBytesUsed != ramBytesUsed) {
@@ -498,7 +499,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
   * Default cache implementation: uses {@link RoaringDocIdSet} for sets that have a density < 1%
   * and a {@link BitDocIdSet} over a {@link FixedBitSet} otherwise.
   */
-  protected DocIdSet cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
+  protected CacheAndCount cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
     if (scorer.cost() * 100 >= maxDoc) {
       // FixedBitSet is faster for dense sets and will enable the random-access
       // optimization in ConjunctionDISI
@@ -508,9 +509,9 @@ public class LRUQueryCache implements QueryCache, Accountable {
     }
   }
 
-  private static DocIdSet cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
+  private static CacheAndCount cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
     final FixedBitSet bitSet = new FixedBitSet(maxDoc);
-    long[] cost = new long[1];
+    int[] count = new int[1];
     scorer.score(
         new LeafCollector() {
 
@@ -519,15 +520,15 @@ public class LRUQueryCache implements QueryCache, Accountable {
 
           @Override
           public void collect(int doc) throws IOException {
-            cost[0]++;
+            count[0]++;
             bitSet.set(doc);
           }
         },
         null);
-    return new BitDocIdSet(bitSet, cost[0]);
+    return new CacheAndCount(new BitDocIdSet(bitSet, count[0]), count[0]);
   }
 
-  private static DocIdSet cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc)
+  private static CacheAndCount cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc)
       throws IOException {
     RoaringDocIdSet.Builder builder = new RoaringDocIdSet.Builder(maxDoc);
     scorer.score(
@@ -542,7 +543,8 @@ public class LRUQueryCache implements QueryCache, Accountable {
           }
         },
         null);
-    return builder.build();
+    RoaringDocIdSet cache = builder.build();
+    return new CacheAndCount(cache, cache.cardinality());
   }
 
   /**
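Both cache builders now record the exact count as a side effect of building the set: the dense path tallies each collected document (the old `cost[0]` estimate becomes an exact `count[0]`), while the sparse path asks the finished `RoaringDocIdSet` for its `cardinality()`. A rough plain-Java sketch of the two strategies, with `java.util.BitSet` standing in for Lucene's bitsets (names are illustrative, not Lucene's API):

import java.util.BitSet;

// Plain-Java stand-ins for the two strategies above: the dense path tallies
// hits as they are collected; the sparse path asks the finished set.
final class CountingStrategies {
  // Dense path: increment a counter per collected doc, like count[0]++ above.
  static int countWhileCollecting(int[] docs, int maxDoc) {
    BitSet bits = new BitSet(maxDoc);
    int count = 0;
    for (int doc : docs) {
      bits.set(doc); // assumes each doc is collected exactly once
      count++;
    }
    return count;
  }

  // Sparse path: build first, then read the cardinality, analogous to
  // RoaringDocIdSet#cardinality() above.
  static int countAfterBuilding(int[] docs, int maxDoc) {
    BitSet bits = new BitSet(maxDoc);
    for (int doc : docs) {
      bits.set(doc);
    }
    return bits.cardinality();
  }

  public static void main(String[] args) {
    int[] docs = {1, 4, 6};
    System.out.println(countWhileCollecting(docs, 10)); // 3
    System.out.println(countAfterBuilding(docs, 10)); // 3
  }
}

Either way, the count is paid for once, at caching time, instead of on every call to count.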
@@ -622,7 +624,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
   private class LeafCache implements Accountable {
 
     private final Object key;
-    private final Map<Query, DocIdSet> cache;
+    private final Map<Query, CacheAndCount> cache;
     private volatile long ramBytesUsed;
 
     LeafCache(Object key) {
@@ -641,25 +643,25 @@ public class LRUQueryCache implements QueryCache, Accountable {
       LRUQueryCache.this.onDocIdSetEviction(key, 1, ramBytesUsed);
     }
 
-    DocIdSet get(Query query) {
+    CacheAndCount get(Query query) {
       assert query instanceof BoostQuery == false;
       assert query instanceof ConstantScoreQuery == false;
       return cache.get(query);
     }
 
-    void putIfAbsent(Query query, DocIdSet set) {
+    void putIfAbsent(Query query, CacheAndCount cached) {
       assert query instanceof BoostQuery == false;
       assert query instanceof ConstantScoreQuery == false;
-      if (cache.putIfAbsent(query, set) == null) {
+      if (cache.putIfAbsent(query, cached) == null) {
         // the set was actually put
-        onDocIdSetCache(HASHTABLE_RAM_BYTES_PER_ENTRY + set.ramBytesUsed());
+        onDocIdSetCache(HASHTABLE_RAM_BYTES_PER_ENTRY + cached.ramBytesUsed());
       }
     }
 
     void remove(Query query) {
       assert query instanceof BoostQuery == false;
       assert query instanceof ConstantScoreQuery == false;
-      DocIdSet removed = cache.remove(query);
+      CacheAndCount removed = cache.remove(query);
       if (removed != null) {
         onDocIdSetEviction(HASHTABLE_RAM_BYTES_PER_ENTRY + removed.ramBytesUsed());
       }
@@ -703,10 +705,10 @@ public class LRUQueryCache implements QueryCache, Accountable {
       return worstCaseRamUsage * 5 < totalRamAvailable;
     }
 
-    private DocIdSet cache(LeafReaderContext context) throws IOException {
+    private CacheAndCount cache(LeafReaderContext context) throws IOException {
      final BulkScorer scorer = in.bulkScorer(context);
      if (scorer == null) {
-        return DocIdSet.EMPTY;
+        return CacheAndCount.EMPTY;
      } else {
        return cacheImpl(scorer, context.reader().maxDoc());
      }
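Note the sentinel convention this preserves: a `null` from the cache means the query has never been cached on this segment, while the shared `CacheAndCount.EMPTY` instance means it is cached and known to match nothing. A small hypothetical sketch of the same convention (illustrative names, not Lucene's API):

import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch of the null-vs-EMPTY convention: null means "never
// cached"; the shared EMPTY sentinel means "cached, and known to match
// nothing", which lets callers bail out cheaply.
final class SentinelCacheDemo {
  static final class Entry {
    static final Entry EMPTY = new Entry(0);
    final int count;

    Entry(int count) {
      this.count = count;
    }
  }

  private final Map<String, Entry> cache = new HashMap<>();

  // Returns null when the query was never cached; otherwise an O(1) count.
  Integer fastCount(String query) {
    Entry cached = cache.get(query);
    return cached == null ? null : cached.count;
  }

  public static void main(String[] args) {
    SentinelCacheDemo demo = new SentinelCacheDemo();
    demo.cache.put("matches-nothing", Entry.EMPTY);
    System.out.println(demo.fastCount("matches-nothing")); // 0: cached as empty
    System.out.println(demo.fastCount("never-seen")); // null: not cached
  }
}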
@@ -747,18 +749,18 @@ public class LRUQueryCache implements QueryCache, Accountable {
         return in.scorerSupplier(context);
       }
 
-      DocIdSet docIdSet;
+      CacheAndCount cached;
       try {
-        docIdSet = get(in.getQuery(), cacheHelper);
+        cached = get(in.getQuery(), cacheHelper);
       } finally {
         lock.unlock();
       }
 
-      if (docIdSet == null) {
+      if (cached == null) {
         if (policy.shouldCache(in.getQuery())) {
           final ScorerSupplier supplier = in.scorerSupplier(context);
           if (supplier == null) {
-            putIfAbsent(in.getQuery(), DocIdSet.EMPTY, cacheHelper);
+            putIfAbsent(in.getQuery(), CacheAndCount.EMPTY, cacheHelper);
             return null;
           }
@@ -772,10 +774,10 @@ public class LRUQueryCache implements QueryCache, Accountable {
       }
 
       Scorer scorer = supplier.get(Long.MAX_VALUE);
-      DocIdSet docIdSet =
+      CacheAndCount cached =
           cacheImpl(new DefaultBulkScorer(scorer), context.reader().maxDoc());
-      putIfAbsent(in.getQuery(), docIdSet, cacheHelper);
-      DocIdSetIterator disi = docIdSet.iterator();
+      putIfAbsent(in.getQuery(), cached, cacheHelper);
+      DocIdSetIterator disi = cached.iterator();
       if (disi == null) {
         // docIdSet.iterator() is allowed to return null when empty but we want a non-null
         // iterator here
@@ -796,11 +798,11 @@ public class LRUQueryCache implements QueryCache, Accountable {
         }
       }
 
-      assert docIdSet != null;
-      if (docIdSet == DocIdSet.EMPTY) {
+      assert cached != null;
+      if (cached == CacheAndCount.EMPTY) {
         return null;
       }
-      final DocIdSetIterator disi = docIdSet.iterator();
+      final DocIdSetIterator disi = cached.iterator();
       if (disi == null) {
         return null;
       }
@@ -830,7 +832,55 @@ public class LRUQueryCache implements QueryCache, Accountable {
 
     @Override
     public int count(LeafReaderContext context) throws IOException {
-      return in.count(context);
+      // If the wrapped weight can count quickly then use that
+      int innerCount = in.count(context);
+      if (innerCount != -1) {
+        return innerCount;
+      }
+
+      // Our cache won't have an accurate count if there are deletions
+      if (context.reader().hasDeletions()) {
+        return -1;
+      }
+
+      // Otherwise check if the count is in the cache
+      if (used.compareAndSet(false, true)) {
+        policy.onUse(getQuery());
+      }
+
+      if (in.isCacheable(context) == false) {
+        // this segment is not suitable for caching
+        return -1;
+      }
+
+      // Short-circuit: Check whether this segment is eligible for caching
+      // before we take a lock because of #get
+      if (shouldCache(context) == false) {
+        return -1;
+      }
+
+      final IndexReader.CacheHelper cacheHelper = context.reader().getCoreCacheHelper();
+      if (cacheHelper == null) {
+        // this reader has no cacheHelper
+        return -1;
+      }
+
+      // If the lock is already busy, prefer using the uncached version than waiting
+      if (lock.tryLock() == false) {
+        return -1;
+      }
+
+      CacheAndCount cached;
+      try {
+        cached = get(in.getQuery(), cacheHelper);
+      } finally {
+        lock.unlock();
+      }
+      if (cached == null) {
+        // Not cached
+        return -1;
+      }
+      return cached.count();
     }
 
     @Override
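The new `count` implementation is a chain of cheap refusals: use the wrapped weight's count if it is already O(1), give up when deletions would make the cached count wrong, skip segments that are not cacheable, and never block on the cache lock; only then is the cached count returned. A self-contained sketch of that chain using plain-Java stand-ins (none of these names are Lucene's):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;

// Sketch of the fallback chain above: every guard either answers immediately
// or declines with -1, and the cache is only consulted when it is both
// correct (no deletions) and cheap (the lock is free).
final class FastCountSketch {
  static final class Cached {
    final int count;

    Cached(int count) {
      this.count = count;
    }
  }

  final Map<String, Cached> cache = new ConcurrentHashMap<>();
  private final ReentrantLock lock = new ReentrantLock();

  int count(String query, int innerCount, boolean hasDeletions) {
    if (innerCount != -1) {
      return innerCount; // the wrapped weight could already count in O(1)
    }
    if (hasDeletions) {
      return -1; // a cached count ignores deletions, so it would be wrong
    }
    if (lock.tryLock() == false) {
      return -1; // prefer the uncached path over waiting on the lock
    }
    try {
      Cached cached = cache.get(query);
      return cached == null ? -1 : cached.count; // O(1) when present
    } finally {
      lock.unlock();
    }
  }

  public static void main(String[] args) {
    FastCountSketch sketch = new FastCountSketch();
    System.out.println(sketch.count("q", -1, false)); // -1: not cached yet
    sketch.cache.put("q", new Cached(42));
    System.out.println(sketch.count("q", -1, false)); // 42: answered from cache
    System.out.println(sketch.count("q", -1, true)); // -1: deletions present
  }
}

Returning -1 rather than throwing keeps the method a pure optimization: callers always have the iterate-and-count path to fall back on.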
@@ -866,27 +916,27 @@ public class LRUQueryCache implements QueryCache, Accountable {
         return in.bulkScorer(context);
       }
 
-      DocIdSet docIdSet;
+      CacheAndCount cached;
       try {
-        docIdSet = get(in.getQuery(), cacheHelper);
+        cached = get(in.getQuery(), cacheHelper);
       } finally {
         lock.unlock();
       }
 
-      if (docIdSet == null) {
+      if (cached == null) {
         if (policy.shouldCache(in.getQuery())) {
-          docIdSet = cache(context);
-          putIfAbsent(in.getQuery(), docIdSet, cacheHelper);
+          cached = cache(context);
+          putIfAbsent(in.getQuery(), cached, cacheHelper);
         } else {
           return in.bulkScorer(context);
         }
       }
 
-      assert docIdSet != null;
-      if (docIdSet == DocIdSet.EMPTY) {
+      assert cached != null;
+      if (cached == CacheAndCount.EMPTY) {
         return null;
       }
-      final DocIdSetIterator disi = docIdSet.iterator();
+      final DocIdSetIterator disi = cached.iterator();
       if (disi == null) {
         return null;
       }
@@ -895,4 +945,32 @@ public class LRUQueryCache implements QueryCache, Accountable {
           new ConstantScoreScorer(this, 0f, ScoreMode.COMPLETE_NO_SCORES, disi));
     }
   }
 
+  /** Cache of doc ids with a count. */
+  protected static class CacheAndCount implements Accountable {
+    protected static final CacheAndCount EMPTY = new CacheAndCount(DocIdSet.EMPTY, 0);
+
+    private static final long BASE_RAM_BYTES_USED =
+        RamUsageEstimator.shallowSizeOfInstance(CacheAndCount.class);
+    private final DocIdSet cache;
+    private final int count;
+
+    public CacheAndCount(DocIdSet cache, int count) {
+      this.cache = cache;
+      this.count = count;
+    }
+
+    public DocIdSetIterator iterator() throws IOException {
+      return cache.iterator();
+    }
+
+    public int count() {
+      return count;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return BASE_RAM_BYTES_USED + cache.ramBytesUsed();
+    }
+  }
 }
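A design note, as one reading of the class above: `CacheAndCount` is immutable, with the count fixed at construction time, which is what makes serving it from `Weight#count` safe without re-reading the set. Its `ramBytesUsed()` adds the wrapper's shallow size (via `RamUsageEstimator.shallowSizeOfInstance`) to whatever the wrapped `DocIdSet` already reports, so the cache's memory accounting stays accurate after the change.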
lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java

@@ -62,6 +62,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Constants;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
@@ -1088,7 +1089,18 @@ public class TestLRUQueryCache extends LuceneTestCase {
       cachedSearcher.setQueryCachingPolicy(ALWAYS_CACHE);
     }
     final Query q = buildRandomQuery(0);
+    /*
+     * Counts are the same. If the query has already been cached
+     * this'll use the O(1) Weight#count method.
+     */
     assertEquals(uncachedSearcher.count(q), cachedSearcher.count(q));
+    /*
+     * Just to make sure we can iterate every time also check that the
+     * same docs are returned in the same order.
+     */
+    int size = 1 + random().nextInt(1000);
+    CheckHits.checkEqual(
+        q, uncachedSearcher.search(q, size).scoreDocs, cachedSearcher.search(q, size).scoreDocs);
     if (rarely()) {
       queryCache.assertConsistent();
     }
@@ -1976,4 +1988,60 @@ public class TestLRUQueryCache extends LuceneTestCase {
     reader.close();
     dir.close();
   }
+
+  public void testCacheHasFastCount() throws IOException {
+    Query query = new PhraseQuery("words", new BytesRef("alice"), new BytesRef("ran"));
+
+    Directory dir = newDirectory();
+    RandomIndexWriter w =
+        new RandomIndexWriter(
+            random(), dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE));
+    Document doc1 = new Document();
+    doc1.add(new TextField("words", "tom ran", Store.NO));
+    Document doc2 = new Document();
+    doc2.add(new TextField("words", "alice ran", Store.NO));
+    doc2.add(new StringField("f", "a", Store.NO));
+    Document doc3 = new Document();
+    doc3.add(new TextField("words", "alice ran", Store.NO));
+    doc3.add(new StringField("f", "b", Store.NO));
+    w.addDocuments(Arrays.asList(doc1, doc2, doc3));
+
+    try (IndexReader reader = w.getReader()) {
+      IndexSearcher searcher = newSearcher(reader);
+      searcher.setQueryCachingPolicy(ALWAYS_CACHE);
+      LRUQueryCache allCache =
+          new LRUQueryCache(1000000, 10000000, context -> true, Float.POSITIVE_INFINITY);
+      searcher.setQueryCache(allCache);
+      Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
+      LeafReaderContext context = getOnlyLeafReader(reader).getContext();
+      // We don't have a fast count before the cache is filled
+      assertEquals(-1, weight.count(context));
+      // Fetch the scorer to populate the cache
+      weight.scorer(context);
+      assertEquals(List.of(query), allCache.cachedQueries());
+      // Now we *do* have a fast count
+      assertEquals(2, weight.count(context));
+    }
+
+    w.deleteDocuments(new TermQuery(new Term("f", new BytesRef("b"))));
+    try (IndexReader reader = w.getReader()) {
+      IndexSearcher searcher = newSearcher(reader);
+      searcher.setQueryCachingPolicy(ALWAYS_CACHE);
+      LRUQueryCache allCache =
+          new LRUQueryCache(1000000, 10000000, context -> true, Float.POSITIVE_INFINITY);
+      searcher.setQueryCache(allCache);
+      Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
+      LeafReaderContext context = getOnlyLeafReader(reader).getContext();
+      // We don't have a fast count before the cache is filled
+      assertEquals(-1, weight.count(context));
+      // Fetch the scorer to populate the cache
+      weight.scorer(context);
+      assertEquals(List.of(query), allCache.cachedQueries());
+      // We still don't have a fast count because we have deleted documents
+      assertEquals(-1, weight.count(context));
+    }
+
+    w.close();
+    dir.close();
+  }
 }