LUCENE-10206 Implement O(1) count on query cache (#415)

When we load a query into the query cache we always calculate the count
of matching documents. This commit uses that count to power the new `O(1)`
`Weight#count` method.
Nik Everett 2021-10-27 04:20:10 -04:00 committed by GitHub
parent 1613355149
commit 941df98c3f
2 changed files with 186 additions and 40 deletions
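
For context on how this count gets used, here is a minimal caller-side sketch (not the actual IndexSearcher code; the class and method names are hypothetical): a searcher can sum `Weight#count` across leaves and fall back to iterating matches only when a leaf answers -1.

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

class FastCountSketch {
  // Sums per-leaf Weight#count results; -1 means "unknown without iterating".
  static int fastCount(IndexSearcher searcher, Query query) throws IOException {
    Weight weight =
        searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1f);
    int total = 0;
    for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
      int leafCount = weight.count(leaf); // O(1) when the query cache holds this entry
      if (leafCount == -1) {
        return -1; // this leaf would need to iterate its matches
      }
      total += leafCount;
    }
    return total;
  }
}

With this commit, that fast path succeeds for any query the LRUQueryCache has already cached on a segment without deletions.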

lucene/core/src/java/org/apache/lucene/search/LRUQueryCache.java

@@ -43,6 +43,7 @@ import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.FixedBitSet;
+ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.RoaringDocIdSet;
/**
@@ -266,7 +267,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
}
}
- DocIdSet get(Query key, IndexReader.CacheHelper cacheHelper) {
+ CacheAndCount get(Query key, IndexReader.CacheHelper cacheHelper) {
assert lock.isHeldByCurrentThread();
assert key instanceof BoostQuery == false;
assert key instanceof ConstantScoreQuery == false;
@@ -282,7 +283,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
onMiss(readerKey, key);
return null;
}
- final DocIdSet cached = leafCache.get(singleton);
+ final CacheAndCount cached = leafCache.get(singleton);
if (cached == null) {
onMiss(readerKey, singleton);
} else {
@@ -291,7 +292,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
return cached;
}
- private void putIfAbsent(Query query, DocIdSet set, IndexReader.CacheHelper cacheHelper) {
+ private void putIfAbsent(Query query, CacheAndCount cached, IndexReader.CacheHelper cacheHelper) {
assert query instanceof BoostQuery == false;
assert query instanceof ConstantScoreQuery == false;
// under a lock to make sure that mostRecentlyUsedQueries and cache remain sync'ed
@@ -318,7 +319,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
// we just created a new leaf cache, need to register a close listener
cacheHelper.addClosedListener(this::clearCoreCacheKey);
}
- leafCache.putIfAbsent(query, set);
+ leafCache.putIfAbsent(query, cached);
evictIfNecessary();
} finally {
lock.unlock();
@@ -437,8 +438,8 @@ public class LRUQueryCache implements QueryCache, Accountable {
recomputedRamBytesUsed += mostRecentlyUsedQueries.size() * QUERY_DEFAULT_RAM_BYTES_USED;
for (LeafCache leafCache : cache.values()) {
recomputedRamBytesUsed += HASHTABLE_RAM_BYTES_PER_ENTRY * leafCache.cache.size();
- for (DocIdSet set : leafCache.cache.values()) {
- recomputedRamBytesUsed += set.ramBytesUsed();
+ for (CacheAndCount cached : leafCache.cache.values()) {
+ recomputedRamBytesUsed += cached.ramBytesUsed();
}
}
if (recomputedRamBytesUsed != ramBytesUsed) {
@@ -498,7 +499,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
* Default cache implementation: uses {@link RoaringDocIdSet} for sets that have a density < 1%
* and a {@link BitDocIdSet} over a {@link FixedBitSet} otherwise.
*/
- protected DocIdSet cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
+ protected CacheAndCount cacheImpl(BulkScorer scorer, int maxDoc) throws IOException {
if (scorer.cost() * 100 >= maxDoc) {
// FixedBitSet is faster for dense sets and will enable the random-access
// optimization in ConjunctionDISI
@@ -508,9 +509,9 @@ public class LRUQueryCache implements QueryCache, Accountable {
}
}
- private static DocIdSet cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
+ private static CacheAndCount cacheIntoBitSet(BulkScorer scorer, int maxDoc) throws IOException {
final FixedBitSet bitSet = new FixedBitSet(maxDoc);
- long[] cost = new long[1];
+ int[] count = new int[1];
scorer.score(
new LeafCollector() {
@@ -519,15 +520,15 @@ public class LRUQueryCache implements QueryCache, Accountable {
@Override
public void collect(int doc) throws IOException {
- cost[0]++;
+ count[0]++;
bitSet.set(doc);
}
},
null);
- return new BitDocIdSet(bitSet, cost[0]);
+ return new CacheAndCount(new BitDocIdSet(bitSet, count[0]), count[0]);
}
- private static DocIdSet cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc)
+ private static CacheAndCount cacheIntoRoaringDocIdSet(BulkScorer scorer, int maxDoc)
throws IOException {
RoaringDocIdSet.Builder builder = new RoaringDocIdSet.Builder(maxDoc);
scorer.score(
@@ -542,7 +543,8 @@ public class LRUQueryCache implements QueryCache, Accountable {
}
},
null);
- return builder.build();
+ RoaringDocIdSet cache = builder.build();
+ return new CacheAndCount(cache, cache.cardinality());
}
/**
@@ -622,7 +624,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
private class LeafCache implements Accountable {
private final Object key;
- private final Map<Query, DocIdSet> cache;
+ private final Map<Query, CacheAndCount> cache;
private volatile long ramBytesUsed;
LeafCache(Object key) {
@@ -641,25 +643,25 @@ public class LRUQueryCache implements QueryCache, Accountable {
LRUQueryCache.this.onDocIdSetEviction(key, 1, ramBytesUsed);
}
- DocIdSet get(Query query) {
+ CacheAndCount get(Query query) {
assert query instanceof BoostQuery == false;
assert query instanceof ConstantScoreQuery == false;
return cache.get(query);
}
- void putIfAbsent(Query query, DocIdSet set) {
+ void putIfAbsent(Query query, CacheAndCount cached) {
assert query instanceof BoostQuery == false;
assert query instanceof ConstantScoreQuery == false;
- if (cache.putIfAbsent(query, set) == null) {
+ if (cache.putIfAbsent(query, cached) == null) {
// the set was actually put
- onDocIdSetCache(HASHTABLE_RAM_BYTES_PER_ENTRY + set.ramBytesUsed());
+ onDocIdSetCache(HASHTABLE_RAM_BYTES_PER_ENTRY + cached.ramBytesUsed());
}
}
void remove(Query query) {
assert query instanceof BoostQuery == false;
assert query instanceof ConstantScoreQuery == false;
- DocIdSet removed = cache.remove(query);
+ CacheAndCount removed = cache.remove(query);
if (removed != null) {
onDocIdSetEviction(HASHTABLE_RAM_BYTES_PER_ENTRY + removed.ramBytesUsed());
}
@@ -703,10 +705,10 @@ public class LRUQueryCache implements QueryCache, Accountable {
return worstCaseRamUsage * 5 < totalRamAvailable;
}
- private DocIdSet cache(LeafReaderContext context) throws IOException {
+ private CacheAndCount cache(LeafReaderContext context) throws IOException {
final BulkScorer scorer = in.bulkScorer(context);
if (scorer == null) {
- return DocIdSet.EMPTY;
+ return CacheAndCount.EMPTY;
} else {
return cacheImpl(scorer, context.reader().maxDoc());
}
@@ -747,18 +749,18 @@ public class LRUQueryCache implements QueryCache, Accountable {
return in.scorerSupplier(context);
}
- DocIdSet docIdSet;
+ CacheAndCount cached;
try {
- docIdSet = get(in.getQuery(), cacheHelper);
+ cached = get(in.getQuery(), cacheHelper);
} finally {
lock.unlock();
}
- if (docIdSet == null) {
+ if (cached == null) {
if (policy.shouldCache(in.getQuery())) {
final ScorerSupplier supplier = in.scorerSupplier(context);
if (supplier == null) {
- putIfAbsent(in.getQuery(), DocIdSet.EMPTY, cacheHelper);
+ putIfAbsent(in.getQuery(), CacheAndCount.EMPTY, cacheHelper);
return null;
}
@@ -772,10 +774,10 @@ public class LRUQueryCache implements QueryCache, Accountable {
}
Scorer scorer = supplier.get(Long.MAX_VALUE);
- DocIdSet docIdSet =
+ CacheAndCount cached =
cacheImpl(new DefaultBulkScorer(scorer), context.reader().maxDoc());
- putIfAbsent(in.getQuery(), docIdSet, cacheHelper);
- DocIdSetIterator disi = docIdSet.iterator();
+ putIfAbsent(in.getQuery(), cached, cacheHelper);
+ DocIdSetIterator disi = cached.iterator();
if (disi == null) {
// docIdSet.iterator() is allowed to return null when empty but we want a non-null
// iterator here
@@ -796,11 +798,11 @@ public class LRUQueryCache implements QueryCache, Accountable {
}
}
- assert docIdSet != null;
- if (docIdSet == DocIdSet.EMPTY) {
+ assert cached != null;
+ if (cached == CacheAndCount.EMPTY) {
return null;
}
- final DocIdSetIterator disi = docIdSet.iterator();
+ final DocIdSetIterator disi = cached.iterator();
if (disi == null) {
return null;
}
@@ -830,7 +832,55 @@ public class LRUQueryCache implements QueryCache, Accountable {
@Override
public int count(LeafReaderContext context) throws IOException {
- return in.count(context);
+ // If the wrapped weight can count quickly then use that
+ int innerCount = in.count(context);
+ if (innerCount != -1) {
+ return innerCount;
+ }
+ // Our cache won't have an accurate count if there are deletions
+ if (context.reader().hasDeletions()) {
+ return -1;
+ }
+ // Otherwise check if the count is in the cache
+ if (used.compareAndSet(false, true)) {
+ policy.onUse(getQuery());
+ }
+ if (in.isCacheable(context) == false) {
+ // this segment is not suitable for caching
+ return -1;
+ }
+ // Short-circuit: Check whether this segment is eligible for caching
+ // before we take a lock because of #get
+ if (shouldCache(context) == false) {
+ return -1;
+ }
+ final IndexReader.CacheHelper cacheHelper = context.reader().getCoreCacheHelper();
+ if (cacheHelper == null) {
+ // this reader has no cacheHelper
+ return -1;
+ }
+ // If the lock is already busy, prefer using the uncached version than waiting
+ if (lock.tryLock() == false) {
+ return -1;
+ }
+ CacheAndCount cached;
+ try {
+ cached = get(in.getQuery(), cacheHelper);
+ } finally {
+ lock.unlock();
+ }
+ if (cached == null) {
+ // Not cached
+ return -1;
+ }
+ return cached.count();
}
@Override
@@ -866,27 +916,27 @@ public class LRUQueryCache implements QueryCache, Accountable {
return in.bulkScorer(context);
}
- DocIdSet docIdSet;
+ CacheAndCount cached;
try {
- docIdSet = get(in.getQuery(), cacheHelper);
+ cached = get(in.getQuery(), cacheHelper);
} finally {
lock.unlock();
}
- if (docIdSet == null) {
+ if (cached == null) {
if (policy.shouldCache(in.getQuery())) {
- docIdSet = cache(context);
- putIfAbsent(in.getQuery(), docIdSet, cacheHelper);
+ cached = cache(context);
+ putIfAbsent(in.getQuery(), cached, cacheHelper);
} else {
return in.bulkScorer(context);
}
}
- assert docIdSet != null;
- if (docIdSet == DocIdSet.EMPTY) {
+ assert cached != null;
+ if (cached == CacheAndCount.EMPTY) {
return null;
}
- final DocIdSetIterator disi = docIdSet.iterator();
+ final DocIdSetIterator disi = cached.iterator();
if (disi == null) {
return null;
}
@@ -895,4 +945,32 @@ public class LRUQueryCache implements QueryCache, Accountable {
new ConstantScoreScorer(this, 0f, ScoreMode.COMPLETE_NO_SCORES, disi));
}
}
+ /** Cache of doc ids with a count. */
+ protected static class CacheAndCount implements Accountable {
+ protected static final CacheAndCount EMPTY = new CacheAndCount(DocIdSet.EMPTY, 0);
+ private static final long BASE_RAM_BYTES_USED =
+ RamUsageEstimator.shallowSizeOfInstance(CacheAndCount.class);
+ private final DocIdSet cache;
+ private final int count;
+ public CacheAndCount(DocIdSet cache, int count) {
+ this.cache = cache;
+ this.count = count;
+ }
+ public DocIdSetIterator iterator() throws IOException {
+ return cache.iterator();
+ }
+ public int count() {
+ return count;
+ }
+ @Override
+ public long ramBytesUsed() {
+ return BASE_RAM_BYTES_USED + cache.ramBytesUsed();
+ }
+ }
}
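
A design note on the caching strategy above: `cacheImpl` keeps the pre-existing density heuristic; the commit only changes the return type so the count computed while caching travels with the doc id set. A minimal sketch of the cutoff (the helper name is hypothetical, not part of the commit):

// Mirrors the 1% density cutoff in LRUQueryCache#cacheImpl (illustrative only).
static boolean cacheAsFixedBitSet(long cost, int maxDoc) {
  // cost * 100 >= maxDoc  <=>  the scorer expects to match at least 1% of the segment
  return cost * 100 >= maxDoc;
}

For a segment with maxDoc = 1,000,000, a query expected to match 10,000 or more documents is cached as a BitDocIdSet over a FixedBitSet; anything sparser is cached as a RoaringDocIdSet. Either way the matching-document count is now computed during caching and stored in the CacheAndCount entry.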

lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java

@@ -62,6 +62,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Accountable;
+ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
@@ -1088,7 +1089,18 @@ public class TestLRUQueryCache extends LuceneTestCase {
cachedSearcher.setQueryCachingPolicy(ALWAYS_CACHE);
}
final Query q = buildRandomQuery(0);
+ /*
+ * Counts are the same. If the query has already been cached
+ * this'll use the O(1) Weight#count method.
+ */
assertEquals(uncachedSearcher.count(q), cachedSearcher.count(q));
+ /*
+ * Just to make sure we can iterate every time also check that the
+ * same docs are returned in the same order.
+ */
+ int size = 1 + random().nextInt(1000);
+ CheckHits.checkEqual(
+ q, uncachedSearcher.search(q, size).scoreDocs, cachedSearcher.search(q, size).scoreDocs);
if (rarely()) {
queryCache.assertConsistent();
}
@@ -1976,4 +1988,60 @@ public class TestLRUQueryCache extends LuceneTestCase {
reader.close();
dir.close();
}
+ public void testCacheHasFastCount() throws IOException {
+ Query query = new PhraseQuery("words", new BytesRef("alice"), new BytesRef("ran"));
+ Directory dir = newDirectory();
+ RandomIndexWriter w =
+ new RandomIndexWriter(
+ random(), dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE));
+ Document doc1 = new Document();
+ doc1.add(new TextField("words", "tom ran", Store.NO));
+ Document doc2 = new Document();
+ doc2.add(new TextField("words", "alice ran", Store.NO));
+ doc2.add(new StringField("f", "a", Store.NO));
+ Document doc3 = new Document();
+ doc3.add(new TextField("words", "alice ran", Store.NO));
+ doc3.add(new StringField("f", "b", Store.NO));
+ w.addDocuments(Arrays.asList(doc1, doc2, doc3));
+ try (IndexReader reader = w.getReader()) {
+ IndexSearcher searcher = newSearcher(reader);
+ searcher.setQueryCachingPolicy(ALWAYS_CACHE);
+ LRUQueryCache allCache =
+ new LRUQueryCache(1000000, 10000000, context -> true, Float.POSITIVE_INFINITY);
+ searcher.setQueryCache(allCache);
+ Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
+ LeafReaderContext context = getOnlyLeafReader(reader).getContext();
+ // We don't have a fast count before the cache is filled
+ assertEquals(-1, weight.count(context));
+ // Fetch the scorer to populate the cache
+ weight.scorer(context);
+ assertEquals(List.of(query), allCache.cachedQueries());
+ // Now we *do* have a fast count
+ assertEquals(2, weight.count(context));
+ }
+ w.deleteDocuments(new TermQuery(new Term("f", new BytesRef("b"))));
+ try (IndexReader reader = w.getReader()) {
+ IndexSearcher searcher = newSearcher(reader);
+ searcher.setQueryCachingPolicy(ALWAYS_CACHE);
+ LRUQueryCache allCache =
+ new LRUQueryCache(1000000, 10000000, context -> true, Float.POSITIVE_INFINITY);
+ searcher.setQueryCache(allCache);
+ Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
+ LeafReaderContext context = getOnlyLeafReader(reader).getContext();
+ // We don't have a fast count before the cache is filled
+ assertEquals(-1, weight.count(context));
+ // Fetch the scorer to populate the cache
+ weight.scorer(context);
+ assertEquals(List.of(query), allCache.cachedQueries());
+ // We still don't have a fast count because we have deleted documents
+ assertEquals(-1, weight.count(context));
+ }
+ w.close();
+ dir.close();
+ }
}