LUCENE-2468: make CachingWrapperFilter treatment of new deletes customizable

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@949288 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2010-05-28 20:45:29 +00:00
parent 7e5a4cd67e
commit 85a977bf9d
13 changed files with 453 additions and 80 deletions

View File: CHANGES.txt

@ -425,6 +425,18 @@ Bug fixes
* LUCENE-2478: Fix CachingWrapperFilter to not throw NPE when
Filter.getDocIdSet() returns null. (Uwe Schindler, Daniel Noll)
* LUCENE-2468: Allow specifying how new deletions should be handled in
CachingWrapperFilter and CachingSpanFilter. By default, new
deletions are ignored in CachingWrapperFilter, since typically this
filter is AND'd with a query that correctly takes new deletions into
account. This should be a performance gain (higher cache hit rate)
in apps that reopen readers, or use near-real-time readers
(IndexWriter.getReader()), but may introduce invalid search results
(allowing deleted docs to be returned) for certain cases, so a new
expert ctor was added to CachingWrapperFilter to enforce deletions
at a performance cost. CachingSpanFilter by default recaches if
there are new deletions. (Shay Banon via Mike McCandless)
New features
* LUCENE-2128: Parallelized fetching document frequencies during weight
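As a quick illustration of the LUCENE-2468 entry above, the constructors added by this commit might be used as follows (a hedged sketch; the field and term names are hypothetical, only the CachingWrapperFilter, CachingSpanFilter and DeletesMode APIs come from this commit):

// Default: new deletions are ignored, giving the highest cache hit rate across reopens.
Filter byId = new QueryWrapperFilter(new TermQuery(new Term("id", "1")));
Filter cached = new CachingWrapperFilter(byId);

// Expert: enforce deletions, either by recaching whenever a segment's delete set changes ...
Filter recached = new CachingWrapperFilter(byId, CachingWrapperFilter.DeletesMode.RECACHE);
// ... or by dynamically filtering out deleted docs at search time.
Filter dynamic = new CachingWrapperFilter(byId, CachingWrapperFilter.DeletesMode.DYNAMIC);

// CachingSpanFilter defaults to RECACHE; DYNAMIC is not supported for span results.
SpanFilter spans = new CachingSpanFilter(new SpanQueryFilter(new SpanTermQuery(new Term("id", "1"))));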

View File: FilterIndexReader.java

@ -309,8 +309,8 @@ public class FilterIndexReader extends IndexReader {
* contents of the FieldCache, you must override this
* method to provide a different key */
@Override
public Object getFieldCacheKey() {
return in.getFieldCacheKey();
public Object getCoreCacheKey() {
return in.getCoreCacheKey();
}
/** {@inheritDoc} */
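The renamed method keeps the same contract as the old getFieldCacheKey: a subclass that changes what the FieldCache (or any core-keyed cache) would see must supply its own key rather than delegating to the wrapped reader. A minimal, hypothetical sketch:

public class RewritingReader extends FilterIndexReader {
  public RewritingReader(IndexReader in) {
    super(in);
  }

  @Override
  public Object getCoreCacheKey() {
    // Do not reuse the wrapped reader's key: entries cached against 'in'
    // would not be valid for this reader's rewritten view of the fields.
    return this;
  }
}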

View File: IndexReader.java

@ -1367,7 +1367,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
}
/** Expert */
public Object getFieldCacheKey() {
public Object getCoreCacheKey() {
return this;
}

View File: SegmentReader.java

@ -1293,7 +1293,7 @@ public class SegmentReader extends IndexReader implements Cloneable {
// share the underlying postings data) will map to the
// same entry in the FieldCache. See LUCENE-1579.
@Override
public final Object getFieldCacheKey() {
public final Object getCoreCacheKey() {
return core;
}
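The comment above is why caches should key on getCoreCacheKey() rather than on the reader itself: a SegmentReader reopened only because its deletions changed still shares the same core object, so a core-keyed cache keeps its entries across the reopen. A hedged sketch of the pattern (not the actual FieldCache code):

import java.util.Map;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader;

public class CoreKeyedCache<V> {
  private final Map<Object, V> byCore = new WeakHashMap<Object, V>();

  public synchronized V get(IndexReader reader) {
    // Reopened readers that share postings return the same core key,
    // so this lookup still hits even though the reader instance changed.
    return byCore.get(reader.getCoreCacheKey());
  }

  public synchronized void put(IndexReader reader, V value) {
    byCore.put(reader.getCoreCacheKey(), value);
  }
}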

View File: CachingSpanFilter.java

@ -17,11 +17,10 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.Bits;
import java.io.IOException;
import java.util.Map;
import java.util.WeakHashMap;
import java.util.concurrent.locks.ReentrantLock;
/**
* Wraps another SpanFilter's result and caches it. The purpose is to allow
@ -33,15 +32,32 @@ public class CachingSpanFilter extends SpanFilter {
/**
* A transient Filter cache (package private because of test)
*/
private transient Map<IndexReader,SpanFilterResult> cache;
private final ReentrantLock lock = new ReentrantLock();
private final CachingWrapperFilter.FilterCache<SpanFilterResult> cache;
/**
* New deletions always result in a cache miss, by default
* ({@link CachingWrapperFilter.DeletesMode#RECACHE}).
* @param filter Filter to cache results of
*/
public CachingSpanFilter(SpanFilter filter) {
this(filter, CachingWrapperFilter.DeletesMode.RECACHE);
}
/**
* @param filter Filter to cache results of
* @param deletesMode See {@link CachingWrapperFilter.DeletesMode}
*/
public CachingSpanFilter(SpanFilter filter, CachingWrapperFilter.DeletesMode deletesMode) {
this.filter = filter;
if (deletesMode == CachingWrapperFilter.DeletesMode.DYNAMIC) {
throw new IllegalArgumentException("DeletesMode.DYNAMIC is not supported");
}
this.cache = new CachingWrapperFilter.FilterCache<SpanFilterResult>(deletesMode) {
@Override
protected SpanFilterResult mergeDeletes(final Bits delDocs, final SpanFilterResult value) {
throw new IllegalStateException("DeletesMode.DYNAMIC is not supported");
}
};
}
@Override
@ -50,26 +66,24 @@ public class CachingSpanFilter extends SpanFilter {
return result != null ? result.getDocIdSet() : null;
}
// for testing
int hitCount, missCount;
private SpanFilterResult getCachedResult(IndexReader reader) throws IOException {
lock.lock();
try {
if (cache == null) {
cache = new WeakHashMap<IndexReader,SpanFilterResult>();
}
final SpanFilterResult cached = cache.get(reader);
if (cached != null) return cached;
} finally {
lock.unlock();
final Object coreKey = reader.getCoreCacheKey();
final Object delCoreKey = reader.hasDeletions() ? MultiFields.getDeletedDocs(reader) : coreKey;
SpanFilterResult result = cache.get(reader, coreKey, delCoreKey);
if (result != null) {
hitCount++;
return result;
}
final SpanFilterResult result = filter.bitSpans(reader);
lock.lock();
try {
cache.put(reader, result);
} finally {
lock.unlock();
}
missCount++;
result = filter.bitSpans(reader);
cache.put(coreKey, delCoreKey, result);
return result;
}
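One behavioral detail worth noting from the constructor above: DYNAMIC cannot be combined with cached span results, so it is rejected eagerly. A small, hypothetical usage sketch:

SpanFilter base = new SpanQueryFilter(new SpanTermQuery(new Term("id", "1")));

// Same as the no-arg default: recache whenever a segment's deletions change.
CachingSpanFilter recaching = new CachingSpanFilter(base, CachingWrapperFilter.DeletesMode.RECACHE);

// Highest hit rate, but deleted docs may leak through if nothing else enforces deletions.
CachingSpanFilter ignoring = new CachingSpanFilter(base, CachingWrapperFilter.DeletesMode.IGNORE);

try {
  new CachingSpanFilter(base, CachingWrapperFilter.DeletesMode.DYNAMIC);
} catch (IllegalArgumentException expected) {
  // SpanFilterResult cannot be intersected with deletions on the fly.
}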

View File: CachingWrapperFilter.java

@ -17,13 +17,15 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.Serializable;
import java.io.IOException;
import java.util.Map;
import java.util.WeakHashMap;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.OpenBitSetDISI;
import org.apache.lucene.util.Bits;
/**
* Wraps another filter's result and caches it. The purpose is to allow
@ -33,17 +35,130 @@ public class CachingWrapperFilter extends Filter {
Filter filter;
/**
* A transient Filter cache (package private because of test)
* Expert: Specifies how new deletions against a reopened
* reader should be handled.
*
* <p>The default is IGNORE, which means the cache entry
* will be re-used for a given segment, even when that
* segment has been reopened due to changes in deletions.
* This is a big performance gain, especially with
* near-real-time readers, since you don't hit a cache
* miss on every reopened reader for prior segments.</p>
*
* <p>However, in some cases this can cause invalid query
* results, allowing deleted documents to be returned.
* This only happens if the main query does not rule out
* deleted documents on its own, such as a top-level
* ConstantScoreQuery. To fix this, use RECACHE to
* re-create the cached filter (at a higher per-reopen
* cost, but at faster subsequent search performance), or
* use DYNAMIC to dynamically intersect deleted docs (fast
* reopen time but some hit to search performance).</p>
*/
transient Map<IndexReader,DocIdSet> cache;
private final ReentrantLock lock = new ReentrantLock();
public static enum DeletesMode {IGNORE, RECACHE, DYNAMIC};
protected final FilterCache<DocIdSet> cache;
static abstract class FilterCache<T> implements Serializable {
/**
* A transient Filter cache (package private because of test)
*/
// NOTE: not final so that we can dynamically re-init
// after de-serialize
transient Map<Object,T> cache;
private final DeletesMode deletesMode;
public FilterCache(DeletesMode deletesMode) {
this.deletesMode = deletesMode;
}
public synchronized T get(IndexReader reader, Object coreKey, Object delCoreKey) throws IOException {
T value;
if (cache == null) {
cache = new WeakHashMap<Object,T>();
}
if (deletesMode == DeletesMode.IGNORE) {
// key on core
value = cache.get(coreKey);
} else if (deletesMode == DeletesMode.RECACHE) {
// key on deletes, if any, else core
value = cache.get(delCoreKey);
} else {
assert deletesMode == DeletesMode.DYNAMIC;
// first try for exact match
value = cache.get(delCoreKey);
if (value == null) {
// now for core match, but dynamically AND NOT
// deletions
value = cache.get(coreKey);
if (value != null) {
final Bits delDocs = MultiFields.getDeletedDocs(reader);
if (delDocs != null) {
value = mergeDeletes(delDocs, value);
}
}
}
}
return value;
}
protected abstract T mergeDeletes(Bits delDocs, T value);
public synchronized void put(Object coreKey, Object delCoreKey, T value) {
if (deletesMode == DeletesMode.IGNORE) {
cache.put(coreKey, value);
} else if (deletesMode == DeletesMode.RECACHE) {
cache.put(delCoreKey, value);
} else {
cache.put(coreKey, value);
cache.put(delCoreKey, value);
}
}
}
/**
* New deletes are ignored by default, which gives higher
* cache hit rate on reopened readers. Most of the time
* this is safe, because the filter will be AND'd with a
* Query that fully enforces deletions. If instead you
* need this filter to always enforce deletions, pass
* either {@link DeletesMode#RECACHE} or {@link
* DeletesMode#DYNAMIC}.
* @param filter Filter to cache results of
*/
public CachingWrapperFilter(Filter filter) {
this(filter, DeletesMode.IGNORE);
}
/**
* Expert: by default, the cached filter will be shared
* across reopened segments that only had changes to their
* deletions.
*
* @param filter Filter to cache results of
* @param deletesMode See {@link DeletesMode}
*/
public CachingWrapperFilter(Filter filter, DeletesMode deletesMode) {
this.filter = filter;
cache = new FilterCache<DocIdSet>(deletesMode) {
@Override
public DocIdSet mergeDeletes(final Bits delDocs, final DocIdSet docIdSet) {
return new FilteredDocIdSet(docIdSet) {
@Override
protected boolean match(int docID) {
return !delDocs.get(docID);
}
};
}
};
}
/** Provide the DocIdSet to be cached, using the DocIdSet provided
@ -66,29 +181,29 @@ public class CachingWrapperFilter extends Filter {
return (it == null) ? DocIdSet.EMPTY_DOCIDSET : new OpenBitSetDISI(it, reader.maxDoc());
}
}
// for testing
int hitCount, missCount;
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
lock.lock();
try {
if (cache == null) {
cache = new WeakHashMap<IndexReader,DocIdSet>();
}
final DocIdSet cached = cache.get(reader);
if (cached != null) return cached;
} finally {
lock.unlock();
final Object coreKey = reader.getCoreCacheKey();
final Object delCoreKey = reader.hasDeletions() ? MultiFields.getDeletedDocs(reader) : coreKey;
DocIdSet docIdSet = cache.get(reader, coreKey, delCoreKey);
if (docIdSet != null) {
hitCount++;
return docIdSet;
}
final DocIdSet docIdSet = docIdSetToCache(filter.getDocIdSet(reader), reader);
missCount++;
// cache miss
docIdSet = docIdSetToCache(filter.getDocIdSet(reader), reader);
if (docIdSet != null) {
lock.lock();
try {
cache.put(reader, docIdSet);
} finally {
lock.unlock();
}
cache.put(coreKey, delCoreKey, docIdSet);
}
return docIdSet;
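Putting the pieces above together, a hedged end-to-end sketch of the trade-off the DeletesMode javadoc describes (the near-real-time setup mirrors the tests added later in this commit; field and term names are illustrative):

// Assume 'writer' is an open IndexWriter; use its near-real-time reader.
IndexReader reader = writer.getReader();
IndexSearcher searcher = new IndexSearcher(reader);

Filter filter = new CachingWrapperFilter(
    new QueryWrapperFilter(new TermQuery(new Term("id", "1"))),
    CachingWrapperFilter.DeletesMode.IGNORE);

// Safe with IGNORE: the main query's scorer already skips deleted docs,
// and the cached filter only narrows the result further.
TopDocs hits = searcher.search(new TermQuery(new Term("body", "lucene")), filter, 10);

// Risky with IGNORE: a top-level ConstantScoreQuery over the filter has no other
// clause that rules out deletions, so stale (deleted) docs can be returned after
// a reopen. Use RECACHE or DYNAMIC if the filter must enforce deletions itself.
TopDocs filterOnly = searcher.search(new ConstantScoreQuery(filter), 10);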

View File: FieldCacheImpl.java

@ -154,7 +154,7 @@ class FieldCacheImpl implements FieldCache {
/** Remove this reader from the cache, if present. */
public void purge(IndexReader r) {
Object readerKey = r.getFieldCacheKey();
Object readerKey = r.getCoreCacheKey();
synchronized(readerCache) {
readerCache.remove(readerKey);
}
@ -163,7 +163,7 @@ class FieldCacheImpl implements FieldCache {
public Object get(IndexReader reader, Entry key) throws IOException {
Map<Entry,Object> innerCache;
Object value;
final Object readerKey = reader.getFieldCacheKey();
final Object readerKey = reader.getCoreCacheKey();
synchronized (readerCache) {
innerCache = readerCache.get(readerKey);
if (innerCache == null) {

View File: IndexSearcher.java

@ -162,7 +162,7 @@ public class IndexSearcher extends Searcher {
throw new IllegalArgumentException("nDocs must be > 0");
}
nDocs = Math.min(nDocs, reader.numDocs());
nDocs = Math.min(nDocs, reader.maxDoc());
TopScoreDocCollector collector = TopScoreDocCollector.create(nDocs, !weight.scoresDocsOutOfOrder());
search(weight, filter, collector);
@ -190,7 +190,7 @@ public class IndexSearcher extends Searcher {
Sort sort, boolean fillFields)
throws IOException {
nDocs = Math.min(nDocs, reader.numDocs());
nDocs = Math.min(nDocs, reader.maxDoc());
TopFieldCollector collector = TopFieldCollector.create(sort, nDocs,
fillFields, fieldSortDoTrackScores, fieldSortDoMaxScore, !weight.scoresDocsOutOfOrder());
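This change appears to go hand in hand with DeletesMode.IGNORE: in the tests below, the only document in the index gets deleted, so reader.numDocs() drops to 0 while a ConstantScoreQuery over an IGNORE-mode filter still returns that stale hit. Capping the requested hit count at numDocs() would leave no room for it, hence the switch to maxDoc(). A sketch with hypothetical values:

int requested = 1;
int live = reader.numDocs();           // 0 after the lone doc is deleted
int max  = reader.maxDoc();            // 1; doc IDs still range over [0, maxDoc())
int nDocs = Math.min(requested, max);  // 1 -- the old Math.min(requested, live) would be 0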

View File: FieldCacheSanityChecker.java

@ -274,7 +274,7 @@ public final class FieldCacheSanityChecker {
if (obj instanceof IndexReader) {
IndexReader[] subs = ((IndexReader)obj).getSequentialSubReaders();
for (int j = 0; (null != subs) && (j < subs.length); j++) {
all.add(subs[j].getFieldCacheKey());
all.add(subs[j].getCoreCacheKey());
}
}

View File: CachingWrapperFilterHelper.java

@ -18,7 +18,6 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.WeakHashMap;
import junit.framework.Assert;
import org.apache.lucene.index.IndexReader;
@ -42,30 +41,18 @@ public class CachingWrapperFilterHelper extends CachingWrapperFilter {
}
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
if (cache == null) {
cache = new WeakHashMap<IndexReader,DocIdSet>();
}
synchronized (cache) { // check cache
DocIdSet cached = cache.get(reader);
if (shouldHaveCache) {
Assert.assertNotNull("Cache should have data ", cached);
} else {
Assert.assertNull("Cache should be null " + cached , cached);
}
if (cached != null) {
return cached;
}
public synchronized DocIdSet getDocIdSet(IndexReader reader) throws IOException {
final int saveMissCount = missCount;
DocIdSet docIdSet = super.getDocIdSet(reader);
if (shouldHaveCache) {
Assert.assertEquals("Cache should have data ", saveMissCount, missCount);
} else {
Assert.assertTrue("Cache should be null " + docIdSet, missCount > saveMissCount);
}
final DocIdSet bits = filter.getDocIdSet(reader);
synchronized (cache) { // update cache
cache.put(reader, bits);
}
return bits;
return docIdSet;
}
@Override

View File: TestCachingSpanFilter.java

@ -0,0 +1,123 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
public class TestCachingSpanFilter extends LuceneTestCase {
public void testEnforceDeletions() throws Exception {
Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
IndexReader reader = writer.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
// add a doc, refresh the reader, and check that it's there
Document doc = new Document();
doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
TopDocs docs = searcher.search(new MatchAllDocsQuery(), 1);
assertEquals("Should find a hit...", 1, docs.totalHits);
final SpanFilter startFilter = new SpanQueryFilter(new SpanTermQuery(new Term("id", "1")));
// ignore deletions
CachingSpanFilter filter = new CachingSpanFilter(startFilter, CachingWrapperFilter.DeletesMode.IGNORE);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should find a hit...", 1, docs.totalHits);
ConstantScoreQuery constantScore = new ConstantScoreQuery(filter);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
// now delete the doc, refresh the reader, and see that it's not there
writer.deleteDocuments(new Term("id", "1"));
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should *not* find a hit...", 0, docs.totalHits);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
// force cache to regenerate:
filter = new CachingSpanFilter(startFilter, CachingWrapperFilter.DeletesMode.RECACHE);
writer.addDocument(doc);
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should find a hit...", 1, docs.totalHits);
constantScore = new ConstantScoreQuery(filter);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
// make sure we get a cache hit when we reopen readers
// that had no new deletions
IndexReader newReader = refreshReader(reader);
assertTrue(reader != newReader);
reader = newReader;
searcher = new IndexSearcher(reader);
int missCount = filter.missCount;
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
assertEquals(missCount, filter.missCount);
// now delete the doc, refresh the reader, and see that it's not there
writer.deleteDocuments(new Term("id", "1"));
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should *not* find a hit...", 0, docs.totalHits);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should *not* find a hit...", 0, docs.totalHits);
}
private static IndexReader refreshReader(IndexReader reader) throws IOException {
IndexReader oldReader = reader;
reader = reader.reopen();
if (reader != oldReader) {
oldReader.close();
}
return reader;
}
}

View File: TestCachingWrapperFilter.java

@ -24,8 +24,11 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetDISI;
@ -143,4 +146,123 @@ public class TestCachingWrapperFilter extends LuceneTestCase {
reader.close();
}
public void testEnforceDeletions() throws Exception {
Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
IndexReader reader = writer.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
// add a doc, refresh the reader, and check that it's there
Document doc = new Document();
doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
TopDocs docs = searcher.search(new MatchAllDocsQuery(), 1);
assertEquals("Should find a hit...", 1, docs.totalHits);
final Filter startFilter = new QueryWrapperFilter(new TermQuery(new Term("id", "1")));
// ignore deletions
CachingWrapperFilter filter = new CachingWrapperFilter(startFilter, CachingWrapperFilter.DeletesMode.IGNORE);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should find a hit...", 1, docs.totalHits);
ConstantScoreQuery constantScore = new ConstantScoreQuery(filter);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
// now delete the doc, refresh the reader, and see that it's not there
writer.deleteDocuments(new Term("id", "1"));
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should *not* find a hit...", 0, docs.totalHits);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
// force cache to regenerate:
filter = new CachingWrapperFilter(startFilter, CachingWrapperFilter.DeletesMode.RECACHE);
writer.addDocument(doc);
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should find a hit...", 1, docs.totalHits);
constantScore = new ConstantScoreQuery(filter);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
// make sure we get a cache hit when we reopen a reader
// that had no change to its deletions
IndexReader newReader = refreshReader(reader);
assertTrue(reader != newReader);
reader = newReader;
searcher = new IndexSearcher(reader);
int missCount = filter.missCount;
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
assertEquals(missCount, filter.missCount);
// now delete the doc, refresh the reader, and see that it's not there
writer.deleteDocuments(new Term("id", "1"));
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
missCount = filter.missCount;
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals(missCount+1, filter.missCount);
assertEquals("[query + filter] Should *not* find a hit...", 0, docs.totalHits);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should *not* find a hit...", 0, docs.totalHits);
// apply deletions dynamically
filter = new CachingWrapperFilter(startFilter, CachingWrapperFilter.DeletesMode.DYNAMIC);
writer.addDocument(doc);
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should find a hit...", 1, docs.totalHits);
constantScore = new ConstantScoreQuery(filter);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
// now delete the doc, refresh the reader, and see that it's not there
writer.deleteDocuments(new Term("id", "1"));
reader = refreshReader(reader);
searcher = new IndexSearcher(reader);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should *not* find a hit...", 0, docs.totalHits);
missCount = filter.missCount;
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should *not* find a hit...", 0, docs.totalHits);
// doesn't count as a miss
assertEquals(missCount, filter.missCount);
}
private static IndexReader refreshReader(IndexReader reader) throws IOException {
IndexReader oldReader = reader;
reader = reader.reopen();
if (reader != oldReader) {
oldReader.close();
}
return reader;
}
}

View File: SolrIndexReader.java

@ -523,8 +523,8 @@ public class SolrIndexReader extends FilterIndexReader {
}
@Override
public Object getFieldCacheKey() {
return in.getFieldCacheKey();
public Object getCoreCacheKey() {
return in.getCoreCacheKey();
}
@Override