LUCENE-1911: Add a new DocIdSet.isCacheable() method that defaults to false, but is true for some DocIdSet impls that are efficient and use no disk I/O during iteration (OpenBitSet, SortedVIntList, DocIdBitSet, EMPTY_DOCIDSET, some FieldCache filter impls). CachingWrapperFilter now copies all DocIdSets that are not cacheable into an OpenBitSetDISI for caching.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@816154 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2009-09-17 12:02:46 +00:00
parent dac29b3f49
commit 8934cbd804
13 changed files with 188 additions and 10 deletions


@@ -434,6 +434,14 @@ API Changes
NativeFSLockFactory, we strongly recommend not to mix deprecated
and new API. (Uwe Schindler, Mike McCandless)
* LUCENE-1911: Added a new method isCacheable() to DocIdSet. This method
should return true if the underlying implementation does not use disk
I/O and is fast enough to be directly cached by CachingWrapperFilter.
OpenBitSet, SortedVIntList, and DocIdBitSet are such candidates.
The default implementation of the abstract DocIdSet class returns false.
In this case, CachingWrapperFilter copies the DocIdSetIterator into an
OpenBitSet for caching. (Uwe Schindler, Thomas Becker)
Bug fixes
* LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
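
In code terms, the copy step described in the LUCENE-1911 entry above reduces to the following (condensed from the CachingWrapperFilter diff further below; docIdSet and reader are that method's parameters):

DocIdSetIterator it = docIdSet.iterator();
DocIdSet cacheCopy = (it == null)
    ? DocIdSet.EMPTY_DOCIDSET                  // iterator() may return null; the empty set is cacheable
    : new OpenBitSetDISI(it, reader.maxDoc()); // in-memory copy of the matching doc ids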


@@ -19,6 +19,7 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.DocIdBitSet;
import org.apache.lucene.util.OpenBitSetDISI;
import java.util.BitSet;
import java.util.WeakHashMap;
import java.util.Map;
@@ -75,10 +76,20 @@ public class CachingWrapperFilter extends Filter {
/** Provide the DocIdSet to be cached, using the DocIdSet provided
* by the wrapped Filter.
* This implementation returns the given DocIdSet.
* <p>This implementation returns the given {@link DocIdSet} if {@link DocIdSet#isCacheable}
* returns <code>true</code>; otherwise it copies the {@link DocIdSetIterator} into
* an {@link OpenBitSetDISI}.
*/
protected DocIdSet docIdSetToCache(DocIdSet docIdSet, IndexReader reader) {
return docIdSet;
protected DocIdSet docIdSetToCache(DocIdSet docIdSet, IndexReader reader) throws IOException {
if (docIdSet.isCacheable()) {
return docIdSet;
} else {
final DocIdSetIterator it = docIdSet.iterator();
// null is allowed to be returned by iterator(),
// in this case we wrap with the empty set,
// which is cacheable.
return (it == null) ? DocIdSet.EMPTY_DOCIDSET : new OpenBitSetDISI(it, reader.maxDoc());
}
}
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
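
From a caller's perspective the new logic is transparent; here is a minimal usage sketch (not part of this commit; the searcher and the field/term are assumed):

// QueryWrapperFilter produces a non-cacheable DocIdSet (its iterator is a
// Scorer that reads postings), so CachingWrapperFilter copies it into an
// OpenBitSetDISI on first use and answers later searches on the same
// reader from that in-memory copy (cached per reader in a WeakHashMap).
Filter slow = new QueryWrapperFilter(new TermQuery(new Term("field", "value")));
Filter cached = new CachingWrapperFilter(slow);
TopDocs first = searcher.search(new MatchAllDocsQuery(), cached, 10); // builds and caches the bit set
TopDocs again = searcher.search(new MatchAllDocsQuery(), cached, 10); // served from the cache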


@@ -37,6 +37,10 @@ public abstract class DocIdSet {
public DocIdSetIterator iterator() {
return iterator;
}
public boolean isCacheable() {
return true;
}
};
/** Provides a {@link DocIdSetIterator} to access the set.
@@ -44,4 +48,15 @@
* <code>{@linkplain #EMPTY_DOCIDSET}.iterator()</code> if there
* are no docs that match. */
public abstract DocIdSetIterator iterator() throws IOException;
/**
* This method is a hint for {@link CachingWrapperFilter} whether this <code>DocIdSet</code>
* should be cached without copying it into a BitSet. The default is to return
* <code>false</code>. If you have your own <code>DocIdSet</code> implementation
* that iterates very efficiently without doing disk I/O,
* override this method and return <code>true</code>.
*/
public boolean isCacheable() {
return false;
}
}
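
For illustration, a minimal sketch (not part of this commit) of a custom DocIdSet that may legitimately return true here; it assumes the 2.9 DocIdSetIterator API (docID()/nextDoc()/advance() and NO_MORE_DOCS), and the class name and int[] backing are invented:

import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

// A DocIdSet over a sorted array of doc ids: iteration is pure
// in-memory work, so advertising cacheability is safe.
public class IntArrayDocIdSet extends DocIdSet {
  private final int[] docs; // sorted, distinct doc ids

  public IntArrayDocIdSet(int[] docs) { this.docs = docs; }

  public boolean isCacheable() {
    return true; // no disk I/O during iteration
  }

  public DocIdSetIterator iterator() throws IOException {
    return new DocIdSetIterator() {
      private int i = -1;

      public int docID() {
        if (i < 0) return -1;
        return i < docs.length ? docs[i] : NO_MORE_DOCS;
      }

      public int nextDoc() {
        return ++i < docs.length ? docs[i] : NO_MORE_DOCS;
      }

      public int advance(int target) {
        while (++i < docs.length) {
          if (docs[i] >= target) return docs[i];
        }
        return NO_MORE_DOCS;
      }
    };
  }
}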


@@ -476,6 +476,11 @@ public abstract class FieldCacheRangeFilter extends Filter {
/** this method checks if a doc is a hit; it should throw AIOBE when the position is invalid */
abstract boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException;
/** this DocIdSet is cacheable if it works solely with the FieldCache and uses no TermDocs */
public boolean isCacheable() {
return !(mayUseTermDocs && reader.hasDeletions());
}
public DocIdSetIterator iterator() throws IOException {
// Synchronization needed because deleted docs BitVector
@@ -484,7 +489,7 @@ public abstract class FieldCacheRangeFilter extends Filter {
// and the index has deletions
final TermDocs termDocs;
synchronized(reader) {
termDocs = (mayUseTermDocs && reader.hasDeletions()) ? reader.termDocs(null) : null;
termDocs = isCacheable() ? null : reader.termDocs(null);
}
if (termDocs != null) {
// a DocIdSetIterator using TermDocs to iterate valid docIds
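
The same condition now drives both the TermDocs fallback and the isCacheable() answer, so callers can probe cacheability up front; a small sketch (not part of this commit; reader is assumed, the bounds are invented):

// A range that contains 0 on a reader with deletions falls back to
// TermDocs (documents without a value are cached as 0), so its
// DocIdSet reports isCacheable() == false.
Filter f = FieldCacheRangeFilter.newIntRange("id", new Integer(-5), new Integer(5), true, true);
DocIdSet set = f.getDocIdSet(reader);
if (set.isCacheable()) {
  // safe to hold on to directly
} else {
  // let CachingWrapperFilter copy it into an OpenBitSetDISI
}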


@@ -130,6 +130,11 @@ public class FieldCacheTermsFilter extends Filter {
return new FieldCacheTermsFilterDocIdSetIterator();
}
/** This DocIdSet implementation is cacheable. */
public boolean isCacheable() {
return true;
}
protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
private int doc = -1;


@@ -49,6 +49,11 @@ public abstract class FilteredDocIdSet extends DocIdSet {
_innerSet = innerSet;
}
/** This DocIdSet implementation is cacheable if the inner set is cacheable. */
public boolean isCacheable() {
return _innerSet.isCacheable();
}
/**
* Validation method to determine whether a docid should be in the result set.
* @param docid docid to be tested
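
Since match() merely post-filters the wrapped set and adds no I/O of its own, delegating the cacheability answer is the natural choice; a tiny sketch (not part of this commit; innerSet is assumed to exist):

// Keep only even doc ids; cacheability is inherited from innerSet.
DocIdSet evens = new FilteredDocIdSet(innerSet) {
  protected boolean match(int docid) {
    return (docid & 1) == 0;
  }
};
// evens.isCacheable() == innerSet.isCacheable()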


@@ -74,6 +74,7 @@ public class QueryWrapperFilter extends Filter {
public DocIdSetIterator iterator() throws IOException {
return weight.scorer(reader, true, false);
}
public boolean isCacheable() { return false; }
};
}


@@ -34,6 +34,11 @@ public class DocIdBitSet extends DocIdSet {
public DocIdSetIterator iterator() {
return new DocIdBitSetIterator(bitSet);
}
/** This DocIdSet implementation is cacheable. */
public boolean isCacheable() {
return true;
}
/**
* Returns the underlying BitSet.


@@ -116,6 +116,11 @@ public class OpenBitSet extends DocIdSet implements Cloneable, Serializable {
return new OpenBitSetIterator(bits, wlen);
}
/** This DocIdSet implementation is cacheable. */
public boolean isCacheable() {
return true;
}
/** Returns the current capacity in bits (1 greater than the index of the last bit) */
public long capacity() { return bits.length << 6; }


@@ -180,6 +180,11 @@ public class SortedVIntList extends DocIdSet {
return bytes.length;
}
/** This DocIdSet implementation is cacheable. */
public boolean isCacheable() {
return true;
}
/**
* @return An iterator over the sorted integers.
*/


@@ -18,12 +18,18 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetDISI;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.util.BitSet;
public class TestCachingWrapperFilter extends LuceneTestCase {
public void testCachingWorks() throws Exception {
Directory dir = new RAMDirectory();
@@ -50,4 +56,47 @@ public class TestCachingWrapperFilter extends LuceneTestCase {
reader.close();
}
private static void assertDocIdSetCacheable(IndexReader reader, Filter filter, boolean shouldCacheable) throws IOException {
final CachingWrapperFilter cacher = new CachingWrapperFilter(filter);
final DocIdSet originalSet = filter.getDocIdSet(reader);
final DocIdSet cachedSet = cacher.getDocIdSet(reader);
assertTrue(cachedSet.isCacheable());
assertEquals(shouldCacheable, originalSet.isCacheable());
//System.out.println("Original: "+originalSet.getClass().getName()+" -- cached: "+cachedSet.getClass().getName());
if (originalSet.isCacheable()) {
assertEquals("Cached DocIdSet must be of the same class as the uncached one, if cacheable", originalSet.getClass(), cachedSet.getClass());
} else {
assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI);
}
}
public void testIsCacheAble() throws Exception {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
writer.close();
IndexReader reader = IndexReader.open(dir);
// not cacheable:
assertDocIdSetCacheable(reader, new QueryWrapperFilter(new TermQuery(new Term("test","value"))), false);
// returns the default EMPTY_DOCIDSET, which is always cacheable:
assertDocIdSetCacheable(reader, NumericRangeFilter.newIntRange("test", new Integer(10000), new Integer(-10000), true, true), true);
// is cacheable:
assertDocIdSetCacheable(reader, FieldCacheRangeFilter.newIntRange("test", new Integer(10), new Integer(20), true, true), true);
// an OpenBitSet filter is always cacheable
assertDocIdSetCacheable(reader, new Filter() {
public DocIdSet getDocIdSet(IndexReader reader) {
return new OpenBitSet();
}
}, true);
// a Filter that only implements the deprecated bits() method is always cacheable
assertDocIdSetCacheable(reader, new Filter() {
public BitSet bits(IndexReader reader) {
return new BitSet();
}
}, true);
reader.close();
}
}


@@ -66,8 +66,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter {
Query q = new TermQuery(new Term("body","body"));
// test id, bounded on both ends
result = search.search(q,FieldCacheRangeFilter.newStringRange("id",minIP,maxIP,T,T), numDocs).scoreDocs;
FieldCacheRangeFilter fcrf;
result = search.search(q,fcrf = FieldCacheRangeFilter.newStringRange("id",minIP,maxIP,T,T), numDocs).scoreDocs;
assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", numDocs, result.length);
result = search.search(q,FieldCacheRangeFilter.newStringRange("id",minIP,maxIP,T,F), numDocs).scoreDocs;
@@ -212,8 +213,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter {
Query q = new TermQuery(new Term("body","body"));
// test id, bounded on both ends
result = search.search(q,FieldCacheRangeFilter.newShortRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs;
FieldCacheRangeFilter fcrf;
result = search.search(q,fcrf=FieldCacheRangeFilter.newShortRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs;
assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", numDocs, result.length);
result = search.search(q,FieldCacheRangeFilter.newShortRange("id",minIdO,maxIdO,T,F), numDocs).scoreDocs;
@@ -303,7 +305,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter {
// test id, bounded on both ends
result = search.search(q,FieldCacheRangeFilter.newIntRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs;
FieldCacheRangeFilter fcrf;
result = search.search(q,fcrf=FieldCacheRangeFilter.newIntRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs;
assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", numDocs, result.length);
result = search.search(q,FieldCacheRangeFilter.newIntRange("id",minIdO,maxIdO,T,F), numDocs).scoreDocs;
@@ -393,7 +397,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter {
// test id, bounded on both ends
result = search.search(q,FieldCacheRangeFilter.newLongRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs;
FieldCacheRangeFilter fcrf;
result = search.search(q,fcrf=FieldCacheRangeFilter.newLongRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs;
assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", numDocs, result.length);
result = search.search(q,FieldCacheRangeFilter.newLongRange("id",minIdO,maxIdO,T,F), numDocs).scoreDocs;
@@ -523,4 +529,49 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter {
assertEquals("infinity special case", 0, result.length);
}
// test using a sparse index (with deleted docs). The DocIdSet should not be cacheable, as it uses TermDocs if the range contains 0
public void testSparseIndex() throws IOException {
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), T, IndexWriter.MaxFieldLength.LIMITED);
for (int d = -20; d <= 20; d++) {
Document doc = new Document();
doc.add(new Field("id",Integer.toString(d), Field.Store.NO, Field.Index.NOT_ANALYZED));
doc.add(new Field("body","body", Field.Store.NO, Field.Index.NOT_ANALYZED));
writer.addDocument(doc);
}
writer.optimize();
writer.deleteDocuments(new Term("id","0"));
writer.close();
IndexReader reader = IndexReader.open(dir);
IndexSearcher search = new IndexSearcher(reader);
assertTrue(reader.hasDeletions());
ScoreDoc[] result;
FieldCacheRangeFilter fcrf;
Query q = new TermQuery(new Term("body","body"));
result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) -20),new Byte((byte) 20),T,T), 100).scoreDocs;
assertFalse("DocIdSet must be not cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", 40, result.length);
result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) 0),new Byte((byte) 20),T,T), 100).scoreDocs;
assertFalse("DocIdSet must be not cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", 20, result.length);
result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) -20),new Byte((byte) 0),T,T), 100).scoreDocs;
assertFalse("DocIdSet must be not cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", 20, result.length);
result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) 10),new Byte((byte) 20),T,T), 100).scoreDocs;
assertTrue("DocIdSet must be cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", 11, result.length);
result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) -20),new Byte((byte) -10),T,T), 100).scoreDocs;
assertTrue("DocIdSet must be cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
assertEquals("find all", 11, result.length);
}
}


@@ -48,6 +48,8 @@ public class TestQueryWrapperFilter extends LuceneTestCase {
IndexSearcher searcher = new IndexSearcher(dir, true);
TopDocs hits = searcher.search(new MatchAllDocsQuery(), qwf, 10);
assertEquals(1, hits.totalHits);
hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10);
assertEquals(1, hits.totalHits);
// should not throw exception with complex primitive query
BooleanQuery booleanQuery = new BooleanQuery();
@@ -58,6 +60,8 @@ public class TestQueryWrapperFilter extends LuceneTestCase {
hits = searcher.search(new MatchAllDocsQuery(), qwf, 10);
assertEquals(1, hits.totalHits);
hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10);
assertEquals(1, hits.totalHits);
// should not throw exception with non primitive Query (doesn't implement
// Query#createWeight)
@@ -65,6 +69,15 @@ public class TestQueryWrapperFilter extends LuceneTestCase {
hits = searcher.search(new MatchAllDocsQuery(), qwf, 10);
assertEquals(1, hits.totalHits);
hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10);
assertEquals(1, hits.totalHits);
// test a query with no hits
termQuery = new TermQuery(new Term("field", "not_exist"));
qwf = new QueryWrapperFilter(termQuery);
hits = searcher.search(new MatchAllDocsQuery(), qwf, 10);
assertEquals(0, hits.totalHits);
hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10);
assertEquals(0, hits.totalHits);
}
}