From 8934cbd804efb7372a802b66ea0adea3bddd62f5 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Thu, 17 Sep 2009 12:02:46 +0000 Subject: [PATCH] LUCENE-1911: Add a new DocIdSet.isCacheable() method that defaults to false, but is true for some DocIdSet impls, that are effective and use no disk I/O during iteration (OpenBitSet, SortedVIntList, DocIdBitSet, EMPTY_DOCIDSET, some FieldCache filter impls). CachingWrapperFilter now copies all DocIdSets, which are not cacheable, into an OpenBitSetDISI for caching. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@816154 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 8 +++ .../lucene/search/CachingWrapperFilter.java | 17 ++++- .../org/apache/lucene/search/DocIdSet.java | 15 +++++ .../lucene/search/FieldCacheRangeFilter.java | 7 ++- .../lucene/search/FieldCacheTermsFilter.java | 5 ++ .../lucene/search/FilteredDocIdSet.java | 5 ++ .../lucene/search/QueryWrapperFilter.java | 1 + .../org/apache/lucene/util/DocIdBitSet.java | 5 ++ .../org/apache/lucene/util/OpenBitSet.java | 5 ++ .../apache/lucene/util/SortedVIntList.java | 5 ++ .../search/TestCachingWrapperFilter.java | 49 +++++++++++++++ .../search/TestFieldCacheRangeFilter.java | 63 +++++++++++++++++-- .../lucene/search/TestQueryWrapperFilter.java | 13 ++++ 13 files changed, 188 insertions(+), 10 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 44af5bde6ac..3d1da042652 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -434,6 +434,14 @@ API Changes NativeFSLockFactory, we strongly recommend not to mix deprecated and new API. (Uwe Schindler, Mike McCandless) + * LUCENE-1911: Added a new method isCacheable() to DocIdSet. This method + should return true, if the underlying implementation does not use disk + I/O and is fast enough to be directly cached by CachingWrapperFilter. + OpenBitSet, SortedVIntList, and DocIdBitSet are such candidates. + The default implementation of the abstract DocIdSet class returns false. + In this case, CachingWrapperFilter copies the DocIdSetIterator into an + OpenBitSet for caching. (Uwe Schindler, Thomas Becker) + Bug fixes * LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals() diff --git a/src/java/org/apache/lucene/search/CachingWrapperFilter.java b/src/java/org/apache/lucene/search/CachingWrapperFilter.java index 3b01371325b..bfef2a64078 100644 --- a/src/java/org/apache/lucene/search/CachingWrapperFilter.java +++ b/src/java/org/apache/lucene/search/CachingWrapperFilter.java @@ -19,6 +19,7 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.DocIdBitSet; +import org.apache.lucene.util.OpenBitSetDISI; import java.util.BitSet; import java.util.WeakHashMap; import java.util.Map; @@ -75,10 +76,20 @@ public class CachingWrapperFilter extends Filter { /** Provide the DocIdSet to be cached, using the DocIdSet provided * by the wrapped Filter. - * This implementation returns the given DocIdSet. + *

This implementation returns the given {@link DocIdSet}, if {@link DocIdSet#isCacheable} + * returns true, else it copies the {@link DocIdSetIterator} into + * an {@link OpenBitSetDISI}. */ - protected DocIdSet docIdSetToCache(DocIdSet docIdSet, IndexReader reader) { - return docIdSet; + protected DocIdSet docIdSetToCache(DocIdSet docIdSet, IndexReader reader) throws IOException { + if (docIdSet.isCacheable()) { + return docIdSet; + } else { + final DocIdSetIterator it = docIdSet.iterator(); + // null is allowed to be returned by iterator(), + // in this case we wrap with the empty set, + // which is cacheable. + return (it == null) ? DocIdSet.EMPTY_DOCIDSET : new OpenBitSetDISI(it, reader.maxDoc()); + } } public DocIdSet getDocIdSet(IndexReader reader) throws IOException { diff --git a/src/java/org/apache/lucene/search/DocIdSet.java b/src/java/org/apache/lucene/search/DocIdSet.java index 02199856990..c6a9b6d7347 100644 --- a/src/java/org/apache/lucene/search/DocIdSet.java +++ b/src/java/org/apache/lucene/search/DocIdSet.java @@ -37,6 +37,10 @@ public abstract class DocIdSet { public DocIdSetIterator iterator() { return iterator; } + + public boolean isCacheable() { + return true; + } }; /** Provides a {@link DocIdSetIterator} to access the set. @@ -44,4 +48,15 @@ public abstract class DocIdSet { * {@linkplain #EMPTY_DOCIDSET}.iterator() if there * are no docs that match. */ public abstract DocIdSetIterator iterator() throws IOException; + + /** + * This method is a hint for {@link CachingWrapperFilter}, if this DocIdSet + * should be cached without copying it into a BitSet. The default is to return + * false. If you have an own DocIdSet implementation + * that does its iteration very effective and fast without doing disk I/O, + * override this method and return true. + */ + public boolean isCacheable() { + return false; + } } diff --git a/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java b/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java index 9d80af5775a..52415d51429 100644 --- a/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java +++ b/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java @@ -476,6 +476,11 @@ public abstract class FieldCacheRangeFilter extends Filter { /** this method checks, if a doc is a hit, should throw AIOBE, when position invalid */ abstract boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException; + + /** this DocIdSet is cacheable, if it works solely with FieldCache and no TermDocs */ + public boolean isCacheable() { + return !(mayUseTermDocs && reader.hasDeletions()); + } public DocIdSetIterator iterator() throws IOException { // Synchronization needed because deleted docs BitVector @@ -484,7 +489,7 @@ public abstract class FieldCacheRangeFilter extends Filter { // and the index has deletions final TermDocs termDocs; synchronized(reader) { - termDocs = (mayUseTermDocs && reader.hasDeletions()) ? reader.termDocs(null) : null; + termDocs = isCacheable() ? null : reader.termDocs(null); } if (termDocs != null) { // a DocIdSetIterator using TermDocs to iterate valid docIds diff --git a/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java b/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java index 89dc8b6b486..0e0faef93f3 100644 --- a/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java +++ b/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java @@ -130,6 +130,11 @@ public class FieldCacheTermsFilter extends Filter { return new FieldCacheTermsFilterDocIdSetIterator(); } + /** This DocIdSet implementation is cacheable. */ + public boolean isCacheable() { + return true; + } + protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator { private int doc = -1; diff --git a/src/java/org/apache/lucene/search/FilteredDocIdSet.java b/src/java/org/apache/lucene/search/FilteredDocIdSet.java index 7a2c1228fd0..ded83a81ea3 100644 --- a/src/java/org/apache/lucene/search/FilteredDocIdSet.java +++ b/src/java/org/apache/lucene/search/FilteredDocIdSet.java @@ -49,6 +49,11 @@ public abstract class FilteredDocIdSet extends DocIdSet { _innerSet = innerSet; } + /** This DocIdSet implementation is cacheable if the inner set is cacheable. */ + public boolean isCacheable() { + return _innerSet.isCacheable(); + } + /** * Validation method to determine whether a docid should be in the result set. * @param docid docid to be tested diff --git a/src/java/org/apache/lucene/search/QueryWrapperFilter.java b/src/java/org/apache/lucene/search/QueryWrapperFilter.java index 0b46d72f8cc..8d4f44fbacd 100644 --- a/src/java/org/apache/lucene/search/QueryWrapperFilter.java +++ b/src/java/org/apache/lucene/search/QueryWrapperFilter.java @@ -74,6 +74,7 @@ public class QueryWrapperFilter extends Filter { public DocIdSetIterator iterator() throws IOException { return weight.scorer(reader, true, false); } + public boolean isCacheable() { return false; } }; } diff --git a/src/java/org/apache/lucene/util/DocIdBitSet.java b/src/java/org/apache/lucene/util/DocIdBitSet.java index 334300271b3..41bb5f2bc4a 100644 --- a/src/java/org/apache/lucene/util/DocIdBitSet.java +++ b/src/java/org/apache/lucene/util/DocIdBitSet.java @@ -34,6 +34,11 @@ public class DocIdBitSet extends DocIdSet { public DocIdSetIterator iterator() { return new DocIdBitSetIterator(bitSet); } + + /** This DocIdSet implementation is cacheable. */ + public boolean isCacheable() { + return true; + } /** * Returns the underlying BitSet. diff --git a/src/java/org/apache/lucene/util/OpenBitSet.java b/src/java/org/apache/lucene/util/OpenBitSet.java index 1fc10122c01..46eb0237f6a 100644 --- a/src/java/org/apache/lucene/util/OpenBitSet.java +++ b/src/java/org/apache/lucene/util/OpenBitSet.java @@ -116,6 +116,11 @@ public class OpenBitSet extends DocIdSet implements Cloneable, Serializable { return new OpenBitSetIterator(bits, wlen); } + /** This DocIdSet implementation is cacheable. */ + public boolean isCacheable() { + return true; + } + /** Returns the current capacity in bits (1 greater than the index of the last bit) */ public long capacity() { return bits.length << 6; } diff --git a/src/java/org/apache/lucene/util/SortedVIntList.java b/src/java/org/apache/lucene/util/SortedVIntList.java index 55c21aca8c2..0f764376e27 100644 --- a/src/java/org/apache/lucene/util/SortedVIntList.java +++ b/src/java/org/apache/lucene/util/SortedVIntList.java @@ -180,6 +180,11 @@ public class SortedVIntList extends DocIdSet { return bytes.length; } + /** This DocIdSet implementation is cacheable. */ + public boolean isCacheable() { + return true; + } + /** * @return An iterator over the sorted integers. */ diff --git a/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java b/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java index cd41a319687..12fc24db134 100644 --- a/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java +++ b/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java @@ -18,12 +18,18 @@ package org.apache.lucene.search; */ import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.OpenBitSetDISI; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import java.io.IOException; +import java.util.BitSet; + public class TestCachingWrapperFilter extends LuceneTestCase { public void testCachingWorks() throws Exception { Directory dir = new RAMDirectory(); @@ -50,4 +56,47 @@ public class TestCachingWrapperFilter extends LuceneTestCase { reader.close(); } + + private static void assertDocIdSetCacheable(IndexReader reader, Filter filter, boolean shouldCacheable) throws IOException { + final CachingWrapperFilter cacher = new CachingWrapperFilter(filter); + final DocIdSet originalSet = filter.getDocIdSet(reader); + final DocIdSet cachedSet = cacher.getDocIdSet(reader); + assertTrue(cachedSet.isCacheable()); + assertEquals(shouldCacheable, originalSet.isCacheable()); + //System.out.println("Original: "+originalSet.getClass().getName()+" -- cached: "+cachedSet.getClass().getName()); + if (originalSet.isCacheable()) { + assertEquals("Cached DocIdSet must be of same class like uncached, if cacheable", originalSet.getClass(), cachedSet.getClass()); + } else { + assertTrue("Cached DocIdSet must be an OpenBitSet if the original one was not cacheable", cachedSet instanceof OpenBitSetDISI); + } + } + + public void testIsCacheAble() throws Exception { + Directory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); + writer.close(); + + IndexReader reader = IndexReader.open(dir); + + // not cacheable: + assertDocIdSetCacheable(reader, new QueryWrapperFilter(new TermQuery(new Term("test","value"))), false); + // returns default empty docidset, always cacheable: + assertDocIdSetCacheable(reader, NumericRangeFilter.newIntRange("test", new Integer(10000), new Integer(-10000), true, true), true); + // is cacheable: + assertDocIdSetCacheable(reader, FieldCacheRangeFilter.newIntRange("test", new Integer(10), new Integer(20), true, true), true); + // a openbitset filter is always cacheable + assertDocIdSetCacheable(reader, new Filter() { + public DocIdSet getDocIdSet(IndexReader reader) { + return new OpenBitSet(); + } + }, true); + // a deprecated filter is always cacheable + assertDocIdSetCacheable(reader, new Filter() { + public BitSet bits(IndexReader reader) { + return new BitSet(); + } + }, true); + + reader.close(); + } } diff --git a/src/test/org/apache/lucene/search/TestFieldCacheRangeFilter.java b/src/test/org/apache/lucene/search/TestFieldCacheRangeFilter.java index a084b416910..55d71842998 100644 --- a/src/test/org/apache/lucene/search/TestFieldCacheRangeFilter.java +++ b/src/test/org/apache/lucene/search/TestFieldCacheRangeFilter.java @@ -66,8 +66,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter { Query q = new TermQuery(new Term("body","body")); // test id, bounded on both ends - - result = search.search(q,FieldCacheRangeFilter.newStringRange("id",minIP,maxIP,T,T), numDocs).scoreDocs; + FieldCacheRangeFilter fcrf; + result = search.search(q,fcrf = FieldCacheRangeFilter.newStringRange("id",minIP,maxIP,T,T), numDocs).scoreDocs; + assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); assertEquals("find all", numDocs, result.length); result = search.search(q,FieldCacheRangeFilter.newStringRange("id",minIP,maxIP,T,F), numDocs).scoreDocs; @@ -212,8 +213,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter { Query q = new TermQuery(new Term("body","body")); // test id, bounded on both ends - - result = search.search(q,FieldCacheRangeFilter.newShortRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs; + FieldCacheRangeFilter fcrf; + result = search.search(q,fcrf=FieldCacheRangeFilter.newShortRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs; + assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); assertEquals("find all", numDocs, result.length); result = search.search(q,FieldCacheRangeFilter.newShortRange("id",minIdO,maxIdO,T,F), numDocs).scoreDocs; @@ -303,7 +305,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter { // test id, bounded on both ends - result = search.search(q,FieldCacheRangeFilter.newIntRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs; + FieldCacheRangeFilter fcrf; + result = search.search(q,fcrf=FieldCacheRangeFilter.newIntRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs; + assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); assertEquals("find all", numDocs, result.length); result = search.search(q,FieldCacheRangeFilter.newIntRange("id",minIdO,maxIdO,T,F), numDocs).scoreDocs; @@ -393,7 +397,9 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter { // test id, bounded on both ends - result = search.search(q,FieldCacheRangeFilter.newLongRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs; + FieldCacheRangeFilter fcrf; + result = search.search(q,fcrf=FieldCacheRangeFilter.newLongRange("id",minIdO,maxIdO,T,T), numDocs).scoreDocs; + assertTrue(fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); assertEquals("find all", numDocs, result.length); result = search.search(q,FieldCacheRangeFilter.newLongRange("id",minIdO,maxIdO,T,F), numDocs).scoreDocs; @@ -523,4 +529,49 @@ public class TestFieldCacheRangeFilter extends BaseTestRangeFilter { assertEquals("infinity special case", 0, result.length); } + // test using a sparse index (with deleted docs). The DocIdSet should be not cacheable, as it uses TermDocs if the range contains 0 + public void testSparseIndex() throws IOException { + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), T, IndexWriter.MaxFieldLength.LIMITED); + + for (int d = -20; d <= 20; d++) { + Document doc = new Document(); + doc.add(new Field("id",Integer.toString(d), Field.Store.NO, Field.Index.NOT_ANALYZED)); + doc.add(new Field("body","body", Field.Store.NO, Field.Index.NOT_ANALYZED)); + writer.addDocument(doc); + } + + writer.optimize(); + writer.deleteDocuments(new Term("id","0")); + writer.close(); + + IndexReader reader = IndexReader.open(dir); + IndexSearcher search = new IndexSearcher(reader); + assertTrue(reader.hasDeletions()); + + ScoreDoc[] result; + FieldCacheRangeFilter fcrf; + Query q = new TermQuery(new Term("body","body")); + + result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) -20),new Byte((byte) 20),T,T), 100).scoreDocs; + assertFalse("DocIdSet must be not cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); + assertEquals("find all", 40, result.length); + + result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) 0),new Byte((byte) 20),T,T), 100).scoreDocs; + assertFalse("DocIdSet must be not cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); + assertEquals("find all", 20, result.length); + + result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) -20),new Byte((byte) 0),T,T), 100).scoreDocs; + assertFalse("DocIdSet must be not cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); + assertEquals("find all", 20, result.length); + + result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) 10),new Byte((byte) 20),T,T), 100).scoreDocs; + assertTrue("DocIdSet must be cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); + assertEquals("find all", 11, result.length); + + result = search.search(q,fcrf=FieldCacheRangeFilter.newByteRange("id",new Byte((byte) -20),new Byte((byte) -10),T,T), 100).scoreDocs; + assertTrue("DocIdSet must be cacheable", fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable()); + assertEquals("find all", 11, result.length); + } + } diff --git a/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java b/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java index c55e40eef6f..c8e7c602e2f 100644 --- a/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java +++ b/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java @@ -48,6 +48,8 @@ public class TestQueryWrapperFilter extends LuceneTestCase { IndexSearcher searcher = new IndexSearcher(dir, true); TopDocs hits = searcher.search(new MatchAllDocsQuery(), qwf, 10); assertEquals(1, hits.totalHits); + hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10); + assertEquals(1, hits.totalHits); // should not throw exception with complex primitive query BooleanQuery booleanQuery = new BooleanQuery(); @@ -58,6 +60,8 @@ public class TestQueryWrapperFilter extends LuceneTestCase { hits = searcher.search(new MatchAllDocsQuery(), qwf, 10); assertEquals(1, hits.totalHits); + hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10); + assertEquals(1, hits.totalHits); // should not throw exception with non primitive Query (doesn't implement // Query#createWeight) @@ -65,6 +69,15 @@ public class TestQueryWrapperFilter extends LuceneTestCase { hits = searcher.search(new MatchAllDocsQuery(), qwf, 10); assertEquals(1, hits.totalHits); + hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10); + assertEquals(1, hits.totalHits); + // test a query with no hits + termQuery = new TermQuery(new Term("field", "not_exist")); + qwf = new QueryWrapperFilter(termQuery); + hits = searcher.search(new MatchAllDocsQuery(), qwf, 10); + assertEquals(0, hits.totalHits); + hits = searcher.search(new MatchAllDocsQuery(), new CachingWrapperFilter(qwf), 10); + assertEquals(0, hits.totalHits); } }