LUCENE-5101: Make it easier to plugin different bitset implementations to CachingWrapperFilter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1520525 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2013-09-06 10:07:12 +00:00
parent 81563c2f57
commit da16af7519
9 changed files with 213 additions and 34 deletions

View File

@ -267,6 +267,11 @@ Optimizations
* LUCENE-5188: CompressingStoredFieldsFormat now slices chunks containing big
documents into fixed-size blocks so that requesting a single field does not
necessarily force to decompress the whole chunk. (Adrien Grand)
* LUCENE-5101: CachingWrapper makes it easier to plug-in a custom cacheable
DocIdSet implementation and uses WAH8DocIdSet by default, which should be
more memory efficient than FixedBitSet on average as well as faster on small
sets. (Robert Muir)
Documentation
@ -289,6 +294,10 @@ Changes in backwards compatibility policy
* LUCENE-5187: SlowCompositeReaderWrapper constructor is now private,
SlowCompositeReaderWrapper.wrap should be used instead. (Adrien Grand)
* LUCENE-5101: CachingWrapperFilter doesn't always return FixedBitSet instances
anymore. Users of the join module can use
oal.search.join.FixedBitSetCachingWrapperFilter instead. (Adrien Grand)
Build

View File

@ -27,8 +27,8 @@ import java.util.WeakHashMap;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.WAH8DocIdSet;
/**
* Wraps another {@link Filter}'s result and caches it. The purpose is to allow
@ -36,9 +36,6 @@ import org.apache.lucene.util.RamUsageEstimator;
* to add caching.
*/
public class CachingWrapperFilter extends Filter {
// TODO: make this filter aware of ReaderContext. a cached filter could
// specify the actual readers key or something similar to indicate on which
// level of the readers hierarchy it should be cached.
private final Filter filter;
private final Map<Object,DocIdSet> cache = Collections.synchronizedMap(new WeakHashMap<Object,DocIdSet>());
@ -60,8 +57,8 @@ public class CachingWrapperFilter extends Filter {
/**
* Provide the DocIdSet to be cached, using the DocIdSet provided
* by the wrapped Filter. <p>This implementation returns the given {@link DocIdSet},
* if {@link DocIdSet#isCacheable} returns <code>true</code>, else it copies the
* {@link DocIdSetIterator} into a {@link FixedBitSet}.
* if {@link DocIdSet#isCacheable} returns <code>true</code>, else it calls
* {@link #cacheImpl(DocIdSetIterator,AtomicReader)}
* <p>Note: This method returns {@linkplain #EMPTY_DOCIDSET} if the given docIdSet
* is <code>null</code> or if {@link DocIdSet#iterator()} return <code>null</code>. The empty
* instance is use as a placeholder in the cache instead of the <code>null</code> value.
@ -80,12 +77,19 @@ public class CachingWrapperFilter extends Filter {
if (it == null) {
return EMPTY_DOCIDSET;
} else {
final FixedBitSet bits = new FixedBitSet(reader.maxDoc());
bits.or(it);
return bits;
return cacheImpl(it, reader);
}
}
}
/**
* Default cache implementation: uses {@link WAH8DocIdSet}.
*/
protected DocIdSet cacheImpl(DocIdSetIterator iterator, AtomicReader reader) throws IOException {
WAH8DocIdSet.Builder builder = new WAH8DocIdSet.Builder();
builder.add(iterator);
return builder.build();
}
// for testing
int hitCount, missCount;
@ -101,6 +105,7 @@ public class CachingWrapperFilter extends Filter {
} else {
missCount++;
docIdSet = docIdSetToCache(filter.getDocIdSet(context, null), reader);
assert docIdSet.isCacheable();
cache.put(key, docIdSet);
}
@ -109,19 +114,19 @@ public class CachingWrapperFilter extends Filter {
@Override
public String toString() {
return "CachingWrapperFilter("+filter+")";
return getClass().getSimpleName() + "("+filter+")";
}
@Override
public boolean equals(Object o) {
if (!(o instanceof CachingWrapperFilter)) return false;
if (o == null || !getClass().equals(o.getClass())) return false;
final CachingWrapperFilter other = (CachingWrapperFilter) o;
return this.filter.equals(other.filter);
}
@Override
public int hashCode() {
return (filter.hashCode() ^ 0x1117BF25);
return (filter.hashCode() ^ getClass().hashCode());
}
/** An empty {@code DocIdSet} instance */

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@ -32,11 +33,113 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TestCachingWrapperFilter extends LuceneTestCase {
Directory dir;
DirectoryReader ir;
IndexSearcher is;
RandomIndexWriter iw;
@Override
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
iw = new RandomIndexWriter(random(), dir);
Document doc = new Document();
Field idField = new StringField("id", "", Field.Store.NO);
doc.add(idField);
// add 500 docs with id 0..499
for (int i = 0; i < 500; i++) {
idField.setStringValue(Integer.toString(i));
iw.addDocument(doc);
}
// delete 20 of them
for (int i = 0; i < 20; i++) {
iw.deleteDocuments(new Term("id", Integer.toString(random().nextInt(iw.maxDoc()))));
}
ir = iw.getReader();
is = newSearcher(ir);
}
@Override
public void tearDown() throws Exception {
IOUtils.close(iw, ir, dir);
super.tearDown();
}
private void assertFilterEquals(Filter f1, Filter f2) throws Exception {
Query query = new MatchAllDocsQuery();
TopDocs hits1 = is.search(query, f1, ir.maxDoc());
TopDocs hits2 = is.search(query, f2, ir.maxDoc());
assertEquals(hits1.totalHits, hits2.totalHits);
CheckHits.checkEqual(query, hits1.scoreDocs, hits2.scoreDocs);
// now do it again to confirm caching works
TopDocs hits3 = is.search(query, f1, ir.maxDoc());
TopDocs hits4 = is.search(query, f2, ir.maxDoc());
assertEquals(hits3.totalHits, hits4.totalHits);
CheckHits.checkEqual(query, hits3.scoreDocs, hits4.scoreDocs);
}
/** test null iterator */
public void testEmpty() throws Exception {
Query query = new BooleanQuery();
Filter expected = new QueryWrapperFilter(query);
Filter actual = new CachingWrapperFilter(expected);
assertFilterEquals(expected, actual);
}
/** test iterator returns NO_MORE_DOCS */
public void testEmpty2() throws Exception {
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("id", "0")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term("id", "0")), BooleanClause.Occur.MUST_NOT);
Filter expected = new QueryWrapperFilter(query);
Filter actual = new CachingWrapperFilter(expected);
assertFilterEquals(expected, actual);
}
/** test null docidset */
public void testEmpty3() throws Exception {
Filter expected = new PrefixFilter(new Term("bogusField", "bogusVal"));
Filter actual = new CachingWrapperFilter(expected);
assertFilterEquals(expected, actual);
}
/** test iterator returns single document */
public void testSingle() throws Exception {
for (int i = 0; i < 10; i++) {
int id = random().nextInt(ir.maxDoc());
Query query = new TermQuery(new Term("id", Integer.toString(id)));
Filter expected = new QueryWrapperFilter(query);
Filter actual = new CachingWrapperFilter(expected);
assertFilterEquals(expected, actual);
}
}
/** test sparse filters (match single documents) */
public void testSparse() throws Exception {
for (int i = 0; i < 10; i++) {
int id_start = random().nextInt(ir.maxDoc()-1);
int id_end = id_start + 1;
Query query = TermRangeQuery.newStringRange("id",
Integer.toString(id_start), Integer.toString(id_end), true, true);
Filter expected = new QueryWrapperFilter(query);
Filter actual = new CachingWrapperFilter(expected);
assertFilterEquals(expected, actual);
}
}
/** test dense filters (match entire index) */
public void testDense() throws Exception {
Query query = new MatchAllDocsQuery();
Filter expected = new QueryWrapperFilter(query);
Filter actual = new CachingWrapperFilter(expected);
assertFilterEquals(expected, actual);
}
public void testCachingWorks() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

View File

@ -0,0 +1,60 @@
package org.apache.lucene.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.FixedBitSet;
/** A {@link CachingWrapperFilter} that caches sets using a {@link FixedBitSet},
* as required for joins. */
public final class FixedBitSetCachingWrapperFilter extends CachingWrapperFilter {
/** Sole constructor, see {@link CachingWrapperFilter#CachingWrapperFilter(Filter)}. */
public FixedBitSetCachingWrapperFilter(Filter filter) {
super(filter);
}
@Override
protected DocIdSet docIdSetToCache(DocIdSet docIdSet, AtomicReader reader)
throws IOException {
if (docIdSet == null) {
return EMPTY_DOCIDSET;
} else if (docIdSet instanceof FixedBitSet) {
// this is different from CachingWrapperFilter: even when the DocIdSet is
// cacheable, we convert it to a FixedBitSet since we require all the
// cached filters to be FixedBitSets
return docIdSet;
} else {
final DocIdSetIterator it = docIdSet.iterator();
if (it == null) {
return EMPTY_DOCIDSET;
} else {
final FixedBitSet copy = new FixedBitSet(reader.maxDoc());
copy.or(it);
return copy;
}
}
}
}

View File

@ -64,7 +64,8 @@ public class ToChildBlockJoinQuery extends Query {
*
* @param parentQuery Query that matches parent documents
* @param parentsFilter Filter (must produce FixedBitSet
* per-segment) identifying the parent documents.
* per-segment, like {@link FixedBitSetCachingWrapperFilter})
* identifying the parent documents.
* @param doScores true if parent scores should be calculated
*/
public ToChildBlockJoinQuery(Query parentQuery, Filter parentsFilter, boolean doScores) {

View File

@ -100,7 +100,8 @@ public class ToParentBlockJoinQuery extends Query {
*
* @param childQuery Query matching child documents.
* @param parentsFilter Filter (must produce FixedBitSet
* per-segment) identifying the parent documents.
* per-segment, like {@link FixedBitSetCachingWrapperFilter})
* identifying the parent documents.
* @param scoreMode How to aggregate multiple child scores
* into a single parent score.
**/

View File

@ -93,7 +93,7 @@ public class TestBlockJoin extends LuceneTestCase {
w.close();
assertTrue(r.leaves().size() > 1);
IndexSearcher s = new IndexSearcher(r);
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
BooleanQuery childQuery = new BooleanQuery();
childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
@ -145,7 +145,7 @@ public class TestBlockJoin extends LuceneTestCase {
IndexSearcher s = newSearcher(r);
// Create a filter that defines "parent" documents in the index - in this case resumes
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery childQuery = new BooleanQuery();
@ -249,7 +249,7 @@ public class TestBlockJoin extends LuceneTestCase {
IndexSearcher s = newSearcher(r);
// Create a filter that defines "parent" documents in the index - in this case resumes
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery childQuery = new BooleanQuery();
@ -269,7 +269,7 @@ public class TestBlockJoin extends LuceneTestCase {
assertEquals("dummy filter passes everyone ", 2, s.search(childJoinQuery, new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))), 10).totalHits);
// not found test
assertEquals("noone live there", 0, s.search(childJoinQuery, new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("country", "Oz")))), 1).totalHits);
assertEquals("noone live there", 0, s.search(childJoinQuery, new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("country", "Oz")))), 1).totalHits);
assertEquals("noone live there", 0, s.search(childJoinQuery, new QueryWrapperFilter(new TermQuery(new Term("country", "Oz"))), 1).totalHits);
// apply the UK filter by the searcher
@ -362,7 +362,7 @@ public class TestBlockJoin extends LuceneTestCase {
ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(
NumericRangeQuery.newIntRange("year", 1990, 2010, true, true),
new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume")))),
new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume")))),
ScoreMode.Total
);
@ -565,7 +565,7 @@ public class TestBlockJoin extends LuceneTestCase {
final IndexSearcher joinS = new IndexSearcher(joinR);
final Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "x"))));
final Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "x"))));
final int iters = 200*RANDOM_MULTIPLIER;
@ -831,7 +831,7 @@ public class TestBlockJoin extends LuceneTestCase {
childJoinQuery2 = parentJoinQuery2;
final Filter f = new QueryWrapperFilter(new TermQuery(childTerm));
childJoinFilter2 = random().nextBoolean()
? new CachingWrapperFilter(f): f;
? new FixedBitSetCachingWrapperFilter(f): f;
} else {
childJoinFilter2 = null;
// AND child field w/ parent query:
@ -852,7 +852,7 @@ public class TestBlockJoin extends LuceneTestCase {
childQuery2 = parentQuery2;
final Filter f = new QueryWrapperFilter(new TermQuery(childTerm));
childFilter2 = random().nextBoolean()
? new CachingWrapperFilter(f): f;
? new FixedBitSetCachingWrapperFilter(f): f;
} else {
childFilter2 = null;
final BooleanQuery bq2 = new BooleanQuery();
@ -991,7 +991,7 @@ public class TestBlockJoin extends LuceneTestCase {
IndexSearcher s = newSearcher(r);
// Create a filter that defines "parent" documents in the index - in this case resumes
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery childJobQuery = new BooleanQuery();
@ -1072,7 +1072,7 @@ public class TestBlockJoin extends LuceneTestCase {
w.close();
IndexSearcher s = newSearcher(r);
Query tq = new TermQuery(new Term("child", "1"));
Filter parentFilter = new CachingWrapperFilter(
Filter parentFilter = new FixedBitSetCachingWrapperFilter(
new QueryWrapperFilter(
new TermQuery(new Term("parent", "1"))));
@ -1106,7 +1106,7 @@ public class TestBlockJoin extends LuceneTestCase {
w.close();
IndexSearcher s = newSearcher(r);
Query tq = new TermQuery(new Term("child", "2"));
Filter parentFilter = new CachingWrapperFilter(
Filter parentFilter = new FixedBitSetCachingWrapperFilter(
new QueryWrapperFilter(
new TermQuery(new Term("isparent", "yes"))));
@ -1140,7 +1140,7 @@ public class TestBlockJoin extends LuceneTestCase {
IndexSearcher s = new IndexSearcher(r);
// Create a filter that defines "parent" documents in the index - in this case resumes
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
// Define child document criteria (finds an example of relevant work experience)
BooleanQuery childQuery = new BooleanQuery();
@ -1244,7 +1244,7 @@ public class TestBlockJoin extends LuceneTestCase {
w.close();
Query childQuery = new TermQuery(new Term("childText", "text"));
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery parentQuery = new BooleanQuery();
parentQuery.add(childJoinQuery, Occur.SHOULD);
@ -1310,7 +1310,7 @@ public class TestBlockJoin extends LuceneTestCase {
// never matches:
Query childQuery = new TermQuery(new Term("childText", "bogus"));
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery parentQuery = new BooleanQuery();
parentQuery.add(childJoinQuery, Occur.SHOULD);
@ -1376,7 +1376,7 @@ public class TestBlockJoin extends LuceneTestCase {
// illegally matches parent:
Query childQuery = new TermQuery(new Term("parentText", "text"));
Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
Filter parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "yes"))));
ToParentBlockJoinQuery childJoinQuery = new ToParentBlockJoinQuery(childQuery, parentsFilter, ScoreMode.Avg);
BooleanQuery parentQuery = new BooleanQuery();
parentQuery.add(childJoinQuery, Occur.SHOULD);

View File

@ -214,7 +214,7 @@ public class TestBlockJoinSorting extends LuceneTestCase {
Filter childFilter = new QueryWrapperFilter(new PrefixQuery(new Term("field2")));
ToParentBlockJoinQuery query = new ToParentBlockJoinQuery(
new FilteredQuery(new MatchAllDocsQuery(), childFilter),
new CachingWrapperFilter(parentFilter),
new FixedBitSetCachingWrapperFilter(parentFilter),
ScoreMode.None
);
@ -279,7 +279,7 @@ public class TestBlockJoinSorting extends LuceneTestCase {
childFilter = new QueryWrapperFilter(new TermQuery((new Term("filter_1", "T"))));
query = new ToParentBlockJoinQuery(
new FilteredQuery(new MatchAllDocsQuery(), childFilter),
new CachingWrapperFilter(parentFilter),
new FixedBitSetCachingWrapperFilter(parentFilter),
ScoreMode.None
);
sortField = new ToParentBlockJoinSortField(
@ -305,7 +305,7 @@ public class TestBlockJoinSorting extends LuceneTestCase {
}
private Filter wrap(Filter filter) {
return random().nextBoolean() ? new CachingWrapperFilter(filter) : filter;
return random().nextBoolean() ? new FixedBitSetCachingWrapperFilter(filter) : filter;
}
}

View File

@ -22,6 +22,7 @@ import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.join.FixedBitSetCachingWrapperFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.solr.common.params.SolrParams;
@ -86,8 +87,7 @@ class BlockJoinParentQParser extends QParser {
}
protected Filter createParentFilter(Query parentQ) {
return new CachingWrapperFilter(new QueryWrapperFilter(parentQ)) {
};
return new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(parentQ));
}
}