LUCENE-1487: add FieldCacheTermsFilter, to filter by multiple terms on single-valued fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@738622 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-01-28 21:05:26 +00:00
parent 13d5245314
commit 994ae0e18a
3 changed files with 199 additions and 1 deletions

View File

@ -1,4 +1,4 @@
Lucene Change Log
Lucene Change Log
$Id$
======================= Trunk (not yet released) =======================
@ -136,6 +136,12 @@ New features
11. LUCENE-1528: Add support for Ideographic Space to the queryparser.
(Luis Alves via Michael Busch)
12. LUCENE-1487: Added FieldCacheTermsFilter, to filter by multiple
terms on single-valued fields. The filter loads the FieldCache
for the field the first time it's called, and subsequent uses of
that field, even with different Terms in the filter, are fast.
(Tim Sturge, Shalin Shekhar Mangar via Mike McCandless)
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

View File

@ -0,0 +1,117 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.OpenBitSet;
import java.io.IOException;
import java.util.Iterator;
/**
 * A term filter built on top of a cached single field (in FieldCache). It can be used only
 * with single-valued fields.
 * <p/>
 * FieldCacheTermsFilter builds a single cache for the field the first time it is used. Each
 * subsequent FieldCacheTermsFilter on the same field then re-uses this cache even if the terms
 * themselves are different.
 * <p/>
 * The FieldCacheTermsFilter is faster than building a TermsFilter each time.
 * A FieldCacheTermsFilter is fast to build when the number of documents is far larger than the
 * number of unique terms. Internally, it creates a BitSet by term number and scans by document id.
 * <p/>
 * As with all FieldCache based functionality, FieldCacheTermsFilter is only valid for fields
 * which contain zero or one terms for each document. Thus it works on dates, prices and other
 * single value fields but will not work on regular text fields. It is preferable to use a
 * NOT_ANALYZED field to ensure that there is only a single term.
 * <p/>
 * Also, collation is performed at the time the FieldCache is built; to change collation you
 * need to override the getFieldCache() method to change the underlying cache.
 */
public class FieldCacheTermsFilter extends Filter {
  private final String field;
  private final Iterable terms;

  /**
   * Creates a filter that accepts documents whose value for <code>field</code> is one of
   * <code>terms</code>.
   *
   * @param field name of the field whose cached values are consulted
   * @param terms the accepted values; every element is cast to String when a DocIdSet is
   *              built, and the Iterable is walked again on each {@link #getDocIdSet} call
   */
  public FieldCacheTermsFilter(String field, Iterable terms) {
    this.field = field;
    this.terms = terms;
  }

  /**
   * Returns the FieldCache used to look up the field's values. Subclasses may override this
   * to supply a different cache.
   *
   * @return {@link FieldCache#DEFAULT}
   */
  public FieldCache getFieldCache() {
    return FieldCache.DEFAULT;
  }

  /**
   * Builds the set of matching documents for <code>reader</code> from the string index that
   * {@link #getFieldCache()} returns for the field.
   *
   * @param reader the reader whose documents are filtered
   * @return a DocIdSet over the documents whose term was requested
   * @throws IOException if obtaining the string index fails
   */
  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
  }

  /**
   * DocIdSet that marks the requested terms in a bit set indexed by term number and then
   * tests each document's term number against that bit set.
   */
  protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
    private final FieldCache.StringIndex fcsi;
    private final OpenBitSet openBitSet;

    /**
     * Resolves each requested term to its term number in <code>fcsi</code> and records it.
     *
     * @param fcsi string index of the filtered field
     */
    public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
      this.fcsi = fcsi;
      openBitSet = new OpenBitSet(this.fcsi.lookup.length);
      for (Iterator it = terms.iterator(); it.hasNext();) {
        String term = (String) it.next();
        int termNumber = this.fcsi.binarySearchLookup(term);
        // Only strictly positive term numbers are recorded.
        // NOTE(review): presumably slot 0 is the "document has no term" entry and a
        // negative result means "term not in index" -- confirm against StringIndex.
        if (termNumber > 0) {
          openBitSet.fastSet(termNumber);
        }
      }
    }

    public DocIdSetIterator iterator() {
      return new FieldCacheTermsFilterDocIdSetIterator();
    }

    /**
     * Iterator that walks document ids in increasing order and stops on those whose term
     * number is set in the enclosing bit set. Once exhausted, {@link #doc()} reports
     * <code>Integer.MAX_VALUE</code>.
     */
    protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
      private int doc = -1;

      public int doc() {
        return doc;
      }

      /**
       * Moves to the next matching document after the current one.
       *
       * @return true if a match was found, false if the iterator is exhausted
       */
      public boolean next() {
        // When already exhausted (doc == Integer.MAX_VALUE) the addition wraps to a
        // negative value, which advanceFrom treats as out of range.
        return advanceFrom(doc + 1);
      }

      /**
       * Moves to the first matching document whose id is at least <code>target</code>.
       *
       * @param target the smallest document id to consider
       * @return true if a match was found, false if the iterator is exhausted
       */
      public boolean skipTo(int target) {
        return advanceFrom(target);
      }

      /**
       * Scans forward from <code>start</code> for a matching document, checking bounds
       * explicitly instead of relying on an ArrayIndexOutOfBoundsException to end the scan.
       * A start position outside the order array (including a negative one) exhausts the
       * iterator.
       */
      private boolean advanceFrom(int start) {
        final int[] order = fcsi.order;
        for (int candidate = start; candidate >= 0 && candidate < order.length; candidate++) {
          if (openBitSet.fastGet(order[candidate])) {
            doc = candidate;
            return true;
          }
        }
        doc = Integer.MAX_VALUE;
        return false;
      }
    }
  }
}

View File

@ -0,0 +1,75 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.MockRAMDirectory;
import java.util.ArrayList;
import java.util.List;
/**
 * A basic unit test for FieldCacheTermsFilter
 *
 * @see org.apache.lucene.search.FieldCacheTermsFilter
 */
public class TestFieldCacheTermsFilter extends TestCase {

  /**
   * Indexes 100 documents whose single field holds the multiples of ten from 0 to 990, then
   * checks that filtering by an absent term matches nothing and that filtering by one or two
   * present terms matches exactly that many documents.
   */
  public void testMissingTerms() throws Exception {
    String fieldName = "field1";
    MockRAMDirectory rd = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(rd, new KeywordAnalyzer(), MaxFieldLength.UNLIMITED);
    for (int i = 0; i < 100; i++) {
      Document doc = new Document();
      int term = i * 10; //terms are units of 10;
      doc.add(new Field(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
      w.addDocument(doc);
    }
    w.close();

    IndexReader reader = IndexReader.open(rd);
    try {
      IndexSearcher searcher = new IndexSearcher(reader);
      int numDocs = reader.numDocs();

      assertEquals("Must match nothing", 0,
          countMatches(searcher, fieldName, new String[] {"5"}, numDocs));
      assertEquals("Must match 1", 1,
          countMatches(searcher, fieldName, new String[] {"10"}, numDocs));
      assertEquals("Must match 2", 2,
          countMatches(searcher, fieldName, new String[] {"10", "20"}, numDocs));
    } finally {
      // Release the reader and the directory even when an assertion above fails.
      try {
        reader.close();
      } finally {
        rd.close();
      }
    }
  }

  /**
   * Runs a match-all query restricted by a FieldCacheTermsFilter on the given terms and
   * returns how many hits come back.
   */
  private static int countMatches(IndexSearcher searcher, String fieldName,
                                  String[] termValues, int numDocs) throws Exception {
    List terms = new ArrayList();
    for (int i = 0; i < termValues.length; i++) {
      terms.add(termValues[i]);
    }
    ScoreDoc[] results = searcher.search(
        new MatchAllDocsQuery(), new FieldCacheTermsFilter(fieldName, terms), numDocs).scoreDocs;
    return results.length;
  }
}