LUCENE-1487: add FieldCacheTermsFilter, to filter by multiple terms on single-valued fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@738622 13f79535-47bb-0310-9956-ffa450edef68
2009-01-28 21:05:26 +00:00 · 2009-01-28 21:05:26 +00:00 · 994ae0e18a
parent 13d5245314
commit 994ae0e18a
3 changed files with 199 additions and 1 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -1,4 +1,4 @@
-Lucene Change Log
+Lucene Change Log
 $Id$

 ======================= Trunk (not yet released) =======================
@ -136,6 +136,12 @@ New features
 11. LUCENE-1528: Add support for Ideographic Space to the queryparser.
    (Luis Alves via Michael Busch)

+12. LUCENE-1487: Added FieldCacheTermsFilter, to filter by multiple
+    terms on single-valued fields.  The filter loads the FieldCache
+    for the field the first time it's called, and subsequent usage of
+    that field, even with different Terms in the filter, are fast.
+    (Tim Sturge, Shalin Shekhar Mangar via Mike McCandless).
+
 Optimizations

 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
--- a/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
+++ b/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
@ -0,0 +1,117 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.OpenBitSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * A term filter built on top of a cached single field (in FieldCache). It can be used only
+ * with single-valued fields.
+ * <p/>
+ * FieldCacheTermsFilter builds a single cache for the field the first time it is used. Each
+ * subsequent FieldCacheTermsFilter on the same field then re-uses this cache even if the terms
+ * themselves are different.
+ * <p/>
+ * The FieldCacheTermsFilter is faster than building a TermsFilter each time.
+ * FieldCacheTermsFilter are fast to build in cases where number of documents are far more than
+ * unique terms. Internally, it creates a BitSet by term number and scans by document id.
+ * <p/>
+ * As with all FieldCache based functionality, FieldCacheTermsFilter is only valid for fields
+ * which contain zero or one terms for each document. Thus it works on dates, prices and other
+ * single value fields but will not work on regular text fields. It is preferable to use an
+ * NOT_ANALYZED field to ensure that there is only a single term.
+ * <p/>
+ * Also, collation is performed at the time the FieldCache is built; to change collation you
+ * need to override the getFieldCache() method to change the underlying cache.
+ */
+public class FieldCacheTermsFilter extends Filter {
+  private String field;
+  private Iterable terms;
+
+  public FieldCacheTermsFilter(String field, Iterable terms) {
+    this.field = field;
+    this.terms = terms;
+  }
+
+  public FieldCache getFieldCache() {
+    return FieldCache.DEFAULT;
+  }
+
+  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+    return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
+  }
+
+  protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
+    private FieldCache.StringIndex fcsi;
+
+    private OpenBitSet openBitSet;
+
+    public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
+      this.fcsi = fcsi;
+      openBitSet = new OpenBitSet(this.fcsi.lookup.length);
+      for (Iterator it = terms.iterator(); it.hasNext();) {
+        Object term = it.next();
+        int termNumber = this.fcsi.binarySearchLookup((String) term);
+        if (termNumber > 0) {
+          openBitSet.fastSet(termNumber);
+        }
+      }
+    }
+
+    public DocIdSetIterator iterator() {
+      return new FieldCacheTermsFilterDocIdSetIterator();
+    }
+
+    protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
+      private int doc = -1;
+
+      public int doc() {
+        return doc;
+      }
+
+      public boolean next() {
+        try {
+          do {
+            doc++;
+          } while (!openBitSet.fastGet(fcsi.order[doc]));
+          return true;
+        } catch (ArrayIndexOutOfBoundsException e) {
+          doc = Integer.MAX_VALUE;
+          return false;
+        }
+      }
+
+      public boolean skipTo(int target) {
+        try {
+          doc = target;
+          while (!openBitSet.fastGet(fcsi.order[doc])) {
+            doc++;
+          }
+          return true;
+        } catch (ArrayIndexOutOfBoundsException e) {
+          doc = Integer.MAX_VALUE;
+          return false;
+        }
+      }
+    }
+  }
+}
--- a/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java
+++ b/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java
@ -0,0 +1,75 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.MockRAMDirectory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A basic unit test for FieldCacheTermsFilter
+ *
+ * @see org.apache.lucene.search.FieldCacheTermsFilter
+ */
+public class TestFieldCacheTermsFilter extends TestCase {
+  public void testMissingTerms() throws Exception {
+    String fieldName = "field1";
+    MockRAMDirectory rd = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(rd, new KeywordAnalyzer(), MaxFieldLength.UNLIMITED);
+    for (int i = 0; i < 100; i++) {
+      Document doc = new Document();
+      int term = i * 10; //terms are units of 10;
+      doc.add(new Field(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
+      w.addDocument(doc);
+    }
+    w.close();
+
+    IndexReader reader = IndexReader.open(rd);
+    IndexSearcher searcher = new IndexSearcher(reader);
+    int numDocs = reader.numDocs();
+    ScoreDoc[] results;
+    MatchAllDocsQuery q = new MatchAllDocsQuery();
+
+    List terms = new ArrayList();
+    terms.add("5");
+    results = searcher.search(q, new FieldCacheTermsFilter(fieldName, terms), numDocs).scoreDocs;
+    assertEquals("Must match nothing", 0, results.length);
+
+    terms = new ArrayList();
+    terms.add("10");
+    results = searcher.search(q, new FieldCacheTermsFilter(fieldName, terms), numDocs).scoreDocs;
+    assertEquals("Must match 1", 1, results.length);
+
+    terms = new ArrayList();
+    terms.add("10");
+    terms.add("20");
+    results = searcher.search(q, new FieldCacheTermsFilter(fieldName, terms), numDocs).scoreDocs;
+    assertEquals("Must match 2", 2, results.length);
+
+    reader.close();
+    rd.close();
+  }
+}