LUCENE-5088: Added TermFilter to filter docs by a specific term.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1503823 13f79535-47bb-0310-9956-ffa450edef68
Martijn van Groningen 2013-07-16 18:43:02 +00:00
parent 454d269e0d
commit 95f9b493a1
3 changed files with 287 additions and 0 deletions
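
For context, a minimal usage sketch (not part of this commit): the new filter plugs into the existing IndexSearcher.search(Query, Filter, int) overload to restrict a query to documents containing one exact term. The class name, field, and value below are illustrative; StandardAnalyzer (from the analyzers-common module) is only there to satisfy IndexWriterConfig, since StringField is not tokenized anyway.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermFilter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class TermFilterUsage {

  public static void main(String[] args) throws Exception {
    // Index a single document with an exact (not tokenized) category term.
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));
    Document doc = new Document();
    doc.add(new StringField("category", "books", Field.Store.NO));
    writer.addDocument(doc);
    writer.close();

    // Apply the filter: only documents carrying the term category:books survive.
    DirectoryReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    TermFilter filter = new TermFilter(new Term("category", "books"));
    TopDocs hits = searcher.search(new MatchAllDocsQuery(), filter, 10);
    System.out.println("hits: " + hits.totalHits);

    reader.close();
    dir.close();
  }
}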


@@ -65,6 +65,11 @@ API Changes

* LUCENE-5114: Remove unused boolean useCache parameter from
  TermsEnum.seekCeil and .seekExact (Mike McCandless)

Optimizations

* LUCENE-5088: Added TermFilter to filter docs by a specific term.
  (Martijn van Groningen)

======================= Lucene 4.4.0 =======================

Changes in backwards compatibility policy


@@ -0,0 +1,100 @@
package org.apache.lucene.queries;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;

import java.io.IOException;

/**
 * A filter that includes documents that match with a specific term.
 */
final public class TermFilter extends Filter {

  private final Term term;

  /**
   * @param term The term documents need to have in order to be a match for this filter.
   */
  public TermFilter(Term term) {
    if (term == null) {
      throw new IllegalArgumentException("Term must not be null");
    } else if (term.field() == null) {
      throw new IllegalArgumentException("Field must not be null");
    }
    this.term = term;
  }

  /**
   * @return The term this filter includes documents with.
   */
  public Term getTerm() {
    return term;
  }

  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context, final Bits acceptDocs) throws IOException {
    Terms terms = context.reader().terms(term.field());
    if (terms == null) {
      // The field does not exist in this segment: nothing can match.
      return null;
    }

    final TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(term.bytes())) {
      // The term does not exist in this segment: nothing can match.
      return null;
    }
    return new DocIdSet() {
      @Override
      public DocIdSetIterator iterator() throws IOException {
        // Iterate the postings for the term; freqs and positions are not needed.
        return termsEnum.docs(acceptDocs, null, DocsEnum.FLAG_NONE);
      }
    };
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    TermFilter that = (TermFilter) o;

    if (term != null ? !term.equals(that.term) : that.term != null) return false;

    return true;
  }

  @Override
  public int hashCode() {
    return term != null ? term.hashCode() : 0;
  }

  @Override
  public String toString() {
    return term.field() + ":" + term.text();
  }

}
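
Worth noting about getDocIdSet above: it returns null both when the field is absent from a segment and when the term is absent, so code that drives the filter segment by segment must treat null as "no matches" (the tests below check exactly this). A minimal sketch of such a caller, assuming an open IndexReader; the class and method names are hypothetical and not part of this commit.

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermFilter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

// Hypothetical helper: counts matching documents by consuming
// TermFilter.getDocIdSet one segment at a time.
public class TermFilterCounter {

  public static int countMatches(IndexReader reader, String field, String value) throws IOException {
    TermFilter filter = new TermFilter(new Term(field, value));
    int count = 0;
    for (AtomicReaderContext context : reader.leaves()) {
      // A null DocIdSet means the field or term is missing in this segment.
      DocIdSet docIdSet = filter.getDocIdSet(context, context.reader().getLiveDocs());
      if (docIdSet == null) {
        continue;
      }
      DocIdSetIterator it = docIdSet.iterator();
      while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        count++;
      }
    }
    return count;
  }
}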


@@ -0,0 +1,182 @@
package org.apache.lucene.queries;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

public class TermFilterTest extends LuceneTestCase {

  public void testCachability() throws Exception {
    TermFilter a = termFilter("field1", "a");
    HashSet<Filter> cachedFilters = new HashSet<Filter>();
    cachedFilters.add(a);
    assertTrue("Must be cached", cachedFilters.contains(termFilter("field1", "a")));
    assertFalse("Must not be cached", cachedFilters.contains(termFilter("field1", "b")));
    assertFalse("Must not be cached", cachedFilters.contains(termFilter("field2", "a")));
  }

  public void testMissingTermAndField() throws Exception {
    String fieldName = "field1";
    Directory rd = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), rd);
    Document doc = new Document();
    doc.add(newStringField(fieldName, "value1", Field.Store.NO));
    w.addDocument(doc);
    IndexReader reader = new SlowCompositeReaderWrapper(w.getReader());
    assertTrue(reader.getContext() instanceof AtomicReaderContext);
    AtomicReaderContext context = (AtomicReaderContext) reader.getContext();
    w.close();

    DocIdSet idSet = termFilter(fieldName, "value1").getDocIdSet(context, context.reader().getLiveDocs());
    assertNotNull("must not be null", idSet);
    DocIdSetIterator iter = idSet.iterator();
    assertEquals(iter.nextDoc(), 0);
    assertEquals(iter.nextDoc(), DocIdSetIterator.NO_MORE_DOCS);

    idSet = termFilter(fieldName, "value2").getDocIdSet(context, context.reader().getLiveDocs());
    assertNull("must be null", idSet);

    idSet = termFilter("field2", "value1").getDocIdSet(context, context.reader().getLiveDocs());
    assertNull("must be null", idSet);

    reader.close();
    rd.close();
  }

  public void testRandom() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    int num = atLeast(100);
    List<Term> terms = new ArrayList<Term>();
    for (int i = 0; i < num; i++) {
      String field = "field" + i;
      String string = _TestUtil.randomRealisticUnicodeString(random());
      terms.add(new Term(field, string));
      Document doc = new Document();
      doc.add(newStringField(field, string, Field.Store.NO));
      w.addDocument(doc);
    }
    IndexReader reader = w.getReader();
    w.close();

    IndexSearcher searcher = newSearcher(reader);

    int numQueries = atLeast(10);
    for (int i = 0; i < numQueries; i++) {
      Term term = terms.get(random().nextInt(num));
      TopDocs queryResult = searcher.search(new TermQuery(term), reader.maxDoc());

      MatchAllDocsQuery matchAll = new MatchAllDocsQuery();
      final TermFilter filter = termFilter(term);
      TopDocs filterResult = searcher.search(matchAll, filter, reader.maxDoc());
      assertEquals(filterResult.totalHits, queryResult.totalHits);
      ScoreDoc[] scoreDocs = filterResult.scoreDocs;
      for (int j = 0; j < scoreDocs.length; j++) {
        assertEquals(scoreDocs[j].doc, queryResult.scoreDocs[j].doc);
      }
    }

    reader.close();
    dir.close();
  }

  public void testHashCodeAndEquals() {
    int num = atLeast(100);
    for (int i = 0; i < num; i++) {
      String field1 = "field" + i;
      String field2 = "field" + i + num;
      String value1 = _TestUtil.randomRealisticUnicodeString(random());
      String value2 = _TestUtil.randomRealisticUnicodeString(random());

      TermFilter filter1 = termFilter(field1, value1);
      TermFilter filter2 = termFilter(field1, value2);
      TermFilter filter3 = termFilter(field2, value1);
      TermFilter filter4 = termFilter(field2, value2);

      TermFilter[] filters = new TermFilter[]{filter1, filter2, filter3, filter4};
      for (int j = 0; j < filters.length; j++) {
        TermFilter termFilter = filters[j];
        for (int k = 0; k < filters.length; k++) {
          TermFilter otherTermFilter = filters[k];
          if (j == k) {
            assertEquals(termFilter, otherTermFilter);
            assertEquals(termFilter.hashCode(), otherTermFilter.hashCode());
            assertTrue(termFilter.equals(otherTermFilter));
          } else {
            assertFalse(termFilter.equals(otherTermFilter));
          }
        }
      }

      TermFilter filter5 = termFilter(field2, value2);
      assertEquals(filter5, filter4);
      assertEquals(filter5.hashCode(), filter4.hashCode());
      assertTrue(filter5.equals(filter4));
      assertEquals(filter5, filter4);
      assertTrue(filter5.equals(filter4));
    }
  }

  public void testNoTerms() {
    try {
      new TermFilter(null);
      fail("must fail - no term!");
    } catch (IllegalArgumentException e) {}

    try {
      new TermFilter(new Term(null));
      fail("must fail - no field!");
    } catch (IllegalArgumentException e) {}
  }

  public void testToString() {
    TermFilter termsFilter = new TermFilter(new Term("field1", "a"));
    assertEquals("field1:a", termsFilter.toString());
  }

  private TermFilter termFilter(String field, String value) {
    return termFilter(new Term(field, value));
  }

  private TermFilter termFilter(Term term) {
    return new TermFilter(term);
  }

}
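
The testCachability test above exercises equals/hashCode because they make a TermFilter usable as a cache key: logically identical filters built at different times map to the same entry. A hedged sketch of one way an application might exploit that, not part of this commit; the class name, field, and value are illustrative, and CachingWrapperFilter is Lucene's existing per-segment DocIdSet cache.

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermFilter;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;

// Hypothetical application-level cache: the TermFilter itself is the map key,
// so equals()/hashCode() let logically identical filters reuse one
// CachingWrapperFilter, which in turn caches the DocIdSet per segment.
public class FilterCacheExample {

  private final Map<Filter, Filter> cache = new HashMap<Filter, Filter>();

  public synchronized Filter cachedTermFilter(String field, String value) {
    Filter key = new TermFilter(new Term(field, value));
    Filter cached = cache.get(key);
    if (cached == null) {
      cached = new CachingWrapperFilter(key);
      cache.put(key, cached);
    }
    return cached; // pass to IndexSearcher.search(query, cached, n)
  }
}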