LUCENE-3593: Add a filter returning all documents with at least one or no value in a field

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206017 13f79535-47bb-0310-9956-ffa450edef68
2011-11-24 23:24:35 +00:00 · 2011-11-24 23:24:35 +00:00 · 94fae441e3
parent 9dd60fe58f
commit 94fae441e3
3 changed files with 260 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -651,6 +651,12 @@ Security fixes
  prevents this as best as it can by throwing AlreadyClosedException
  also on clones.  (Uwe Schindler, Robert Muir)

+New Features
+
+* LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
+  have at least one or no value at all in a specific field. (Simon Willnauer,
+  Uwe Schindler, Robert Muir)
+  
 ======================= Lucene 3.5.0 =======================

 Changes in backwards compatibility policy
--- a/lucene/src/java/org/apache/lucene/search/FieldValueFilter.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldValueFilter.java
@ -0,0 +1,137 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.FieldCacheRangeFilter.FieldCacheDocIdSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.Bits.MatchAllBits;
+import org.apache.lucene.util.Bits.MatchNoBits;
+
+/**
+ * A {@link Filter} that accepts all documents that have one or more values in a
+ * given field. This {@link Filter} request {@link Bits} from the
+ * {@link FieldCache} and build the bits if not present.
+ */
+public class FieldValueFilter extends Filter {
+  private final String field;
+  private final boolean negate;
+
+  /**
+   * Creates a new {@link FieldValueFilter}
+   * 
+   * @param field
+   *          the field to filter
+   */
+  public FieldValueFilter(String field) {
+    this(field, false);
+  }
+
+  /**
+   * Creates a new {@link FieldValueFilter}
+   * 
+   * @param field
+   *          the field to filter
+   * @param negate
+   *          iff <code>true</code> all documents with no value in the given
+   *          field are accepted.
+   * 
+   */
+  public FieldValueFilter(String field, boolean negate) {
+    this.field = field;
+    this.negate = negate;
+  }
+
+  @Override
+  public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
+      throws IOException {
+    final Bits docsWithField = FieldCache.DEFAULT.getDocsWithField(
+        context.reader, field);
+    if (negate) {
+      if (docsWithField instanceof MatchAllBits) {
+        return null;
+      }
+      final int maxDoc = context.reader.maxDoc();
+      return new FieldCacheDocIdSet(maxDoc, acceptDocs) {
+        @Override
+        final boolean matchDoc(int doc) {
+          if (doc >= maxDoc) {
+            // TODO: this makes no sense we should check this on the caller level
+            throw new ArrayIndexOutOfBoundsException("doc: "+doc + " maxDoc: " + maxDoc);
+          }
+          return !docsWithField.get(doc);
+        }
+      };
+    } else {
+      if (docsWithField instanceof MatchNoBits) {
+        return null;
+      }
+      if (docsWithField instanceof DocIdSet) {
+        // UweSays: this is always the case for our current impl - but who knows
+        // :-)
+        return BitsFilteredDocIdSet.wrap((DocIdSet) docsWithField, acceptDocs);
+      }
+      final int maxDoc = context.reader.maxDoc();
+      return new FieldCacheDocIdSet(maxDoc, acceptDocs) {
+        @Override
+        final boolean matchDoc(int doc) {
+          if (doc >= maxDoc) {
+            // TODO: this makes no sense we should check this on the caller level
+            throw new ArrayIndexOutOfBoundsException("doc: "+doc + " maxDoc: " + maxDoc);
+          }
+          return docsWithField.get(doc);
+        }
+      };
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((field == null) ? 0 : field.hashCode());
+    result = prime * result + (negate ? 1231 : 1237);
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    FieldValueFilter other = (FieldValueFilter) obj;
+    if (field == null) {
+      if (other.field != null)
+        return false;
+    } else if (!field.equals(other.field))
+      return false;
+    if (negate != other.negate)
+      return false;
+    return true;
+  }
+
+  @Override
+  public String toString() {
+    return "NoFieldValueFilter [field=" + field + ", negate=" + negate + "]";
+  }
+
+}
--- a/lucene/src/test/org/apache/lucene/search/TestFieldValueFilter.java
+++ b/lucene/src/test/org/apache/lucene/search/TestFieldValueFilter.java
@ -0,0 +1,117 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests {@link FieldValueFilter} in both its normal and its negated mode
+ * against a randomly built index in which only some documents carry a value
+ * in the filtered field.
+ */
+public class TestFieldValueFilter extends LuceneTestCase {
+
+  /**
+   * The negated filter must match exactly the live documents that have no
+   * value in the "some" field.
+   */
+  public void testFieldValueFilterNoValue() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
+    int docs = atLeast(10);
+    int[] docStates = buildIndex(writer, docs);
+    // state 0: document is live and was indexed without the "some" field
+    int numDocsNoValue = 0;
+    for (int i = 0; i < docStates.length; i++) {
+      if (docStates[i] == 0) {
+        numDocsNoValue++;
+      }
+    }
+
+    IndexReader reader = IndexReader.open(directory);
+    IndexSearcher searcher = new IndexSearcher(reader);
+    TopDocs search = searcher.search(new TermQuery(new Term("all", "test")),
+        new FieldValueFilter("some", true), docs);
+    // expected value first so that a failure message reports the values correctly
+    assertEquals(numDocsNoValue, search.totalHits);
+    
+    ScoreDoc[] scoreDocs = search.scoreDocs;
+    for (ScoreDoc scoreDoc : scoreDocs) {
+      assertNull(reader.document(scoreDoc.doc).get("some"));
+    }
+    
+    reader.close();
+    searcher.close();
+    directory.close();
+  }
+  
+  /**
+   * The non-negated filter must match exactly the live documents that have a
+   * value in the "some" field.
+   */
+  public void testFieldValueFilter() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
+    int docs = atLeast(10);
+    int[] docStates = buildIndex(writer, docs);
+    // state 1: document is live and was indexed with the "some" field
+    int numDocsWithValue = 0;
+    for (int i = 0; i < docStates.length; i++) {
+      if (docStates[i] == 1) {
+        numDocsWithValue++;
+      }
+    }
+    IndexReader reader = IndexReader.open(directory);
+    IndexSearcher searcher = new IndexSearcher(reader);
+    TopDocs search = searcher.search(new TermQuery(new Term("all", "test")),
+        new FieldValueFilter("some"), docs);
+    // expected value first so that a failure message reports the values correctly
+    assertEquals(numDocsWithValue, search.totalHits);
+    
+    ScoreDoc[] scoreDocs = search.scoreDocs;
+    for (ScoreDoc scoreDoc : scoreDocs) {
+      assertEquals("value", reader.document(scoreDoc.doc).get("some"));
+    }
+    
+    reader.close();
+    searcher.close();
+    directory.close();
+  }
+
+  /**
+   * Indexes <code>docs</code> documents, randomly adding the "some" field to
+   * each, then deletes a random selection of them and closes the writer.
+   * 
+   * @param writer
+   *          the writer to add the documents to; closed by this method
+   * @param docs
+   *          the number of documents to index
+   * @return one state per document id: 0 = no value in "some", 1 = has a value
+   *         in "some", 2 = deleted
+   */
+  private int[] buildIndex(RandomIndexWriter writer, int docs)
+      throws IOException, CorruptIndexException {
+    int[] docStates = new int[docs];
+    for (int i = 0; i < docs; i++) {
+      Document doc = new Document();
+      if (random.nextBoolean()) {
+        docStates[i] = 1;
+        doc.add(newField("some", "value", TextField.TYPE_STORED));
+      }
+      // every document matches the "all:test" query used by the tests
+      doc.add(newField("all", "test", TextField.TYPE_UNSTORED));
+      doc.add(newField("id", "" + i, TextField.TYPE_STORED));
+      writer.addDocument(doc);
+    }
+    writer.commit();
+    // the same id may be picked more than once; it simply stays deleted
+    int numDeletes = random.nextInt(docs);
+    for (int i = 0; i < numDeletes; i++) {
+      int docID = random.nextInt(docs);
+      writer.deleteDocuments(new Term("id", "" + docID));
+      docStates[docID] = 2;
+    }
+    writer.close();
+    return docStates;
+  }
+
+}