LUCENE-3593: Add a filter returning all document with at least one or no value in a field

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1206017 13f79535-47bb-0310-9956-ffa450edef68
2011-11-24 23:24:35 +00:00 · 2011-11-24 23:24:35 +00:00 · 94fae441e3
parent 9dd60fe58f
commit 94fae441e3
3 changed files with 260 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -651,6 +651,12 @@ Security fixes
  prevents this as best as it can by throwing AlreadyClosedException
  also on clones.  (Uwe Schindler, Robert Muir)
 New Features
 * LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
  have at least one or no value at all in a specific field. (Simon Willnauer,
  Uwe Schindler, Robert Muir)
 ======================= Lucene 3.5.0 =======================
 Changes in backwards compatibility policy
--- a/lucene/src/java/org/apache/lucene/search/FieldValueFilter.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldValueFilter.java
@ -0,0 +1,137 @@
 package org.apache.lucene.search;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.search.FieldCacheRangeFilter.FieldCacheDocIdSet;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.Bits.MatchAllBits;
 import org.apache.lucene.util.Bits.MatchNoBits;
 /**
 * A {@link Filter} that accepts all documents that have one or more values in a
 * given field. This {@link Filter} request {@link Bits} from the
 * {@link FieldCache} and build the bits if not present.
 */
 public class FieldValueFilter extends Filter {
  private final String field;
  private final boolean negate;
  /**
   * Creates a new {@link FieldValueFilter}
   * 
   * @param field
   *          the field to filter
   */
  public FieldValueFilter(String field) {
    this(field, false);
  }
  /**
   * Creates a new {@link FieldValueFilter}
   * 
   * @param field
   *          the field to filter
   * @param negate
   *          iff <code>true</code> all documents with no value in the given
   *          field are accepted.
   * 
   */
  public FieldValueFilter(String field, boolean negate) {
    this.field = field;
    this.negate = negate;
  }
  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
      throws IOException {
    final Bits docsWithField = FieldCache.DEFAULT.getDocsWithField(
        context.reader, field);
    if (negate) {
      if (docsWithField instanceof MatchAllBits) {
        return null;
      }
      final int maxDoc = context.reader.maxDoc();
      return new FieldCacheDocIdSet(maxDoc, acceptDocs) {
        @Override
        final boolean matchDoc(int doc) {
          if (doc >= maxDoc) {
            // TODO: this makes no sense we should check this on the caller level
            throw new ArrayIndexOutOfBoundsException("doc: "+doc + " maxDoc: " + maxDoc);
          }
          return !docsWithField.get(doc);
        }
      };
    } else {
      if (docsWithField instanceof MatchNoBits) {
        return null;
      }
      if (docsWithField instanceof DocIdSet) {
        // UweSays: this is always the case for our current impl - but who knows
        // :-)
        return BitsFilteredDocIdSet.wrap((DocIdSet) docsWithField, acceptDocs);
      }
      final int maxDoc = context.reader.maxDoc();
      return new FieldCacheDocIdSet(maxDoc, acceptDocs) {
        @Override
        final boolean matchDoc(int doc) {
          if (doc >= maxDoc) {
            // TODO: this makes no sense we should check this on the caller level
            throw new ArrayIndexOutOfBoundsException("doc: "+doc + " maxDoc: " + maxDoc);
          }
          return docsWithField.get(doc);
        }
      };
    }
  }
  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + ((field == null) ? 0 : field.hashCode());
    result = prime * result + (negate ? 1231 : 1237);
    return result;
  }
  @Override
  public boolean equals(Object obj) {
    if (this == obj)
      return true;
    if (obj == null)
      return false;
    if (getClass() != obj.getClass())
      return false;
    FieldValueFilter other = (FieldValueFilter) obj;
    if (field == null) {
      if (other.field != null)
        return false;
    } else if (!field.equals(other.field))
      return false;
    if (negate != other.negate)
      return false;
    return true;
  }
  @Override
  public String toString() {
    return "NoFieldValueFilter [field=" + field + ", negate=" + negate + "]";
  }
 }
--- a/lucene/src/test/org/apache/lucene/search/TestFieldValueFilter.java
+++ b/lucene/src/test/org/apache/lucene/search/TestFieldValueFilter.java
@ -0,0 +1,117 @@
 package org.apache.lucene.search;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 /**
 * 
 */
 public class TestFieldValueFilter extends LuceneTestCase {
  public void testFieldValueFilterNoValue() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    int docs = atLeast(10);
    int[] docStates = buildIndex(writer, docs);
    int numDocsNoValue = 0;
    for (int i = 0; i < docStates.length; i++) {
      if (docStates[i] == 0) {
        numDocsNoValue++;
      }
    }
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs search = searcher.search(new TermQuery(new Term("all", "test")),
        new FieldValueFilter("some", true), docs);
    assertEquals(search.totalHits, numDocsNoValue);
    ScoreDoc[] scoreDocs = search.scoreDocs;
    for (ScoreDoc scoreDoc : scoreDocs) {
      assertNull(reader.document(scoreDoc.doc).get("some"));
    }
    reader.close();
    searcher.close();
    directory.close();
  }
  public void testFieldValueFilter() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    int docs = atLeast(10);
    int[] docStates = buildIndex(writer, docs);
    int numDocsWithValue = 0;
    for (int i = 0; i < docStates.length; i++) {
      if (docStates[i] == 1) {
        numDocsWithValue++;
      }
    }
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs search = searcher.search(new TermQuery(new Term("all", "test")),
        new FieldValueFilter("some"), docs);
    assertEquals(search.totalHits, numDocsWithValue);
    ScoreDoc[] scoreDocs = search.scoreDocs;
    for (ScoreDoc scoreDoc : scoreDocs) {
      assertEquals("value", reader.document(scoreDoc.doc).get("some"));
    }
    reader.close();
    searcher.close();
    directory.close();
  }
  private int[] buildIndex(RandomIndexWriter writer, int docs)
      throws IOException, CorruptIndexException {
    int[] docStates = new int[docs];
    for (int i = 0; i < docs; i++) {
      Document doc = new Document();
      if (random.nextBoolean()) {
        docStates[i] = 1;
        doc.add(newField("some", "value", TextField.TYPE_STORED));
      }
      doc.add(newField("all", "test", TextField.TYPE_UNSTORED));
      doc.add(newField("id", "" + i, TextField.TYPE_STORED));
      writer.addDocument(doc);
    }
    writer.commit();
    int numDeletes = random.nextInt(docs);
    for (int i = 0; i < numDeletes; i++) {
      int docID = random.nextInt(docs);
      writer.deleteDocuments(new Term("id", "" + docID));
      docStates[docID] = 2;
    }
    writer.close();
    return docStates;
  }
 }