LUCENE-6633: Remove DuplicateFilter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1688409 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2015-06-30 09:16:02 +00:00
parent e65dccbbb1
commit c348368de1
7 changed files with 4 additions and 529 deletions

@@ -268,6 +268,10 @@ Changes in Backwards Compatibility Policy
  always includes deleted docs, so you have to check for deleted documents on
  top of the iterator. (Adrien Grand)

* LUCENE-6633: DuplicateFilter has been deprecated and will be removed in 6.0.
  DiversifiedTopDocsCollector can be used instead with a maximum number of hits
  per key equal to 1. (Adrien Grand)
======================= Lucene 5.2.1 =======================
Bug Fixes
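
As the LUCENE-6633 entry above suggests, DiversifiedTopDocsCollector (from the lucene/misc module) can stand in for DuplicateFilter when configured to keep at most one hit per key. A minimal sketch, assuming the deduplication key is a single-valued SortedDocValues field named "url" and that per-segment ordinals are acceptable as keys; like DuplicateFilter itself, this variant only deduplicates within one segment, so a segment-independent numeric key would be needed for global deduplication:

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

public class DedupSearchExample {
  // Collect the top 10 hits, keeping at most 1 document per key.
  public static TopDocs searchDeduped(IndexSearcher searcher, Query query) throws IOException {
    DiversifiedTopDocsCollector collector = new DiversifiedTopDocsCollector(10, 1) {
      @Override
      protected NumericDocValues getKeys(final LeafReaderContext context) {
        try {
          // Assumed: the "url" key field is indexed as SortedDocValues and is
          // populated on every document (docs without a value share ord -1 and
          // would otherwise count as duplicates of one another).
          final SortedDocValues keys = DocValues.getSorted(context.reader(), "url");
          return new NumericDocValues() {
            @Override
            public long get(int docID) {
              return keys.getOrd(docID); // per-segment ordinal of the "url" value
            }
          };
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };
    searcher.search(query, collector);
    return collector.topDocs();
  }
}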

@@ -49,7 +49,6 @@ public class CorePlusExtensionsParser extends CoreParser {
  private CorePlusExtensionsParser(String defaultField, Analyzer analyzer, QueryParser parser) {
    super(defaultField, analyzer, parser);
    queryFactory.addBuilder("DuplicateFilter", new DuplicateFilterBuilder());
    String fields[] = {"contents"};
    queryFactory.addBuilder("LikeThisQuery", new LikeThisQueryBuilder(analyzer, fields));
    queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));

@ -1,61 +0,0 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.queryparser.xml.builders;

import org.apache.lucene.queryparser.xml.DOMUtils;
import org.apache.lucene.queryparser.xml.ParserException;
import org.apache.lucene.queryparser.xml.QueryBuilder;
import org.apache.lucene.sandbox.queries.DuplicateFilter;
import org.apache.lucene.search.Filter;
import org.w3c.dom.Element;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Builder for {@link DuplicateFilter}
 */
public class DuplicateFilterBuilder implements QueryBuilder {

  @Override
  public Filter getQuery(Element e) throws ParserException {
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
    DuplicateFilter df = new DuplicateFilter(fieldName);

    String keepMode = DOMUtils.getAttribute(e, "keepMode", "first");
    if (keepMode.equalsIgnoreCase("first")) {
      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
    } else if (keepMode.equalsIgnoreCase("last")) {
      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    } else {
      throw new ParserException("Illegal keepMode attribute in DuplicateFilter:" + keepMode);
    }

    String processingMode = DOMUtils.getAttribute(e, "processingMode", "full");
    if (processingMode.equalsIgnoreCase("full")) {
      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
    } else if (processingMode.equalsIgnoreCase("fast")) {
      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
    } else {
      throw new ParserException("Illegal processingMode attribute in DuplicateFilter:" + processingMode);
    }

    return df;
  }
}

@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<BooleanQuery fieldName="contents">
  <Clause occurs="should">
    <TermQuery>money</TermQuery>
  </Clause>
  <Clause occurs="must">
    <TermQuery fieldName="date">19870408</TermQuery>
  </Clause>
  <Clause occurs="filter">
    <!-- Filters to last document with this date -->
    <DuplicateFilter fieldName="date" keepMode="last"/>
  </Clause>
</BooleanQuery>

@@ -186,14 +186,6 @@ public class TestParser extends LuceneTestCase {
dumpResults("Cached filter", q, 5);
}
public void testDuplicateFilterQueryXML() throws ParserException, IOException {
List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
Assume.assumeTrue(leaves.size() == 1);
Query q = parse("DuplicateFilterQuery.xml");
int h = searcher.search(q, 1000).totalHits;
assertEquals("DuplicateFilterQuery should produce 1 result ", 1, h);
}
public void testNumericRangeQueryQueryXML() throws ParserException, IOException {
Query q = parse("NumericRangeQueryQuery.xml");
dumpResults("NumericRangeQuery", q, 5);

@@ -1,234 +0,0 @@
package org.apache.lucene.sandbox.queries;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BitsFilteredDocIdSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.SparseFixedBitSet;
/**
 * Filter to remove duplicate values from search results.
 * <p>
 * WARNING: for this to work correctly, you may have to wrap
 * your reader as it cannot currently deduplicate across different
 * index segments.
 *
 * @see SlowCompositeReaderWrapper
 */
public class DuplicateFilter extends Filter {
  // TODO: make duplicate filter aware of ReaderContext such that we can
  // filter duplicates across segments

  /**
   * KeepMode determines which document id to consider as the master, all others being
   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
   */
  public enum KeepMode {
    KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
  }

  private KeepMode keepMode;

  /**
   * "Full" processing mode starts by setting all bits to false and only setting bits
   * for documents that contain the given field and are identified as non-duplicates.
   * <p>
   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
   * given field. This approach avoids the need to read the postings for terms that are seen
   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
   * faster approach, the downside is that bitsets produced will include bits set for
   * documents that do not actually contain the field given.
   */
  public enum ProcessingMode {
    PM_FULL_VALIDATION, PM_FAST_INVALIDATION
  }

  private ProcessingMode processingMode;

  private String fieldName;

  public DuplicateFilter(String fieldName) {
    this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
  }

  public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
    this.fieldName = fieldName;
    this.keepMode = keepMode;
    this.processingMode = processingMode;
  }
  @Override
  public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
    if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
      return fastBits(context.reader(), acceptDocs);
    } else {
      return correctBits(context.reader(), acceptDocs);
    }
  }

  private DocIdSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException {
    SparseFixedBitSet bits = new SparseFixedBitSet(reader.maxDoc()); // assume all are INvalid
    Terms terms = reader.fields().terms(fieldName);

    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      PostingsEnum docs = null;
      while (true) {
        BytesRef currTerm = termsEnum.next();
        if (currTerm == null) {
          break;
        } else {
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          int doc = docs.nextDoc();
          if (doc != DocIdSetIterator.NO_MORE_DOCS) {
            if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
              bits.set(doc);
            } else {
              int lastDoc = doc;
              while (true) {
                lastDoc = doc;
                doc = docs.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                  break;
                }
              }
              bits.set(lastDoc);
            }
          }
        }
      }
    }
    return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits, bits.approximateCardinality()), acceptDocs);
  }

  private DocIdSet fastBits(LeafReader reader, Bits acceptDocs) throws IOException {
    FixedBitSet bits = new FixedBitSet(reader.maxDoc());
    bits.set(0, reader.maxDoc()); // assume all are valid
    Terms terms = reader.fields().terms(fieldName);

    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      PostingsEnum docs = null;
      while (true) {
        BytesRef currTerm = termsEnum.next();
        if (currTerm == null) {
          break;
        } else {
          if (termsEnum.docFreq() > 1) {
            // unset potential duplicates
            docs = termsEnum.postings(docs, PostingsEnum.NONE);
            int doc = docs.nextDoc();
            if (doc != DocIdSetIterator.NO_MORE_DOCS) {
              if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
                doc = docs.nextDoc();
              }
            }

            int lastDoc = -1;
            while (true) {
              lastDoc = doc;
              bits.clear(lastDoc);
              doc = docs.nextDoc();
              if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                break;
              }
            }

            if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
              // restore the last bit
              bits.set(lastDoc);
            }
          }
        }
      }
    }
    return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits), acceptDocs);
  }

  public String getFieldName() {
    return fieldName;
  }

  public void setFieldName(String fieldName) {
    this.fieldName = fieldName;
  }

  public KeepMode getKeepMode() {
    return keepMode;
  }

  public void setKeepMode(KeepMode keepMode) {
    this.keepMode = keepMode;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (super.equals(obj) == false) {
      return false;
    }
    DuplicateFilter other = (DuplicateFilter) obj;
    return keepMode == other.keepMode &&
        processingMode == other.processingMode &&
        fieldName != null && fieldName.equals(other.fieldName);
  }

  @Override
  public String toString(String field) {
    return "DuplicateFilter(" +
        "fieldName=" + fieldName + "," +
        "keepMode=" + (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE ? "first" : "last") + "," +
        "processingMode=" + (processingMode == ProcessingMode.PM_FAST_INVALIDATION ? "fast" : "full") +
        ")";
  }

  @Override
  public int hashCode() {
    int hash = super.hashCode();
    hash = 31 * hash + keepMode.hashCode();
    hash = 31 * hash + processingMode.hashCode();
    hash = 31 * hash + fieldName.hashCode();
    return hash;
  }

  public ProcessingMode getProcessingMode() {
    return processingMode;
  }

  public void setProcessingMode(ProcessingMode processingMode) {
    this.processingMode = processingMode;
  }
}
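
The class javadoc's warning follows from the per-segment getDocIdSet contract above: duplicates are only detected within one leaf, which is also why the test below forces a single segment. A minimal usage sketch under that constraint, flattening a possibly multi-segment index with SlowCompositeReaderWrapper and deduplicating on an assumed "url" key field (Filter still extends Query at this point, so the filter can serve as a FILTER clause, as the tests below also do):

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;

// Sketch: deduplicate on "url", keeping the last occurrence (the default mode).
static TopDocs searchDeduped(Directory directory) throws IOException {
  // Flatten the index into a single leaf so the filter sees all
  // occurrences of each key at once.
  LeafReader leaf = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(directory));
  IndexSearcher searcher = new IndexSearcher(leaf);
  Query query = new BooleanQuery.Builder()
      .add(new TermQuery(new Term("text", "lucene")), Occur.MUST)
      .add(new DuplicateFilter("url"), Occur.FILTER)
      .build();
  return searcher.search(query, 10);
}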

@@ -1,196 +0,0 @@
package org.apache.lucene.sandbox.queries;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class DuplicateFilterTest extends LuceneTestCase {
  private static final String KEY_FIELD = "url";
  private Directory directory;
  private IndexReader reader;
  TermQuery tq = new TermQuery(new Term("text", "lucene"));
  private IndexSearcher searcher;
  Analyzer analyzer;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    analyzer = new MockAnalyzer(random());
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));

    // Add a series of docs with filterable fields: url, text and date
    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

    // Until we fix LUCENE-2348, the index must
    // have only 1 segment:
    writer.forceMerge(1);

    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    IOUtils.close(reader, directory, analyzer);
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
    Document doc = new Document();
    doc.add(newStringField(KEY_FIELD, url, Field.Store.YES));
    doc.add(newTextField("text", text, Field.Store.YES));
    doc.add(newTextField("date", date, Field.Store.YES));
    writer.addDocument(doc);
  }
  public void testDefaultFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    HashSet<String> results = new HashSet<>();
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
  }

  public void testNoFilter() throws Throwable {
    HashSet<String> results = new HashSet<>();
    ScoreDoc[] hits = searcher.search(tq, 1000).scoreDocs;
    assertTrue("Default searching should have found some matches", hits.length > 0);
    boolean dupsFound = false;

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      if (!dupsFound) {
        dupsFound = results.contains(url);
      }
      results.add(url);
    }
    assertTrue("Default searching should have found duplicate urls", dupsFound);
  }

  public void testFastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
    HashSet<String> results = new HashSet<>();
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
    assertEquals("Two urls found", 2, results.size());
  }

  public void testKeepsLastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      PostingsEnum td = TestUtil.docs(random(), reader,
          KEY_FIELD,
          new BytesRef(url),
          null,
          0);

      int lastDoc = 0;
      while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        lastDoc = td.docID();
      }
      assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
    }
  }

  public void testKeepsFirstFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      PostingsEnum td = TestUtil.docs(random(), reader,
          KEY_FIELD,
          new BytesRef(url),
          null,
          0);

      int lastDoc = 0;
      td.nextDoc();
      lastDoc = td.docID();
      assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
    }
  }
}