From c348368de105ddf14e9c525194a86f321571875d Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 30 Jun 2015 09:16:02 +0000
Subject: [PATCH] LUCENE-6633: Remove DuplicateFilter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1688409 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |   4 +
 .../xml/CorePlusExtensionsParser.java         |   1 -
 .../xml/builders/DuplicateFilterBuilder.java  |  61 -----
 .../queryparser/xml/DuplicateFilterQuery.xml  |  29 ---
 .../lucene/queryparser/xml/TestParser.java    |   8 -
 .../sandbox/queries/DuplicateFilter.java      | 234 ------------------
 .../sandbox/queries/DuplicateFilterTest.java  | 196 ---------------
 7 files changed, 4 insertions(+), 529 deletions(-)
 delete mode 100644 lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/DuplicateFilterBuilder.java
 delete mode 100644 lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/DuplicateFilterQuery.xml
 delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java
 delete mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/DuplicateFilterTest.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index af25fb64541..cde7340edac 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -268,6 +268,10 @@ Changes in Backwards Compatibility Policy
   always includes deleted docs, so you have to check for deleted documents on
   top of the iterator. (Adrien Grand)
 
+* LUCENE-6633: DuplicateFilter has been deprecated and will be removed in 6.0.
+  DiversifiedTopDocsCollector can be used instead with a maximum number of hits
+  per key equal to 1. (Adrien Grand)
+
 ======================= Lucene 5.2.1 =======================
 
 Bug Fixes
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CorePlusExtensionsParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CorePlusExtensionsParser.java
index 456f6d1e168..88536a597f4 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CorePlusExtensionsParser.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/CorePlusExtensionsParser.java
@@ -49,7 +49,6 @@ public class CorePlusExtensionsParser extends CoreParser {
 
   private CorePlusExtensionsParser(String defaultField, Analyzer analyzer, QueryParser parser) {
     super(defaultField, analyzer, parser);
-    queryFactory.addBuilder("DuplicateFilter", new DuplicateFilterBuilder());
     String fields[] = {"contents"};
     queryFactory.addBuilder("LikeThisQuery", new LikeThisQueryBuilder(analyzer, fields));
     queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/DuplicateFilterBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/DuplicateFilterBuilder.java
deleted file mode 100644
index 02b908746c9..00000000000
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/DuplicateFilterBuilder.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Created on 25-Jan-2006
- */
-package org.apache.lucene.queryparser.xml.builders;
-
-import org.apache.lucene.queryparser.xml.DOMUtils;
-import org.apache.lucene.queryparser.xml.ParserException;
-import org.apache.lucene.queryparser.xml.QueryBuilder;
-import org.apache.lucene.sandbox.queries.DuplicateFilter;
-import org.apache.lucene.search.Filter;
-import org.w3c.dom.Element;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Builder for {@link DuplicateFilter}
- */
-public class DuplicateFilterBuilder implements QueryBuilder {
-
-  @Override
-  public Filter getQuery(Element e) throws ParserException {
-    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
-    DuplicateFilter df = new DuplicateFilter(fieldName);
-
-    String keepMode = DOMUtils.getAttribute(e, "keepMode", "first");
-    if (keepMode.equalsIgnoreCase("first")) {
-      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
-    } else if (keepMode.equalsIgnoreCase("last")) {
-      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
-    } else {
-      throw new ParserException("Illegal keepMode attribute in DuplicateFilter:" + keepMode);
-    }
-
-    String processingMode = DOMUtils.getAttribute(e, "processingMode", "full");
-    if (processingMode.equalsIgnoreCase("full")) {
-      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
-    } else if (processingMode.equalsIgnoreCase("fast")) {
-      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
-    } else {
-      throw new ParserException("Illegal processingMode attribute in DuplicateFilter:" + processingMode);
-    }
-
-    return df;
-  }
-
-}
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/DuplicateFilterQuery.xml b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/DuplicateFilterQuery.xml
deleted file mode 100644
index 979c1983d08..00000000000
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/DuplicateFilterQuery.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-[29 deleted lines of XML; the element markup did not survive extraction, and only the text values "money" and "19870408" remain]
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java
index 94e5902e9cb..94c795e91c7 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java
@@ -186,14 +186,6 @@ public class TestParser extends LuceneTestCase {
     dumpResults("Cached filter", q, 5);
   }
 
-  public void testDuplicateFilterQueryXML() throws ParserException, IOException {
-    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
-    Assume.assumeTrue(leaves.size() == 1);
-    Query q = parse("DuplicateFilterQuery.xml");
-    int h = searcher.search(q, 1000).totalHits;
-    assertEquals("DuplicateFilterQuery should produce 1 result ", 1, h);
-  }
-
   public void testNumericRangeQueryQueryXML() throws ParserException, IOException {
     Query q = parse("NumericRangeQueryQuery.xml");
     dumpResults("NumericRangeQuery", q, 5);
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java
deleted file mode 100644
index db02d22cd30..00000000000
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/DuplicateFilter.java
+++ /dev/null
@@ -1,234 +0,0 @@
-package org.apache.lucene.sandbox.queries;
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.BitsFilteredDocIdSet;
-import org.apache.lucene.search.DocIdSet;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Filter;
-import org.apache.lucene.util.BitDocIdSet;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.SparseFixedBitSet;
-
-/**
- * Filter to remove duplicate values from search results.
- *
- * WARNING: for this to work correctly, you may have to wrap
- * your reader as it cannot currently deduplicate across different
- * index segments.
- *
- * @see SlowCompositeReaderWrapper
- */
-public class DuplicateFilter extends Filter {
-  // TODO: make duplicate filter aware of ReaderContext such that we can
-  // filter duplicates across segments
-
-  /**
-   * KeepMode determines which document id to consider as the master; all others
-   * are identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
-   */
-  public enum KeepMode {
-    KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
-  }
-
-  private KeepMode keepMode;
-
-  /**
-   * "Full" processing mode starts by setting all bits to false and only setting bits
-   * for documents that contain the given field and are identified as non-duplicates.
-   *
- * "Fast" processing sets all bits to true then unsets all duplicate docs found for the - * given field. This approach avoids the need to read DocsEnum for terms that are seen - * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially - * faster approach , the downside is that bitsets produced will include bits set for - * documents that do not actually contain the field given. - */ - - public enum ProcessingMode { - PM_FULL_VALIDATION, PM_FAST_INVALIDATION - } - - private ProcessingMode processingMode; - - private String fieldName; - - public DuplicateFilter(String fieldName) { - this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION); - } - - public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) { - this.fieldName = fieldName; - this.keepMode = keepMode; - this.processingMode = processingMode; - } - - @Override - public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException { - if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) { - return fastBits(context.reader(), acceptDocs); - } else { - return correctBits(context.reader(), acceptDocs); - } - } - - private DocIdSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException { - SparseFixedBitSet bits = new SparseFixedBitSet(reader.maxDoc()); //assume all are INvalid - Terms terms = reader.fields().terms(fieldName); - - if (terms != null) { - TermsEnum termsEnum = terms.iterator(); - PostingsEnum docs = null; - while (true) { - BytesRef currTerm = termsEnum.next(); - if (currTerm == null) { - break; - } else { - docs = termsEnum.postings(docs, PostingsEnum.NONE); - int doc = docs.nextDoc(); - if (doc != DocIdSetIterator.NO_MORE_DOCS) { - if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) { - bits.set(doc); - } else { - int lastDoc = doc; - while (true) { - lastDoc = doc; - doc = docs.nextDoc(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - } - bits.set(lastDoc); - } - } - } - } - } - return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits, bits.approximateCardinality()), acceptDocs); - } - - private DocIdSet fastBits(LeafReader reader, Bits acceptDocs) throws IOException { - FixedBitSet bits = new FixedBitSet(reader.maxDoc()); - bits.set(0, reader.maxDoc()); //assume all are valid - Terms terms = reader.fields().terms(fieldName); - - if (terms != null) { - TermsEnum termsEnum = terms.iterator(); - PostingsEnum docs = null; - while (true) { - BytesRef currTerm = termsEnum.next(); - if (currTerm == null) { - break; - } else { - if (termsEnum.docFreq() > 1) { - // unset potential duplicates - docs = termsEnum.postings(docs, PostingsEnum.NONE); - int doc = docs.nextDoc(); - if (doc != DocIdSetIterator.NO_MORE_DOCS) { - if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) { - doc = docs.nextDoc(); - } - } - - int lastDoc = -1; - while (true) { - lastDoc = doc; - bits.clear(lastDoc); - doc = docs.nextDoc(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - } - - if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) { - // restore the last bit - bits.set(lastDoc); - } - } - } - } - } - - return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits), acceptDocs); - } - - public String getFieldName() { - return fieldName; - } - - public void setFieldName(String fieldName) { - this.fieldName = fieldName; - } - - public KeepMode getKeepMode() { - return keepMode; - } - - public void setKeepMode(KeepMode keepMode) { - this.keepMode = keepMode; - } - - @Override - public boolean equals(Object obj) { - 
if (this == obj) { - return true; - } - if (super.equals(obj) == false) { - return false; - } - - DuplicateFilter other = (DuplicateFilter) obj; - return keepMode == other.keepMode && - processingMode == other.processingMode && - fieldName != null && fieldName.equals(other.fieldName); - } - - @Override - public String toString(String field) { - return "DuplicateFilter(" + - "fieldName=" + fieldName +"," + - "keepMode=" + (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE ? "first" : "last") + "," + - "processingMode=" + (processingMode == ProcessingMode.PM_FAST_INVALIDATION ? "fast" : "full") + - ")"; - } - - @Override - public int hashCode() { - int hash = super.hashCode(); - hash = 31 * hash + keepMode.hashCode(); - hash = 31 * hash + processingMode.hashCode(); - hash = 31 * hash + fieldName.hashCode(); - return hash; - } - - public ProcessingMode getProcessingMode() { - return processingMode; - } - - public void setProcessingMode(ProcessingMode processingMode) { - this.processingMode = processingMode; - } -} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/DuplicateFilterTest.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/DuplicateFilterTest.java deleted file mode 100644 index 18fba7b2448..00000000000 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/DuplicateFilterTest.java +++ /dev/null @@ -1,196 +0,0 @@ -package org.apache.lucene.sandbox.queries; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-import java.io.IOException;
-import java.util.HashSet;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.*;
-import org.apache.lucene.search.BooleanClause.Occur;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
-
-public class DuplicateFilterTest extends LuceneTestCase {
-  private static final String KEY_FIELD = "url";
-  private Directory directory;
-  private IndexReader reader;
-  TermQuery tq = new TermQuery(new Term("text", "lucene"));
-  private IndexSearcher searcher;
-  Analyzer analyzer;
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    directory = newDirectory();
-    analyzer = new MockAnalyzer(random());
-    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));
-
-    // Add series of docs with filterable fields: url, text and date
-    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
-    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
-    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
-    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
-    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
-    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
-    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
-    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
-
-    // Until we fix LUCENE-2348, the index must
-    // have only 1 segment:
-    writer.forceMerge(1);
-
-    reader = writer.getReader();
-    writer.close();
-    searcher = newSearcher(reader);
-
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    IOUtils.close(reader, directory, analyzer);
-    super.tearDown();
-  }
-
-  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
-    Document doc = new Document();
-    doc.add(newStringField(KEY_FIELD, url, Field.Store.YES));
-    doc.add(newTextField("text", text, Field.Store.YES));
-    doc.add(newTextField("date", date, Field.Store.YES));
-    writer.addDocument(doc);
-  }
-
-  public void testDefaultFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    HashSet<String> results = new HashSet<>();
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      assertFalse("No duplicate urls should be returned", results.contains(url));
-      results.add(url);
-    }
-  }
-
-  public void testNoFilter() throws Throwable {
-    HashSet<String> results = new HashSet<>();
-    ScoreDoc[] hits = searcher.search(tq, 1000).scoreDocs;
-    assertTrue("Default searching should have found some matches", hits.length > 0);
-    boolean dupsFound = false;
-
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      if (!dupsFound)
-        dupsFound = results.contains(url);
-      results.add(url);
-    }
-    assertTrue("Default searching should have found duplicate urls", dupsFound);
-  }
-
-  public void testFastFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
-    HashSet<String> results = new HashSet<>();
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches", hits.length > 0);
-
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      assertFalse("No duplicate urls should be returned", results.contains(url));
-      results.add(url);
-    }
-    assertEquals("Two urls found", 2, results.size());
-  }
-
-  public void testKeepsLastFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches", hits.length > 0);
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      PostingsEnum td = TestUtil.docs(random(), reader,
-          KEY_FIELD,
-          new BytesRef(url),
-          null,
-          0);
-
-      int lastDoc = 0;
-      while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-        lastDoc = td.docID();
-      }
-      assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
-    }
-  }
-
-  public void testKeepsFirstFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches", hits.length > 0);
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      PostingsEnum td = TestUtil.docs(random(), reader,
-          KEY_FIELD,
-          new BytesRef(url),
-          null,
-          0);
-
-      int lastDoc = 0;
-      td.nextDoc();
-      lastDoc = td.docID();
-      assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
-    }
-  }
-
-}
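
For anyone migrating off DuplicateFilter, the CHANGES entry above points at DiversifiedTopDocsCollector (the misc-module collector added in 5.2) with maxHitsPerKey set to 1. Below is a minimal sketch of that replacement against the 5.x API, not part of this patch: it assumes the dedup key ("url", mirroring the test above) is additionally indexed as a SortedDocValuesField, and the hash-based key mapping plus the DedupSearchExample wrapper are illustrative only.

import java.io.IOException;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DiversifiedTopDocsCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

public final class DedupSearchExample {

  // Returns at most one hit per "url" value: DuplicateFilter's behavior,
  // expressed as diversification with maxHitsPerKey = 1.
  public static TopDocs searchDeduplicated(IndexSearcher searcher) throws IOException {
    DiversifiedTopDocsCollector collector = new DiversifiedTopDocsCollector(1000, 1) {
      @Override
      protected NumericDocValues getKeys(LeafReaderContext context) {
        try {
          // Assumes docs were indexed with new SortedDocValuesField("url", ...).
          final SortedDocValues urls = DocValues.getSorted(context.reader(), "url");
          return new NumericDocValues() {
            @Override
            public long get(int docID) {
              // Docs sharing a url share a key; a (rare) hash collision
              // between distinct urls would conflate them.
              return urls.get(docID).hashCode();
            }
          };
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };
    searcher.search(new TermQuery(new Term("text", "lucene")), collector);
    return collector.topDocs();
  }
}

One side effect worth noting: because the key is derived from the field value itself rather than from per-segment term iteration, this approach deduplicates across segments, sidestepping the single-segment limitation that DuplicateFilter's javadoc warns about.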