LUCENE-6633: Remove DuplicateFilter.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1688409 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2015-06-30 09:16:02 +00:00
parent e65dccbbb1
commit c348368de1
7 changed files with 4 additions and 529 deletions

@@ -268,6 +268,10 @@ Changes in Backwards Compatibility Policy
  always includes deleted docs, so you have to check for deleted documents on
  top of the iterator. (Adrien Grand)

* LUCENE-6633: DuplicateFilter has been deprecated and will be removed in 6.0.
  DiversifiedTopDocsCollector can be used instead with a maximum number of hits
  per key equal to 1. (Adrien Grand)
======================= Lucene 5.2.1 =======================
Bug Fixes
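
As the LUCENE-6633 entry above suggests, DiversifiedTopDocsCollector (from the lucene/misc module) can stand in for DuplicateFilter when configured to keep at most one hit per key. A minimal sketch, assuming the deduplication key is a single-valued SortedDocValues field named "url" and that per-segment ordinals are acceptable as keys; like DuplicateFilter itself, this variant only deduplicates within one segment, so a segment-independent numeric key would be needed for global deduplication:

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

public class DedupSearchExample {
  // Collect the top 10 hits, keeping at most 1 document per key.
  public static TopDocs searchDeduped(IndexSearcher searcher, Query query) throws IOException {
    DiversifiedTopDocsCollector collector = new DiversifiedTopDocsCollector(10, 1) {
      @Override
      protected NumericDocValues getKeys(final LeafReaderContext context) {
        try {
          // Assumed: the "url" key field is indexed as SortedDocValues and is
          // populated on every document (docs without a value share ord -1 and
          // would otherwise count as duplicates of one another).
          final SortedDocValues keys = DocValues.getSorted(context.reader(), "url");
          return new NumericDocValues() {
            @Override
            public long get(int docID) {
              return keys.getOrd(docID); // per-segment ordinal of the "url" value
            }
          };
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };
    searcher.search(query, collector);
    return collector.topDocs();
  }
}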

@@ -49,7 +49,6 @@ public class CorePlusExtensionsParser extends CoreParser {
  private CorePlusExtensionsParser(String defaultField, Analyzer analyzer, QueryParser parser) {
    super(defaultField, analyzer, parser);
    queryFactory.addBuilder("DuplicateFilter", new DuplicateFilterBuilder());
    String fields[] = {"contents"};
    queryFactory.addBuilder("LikeThisQuery", new LikeThisQueryBuilder(analyzer, fields));
    queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));

@ -1,61 +0,0 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.queryparser.xml.builders;

import org.apache.lucene.queryparser.xml.DOMUtils;
import org.apache.lucene.queryparser.xml.ParserException;
import org.apache.lucene.queryparser.xml.QueryBuilder;
import org.apache.lucene.sandbox.queries.DuplicateFilter;
import org.apache.lucene.search.Filter;
import org.w3c.dom.Element;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Builder for {@link DuplicateFilter}
 */
public class DuplicateFilterBuilder implements QueryBuilder {

  @Override
  public Filter getQuery(Element e) throws ParserException {
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
    DuplicateFilter df = new DuplicateFilter(fieldName);

    String keepMode = DOMUtils.getAttribute(e, "keepMode", "first");
    if (keepMode.equalsIgnoreCase("first")) {
      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
    } else if (keepMode.equalsIgnoreCase("last")) {
      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    } else {
      throw new ParserException("Illegal keepMode attribute in DuplicateFilter:" + keepMode);
    }

    String processingMode = DOMUtils.getAttribute(e, "processingMode", "full");
    if (processingMode.equalsIgnoreCase("full")) {
      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
    } else if (processingMode.equalsIgnoreCase("fast")) {
      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
    } else {
      throw new ParserException("Illegal processingMode attribute in DuplicateFilter:" + processingMode);
    }

    return df;
  }
}

@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<BooleanQuery fieldName="contents">
  <Clause occurs="should">
    <TermQuery>money</TermQuery>
  </Clause>
  <Clause occurs="must">
    <TermQuery fieldName="date">19870408</TermQuery>
  </Clause>
  <Clause occurs="filter">
    <!-- Filters to last document with this date -->
    <DuplicateFilter fieldName="date" keepMode="last"/>
  </Clause>
</BooleanQuery>

@@ -186,14 +186,6 @@ public class TestParser extends LuceneTestCase {
dumpResults("Cached filter", q, 5);
}
public void testDuplicateFilterQueryXML() throws ParserException, IOException {
List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
Assume.assumeTrue(leaves.size() == 1);
Query q = parse("DuplicateFilterQuery.xml");
int h = searcher.search(q, 1000).totalHits;
assertEquals("DuplicateFilterQuery should produce 1 result ", 1, h);
}
public void testNumericRangeQueryQueryXML() throws ParserException, IOException {
Query q = parse("NumericRangeQueryQuery.xml");
dumpResults("NumericRangeQuery", q, 5);

@@ -1,234 +0,0 @@
package org.apache.lucene.sandbox.queries;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BitsFilteredDocIdSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.SparseFixedBitSet;
/**
 * Filter to remove duplicate values from search results.
 * <p>
 * WARNING: for this to work correctly, you may have to wrap
 * your reader as it cannot currently deduplicate across different
 * index segments.
 *
 * @see SlowCompositeReaderWrapper
 */
public class DuplicateFilter extends Filter {
  // TODO: make duplicate filter aware of ReaderContext such that we can
  // filter duplicates across segments

  /**
   * KeepMode determines which document id to consider as the master, all others being
   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
   */
  public enum KeepMode {
    KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
  }

  private KeepMode keepMode;

  /**
   * "Full" processing mode starts by setting all bits to false and only setting bits
   * for documents that contain the given field and are identified as non-duplicates.
   * <p>
   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
   * given field. This approach avoids the need to read the postings for terms that are seen
   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
   * faster approach, the downside is that bitsets produced will include bits set for
   * documents that do not actually contain the field given.
   */
  public enum ProcessingMode {
    PM_FULL_VALIDATION, PM_FAST_INVALIDATION
  }

  private ProcessingMode processingMode;

  private String fieldName;

  public DuplicateFilter(String fieldName) {
    this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
  }

  public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
    this.fieldName = fieldName;
    this.keepMode = keepMode;
    this.processingMode = processingMode;
  }
  @Override
  public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
    if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
      return fastBits(context.reader(), acceptDocs);
    } else {
      return correctBits(context.reader(), acceptDocs);
    }
  }

  private DocIdSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException {
    SparseFixedBitSet bits = new SparseFixedBitSet(reader.maxDoc()); // assume all are INvalid
    Terms terms = reader.fields().terms(fieldName);

    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      PostingsEnum docs = null;
      while (true) {
        BytesRef currTerm = termsEnum.next();
        if (currTerm == null) {
          break;
        } else {
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          int doc = docs.nextDoc();
          if (doc != DocIdSetIterator.NO_MORE_DOCS) {
            if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
              bits.set(doc);
            } else {
              int lastDoc = doc;
              while (true) {
                lastDoc = doc;
                doc = docs.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                  break;
                }
              }
              bits.set(lastDoc);
            }
          }
        }
      }
    }
    return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits, bits.approximateCardinality()), acceptDocs);
  }

  private DocIdSet fastBits(LeafReader reader, Bits acceptDocs) throws IOException {
    FixedBitSet bits = new FixedBitSet(reader.maxDoc());
    bits.set(0, reader.maxDoc()); // assume all are valid
    Terms terms = reader.fields().terms(fieldName);

    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      PostingsEnum docs = null;
      while (true) {
        BytesRef currTerm = termsEnum.next();
        if (currTerm == null) {
          break;
        } else {
          if (termsEnum.docFreq() > 1) {
            // unset potential duplicates
            docs = termsEnum.postings(docs, PostingsEnum.NONE);
            int doc = docs.nextDoc();
            if (doc != DocIdSetIterator.NO_MORE_DOCS) {
              if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
                doc = docs.nextDoc();
              }
            }

            int lastDoc = -1;
            while (true) {
              lastDoc = doc;
              bits.clear(lastDoc);
              doc = docs.nextDoc();
              if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                break;
              }
            }

            if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
              // restore the last bit
              bits.set(lastDoc);
            }
          }
        }
      }
    }
    return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits), acceptDocs);
  }

  public String getFieldName() {
    return fieldName;
  }

  public void setFieldName(String fieldName) {
    this.fieldName = fieldName;
  }

  public KeepMode getKeepMode() {
    return keepMode;
  }

  public void setKeepMode(KeepMode keepMode) {
    this.keepMode = keepMode;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (super.equals(obj) == false) {
      return false;
    }
    DuplicateFilter other = (DuplicateFilter) obj;
    return keepMode == other.keepMode &&
        processingMode == other.processingMode &&
        fieldName != null && fieldName.equals(other.fieldName);
  }

  @Override
  public String toString(String field) {
    return "DuplicateFilter(" +
        "fieldName=" + fieldName + "," +
        "keepMode=" + (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE ? "first" : "last") + "," +
        "processingMode=" + (processingMode == ProcessingMode.PM_FAST_INVALIDATION ? "fast" : "full") +
        ")";
  }

  @Override
  public int hashCode() {
    int hash = super.hashCode();
    hash = 31 * hash + keepMode.hashCode();
    hash = 31 * hash + processingMode.hashCode();
    hash = 31 * hash + fieldName.hashCode();
    return hash;
  }

  public ProcessingMode getProcessingMode() {
    return processingMode;
  }

  public void setProcessingMode(ProcessingMode processingMode) {
    this.processingMode = processingMode;
  }
}
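
The class javadoc's warning follows from the per-segment getDocIdSet contract above: duplicates are only detected within one leaf, which is also why the test below forces a single segment. A minimal usage sketch under that constraint, flattening a possibly multi-segment index with SlowCompositeReaderWrapper and deduplicating on an assumed "url" key field (Filter still extends Query at this point, so the filter can serve as a FILTER clause, as the tests below also do):

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;

// Sketch: deduplicate on "url", keeping the last occurrence (the default mode).
static TopDocs searchDeduped(Directory directory) throws IOException {
  // Flatten the index into a single leaf so the filter sees all
  // occurrences of each key at once.
  LeafReader leaf = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(directory));
  IndexSearcher searcher = new IndexSearcher(leaf);
  Query query = new BooleanQuery.Builder()
      .add(new TermQuery(new Term("text", "lucene")), Occur.MUST)
      .add(new DuplicateFilter("url"), Occur.FILTER)
      .build();
  return searcher.search(query, 10);
}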

@@ -1,196 +0,0 @@
package org.apache.lucene.sandbox.queries;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class DuplicateFilterTest extends LuceneTestCase {
  private static final String KEY_FIELD = "url";
  private Directory directory;
  private IndexReader reader;
  TermQuery tq = new TermQuery(new Term("text", "lucene"));
  private IndexSearcher searcher;
  Analyzer analyzer;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    analyzer = new MockAnalyzer(random());
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));

    // Add a series of docs with filterable fields: url, text and date
    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

    // Until we fix LUCENE-2348, the index must
    // have only 1 segment:
    writer.forceMerge(1);

    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    IOUtils.close(reader, directory, analyzer);
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
    Document doc = new Document();
    doc.add(newStringField(KEY_FIELD, url, Field.Store.YES));
    doc.add(newTextField("text", text, Field.Store.YES));
    doc.add(newTextField("date", date, Field.Store.YES));
    writer.addDocument(doc);
  }
  public void testDefaultFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    HashSet<String> results = new HashSet<>();
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
  }

  public void testNoFilter() throws Throwable {
    HashSet<String> results = new HashSet<>();
    ScoreDoc[] hits = searcher.search(tq, 1000).scoreDocs;
    assertTrue("Default searching should have found some matches", hits.length > 0);
    boolean dupsFound = false;

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      if (!dupsFound) {
        dupsFound = results.contains(url);
      }
      results.add(url);
    }
    assertTrue("Default searching should have found duplicate urls", dupsFound);
  }

  public void testFastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
    HashSet<String> results = new HashSet<>();
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
    assertEquals("Two urls found", 2, results.size());
  }

  public void testKeepsLastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      PostingsEnum td = TestUtil.docs(random(), reader,
          KEY_FIELD,
          new BytesRef(url),
          null,
          0);

      int lastDoc = 0;
      while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        lastDoc = td.docID();
      }
      assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
    }
  }

  public void testKeepsFirstFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
    Query query = new BooleanQuery.Builder()
        .add(tq, Occur.MUST)
        .add(df, Occur.FILTER)
        .build();
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      PostingsEnum td = TestUtil.docs(random(), reader,
          KEY_FIELD,
          new BytesRef(url),
          null,
          0);

      int lastDoc = 0;
      td.nextDoc();
      lastDoc = td.docID();
      assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
    }
  }
}