mirror of https://github.com/apache/lucene.git
LUCENE-6633: Remove DuplicateFilter.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1688409 13f79535-47bb-0310-9956-ffa450edef68
parent e65dccbbb1
commit c348368de1
@@ -268,6 +268,10 @@ Changes in Backwards Compatibility Policy
   always includes deleted docs, so you have to check for deleted documents on
   top of the iterator. (Adrien Grand)
 
+* LUCENE-6633: DuplicateFilter has been deprecated and will be removed in 6.0.
+  DiversifiedTopDocsCollector can be used instead with a maximum number of hits
+  per key equal to 1. (Adrien Grand)
+
 ======================= Lucene 5.2.1 =======================
 
 Bug Fixes
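The entry above names DiversifiedTopDocsCollector as the migration path. A minimal sketch of that migration, not part of this commit: DiversifiedTopDocsCollector is abstract and asks subclasses for a per-segment NumericDocValues mapping each document to its de-duplication key, and a maximum of 1 hit per key approximates what DuplicateFilter did. The subclass and the doc-values field name "urlKey" below are hypothetical.

import java.io.IOException;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DiversifiedTopDocsCollector;

// Hypothetical replacement for DuplicateFilter: keep at most one hit per key.
class DedupByKeyCollector extends DiversifiedTopDocsCollector {
  private final String keyField;

  DedupByKeyCollector(String keyField, int numHits) {
    super(numHits, 1); // maxHitsPerKey == 1 emulates de-duplication
    this.keyField = keyField;
  }

  @Override
  protected NumericDocValues getKeys(LeafReaderContext context) {
    try {
      // Assumes the key was indexed as a NumericDocValuesField named keyField.
      return DocValues.getNumeric(context.reader(), keyField);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
}

Usage would then look like: searcher.search(query, new DedupByKeyCollector("urlKey", 10)), followed by collector.topDocs().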
@@ -49,7 +49,6 @@ public class CorePlusExtensionsParser extends CoreParser {
 
   private CorePlusExtensionsParser(String defaultField, Analyzer analyzer, QueryParser parser) {
     super(defaultField, analyzer, parser);
-    queryFactory.addBuilder("DuplicateFilter", new DuplicateFilterBuilder());
     String fields[] = {"contents"};
     queryFactory.addBuilder("LikeThisQuery", new LikeThisQueryBuilder(analyzer, fields));
     queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
@@ -1,61 +0,0 @@
-/*
- * Created on 25-Jan-2006
- */
-package org.apache.lucene.queryparser.xml.builders;
-
-import org.apache.lucene.queryparser.xml.DOMUtils;
-import org.apache.lucene.queryparser.xml.ParserException;
-import org.apache.lucene.queryparser.xml.QueryBuilder;
-import org.apache.lucene.sandbox.queries.DuplicateFilter;
-import org.apache.lucene.search.Filter;
-import org.w3c.dom.Element;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Builder for {@link DuplicateFilter}
- */
-public class DuplicateFilterBuilder implements QueryBuilder {
-
-  @Override
-  public Filter getQuery(Element e) throws ParserException {
-    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
-    DuplicateFilter df = new DuplicateFilter(fieldName);
-
-    String keepMode = DOMUtils.getAttribute(e, "keepMode", "first");
-    if (keepMode.equalsIgnoreCase("first")) {
-      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
-    } else if (keepMode.equalsIgnoreCase("last")) {
-      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
-    } else {
-      throw new ParserException("Illegal keepMode attribute in DuplicateFilter:" + keepMode);
-    }
-
-    String processingMode = DOMUtils.getAttribute(e, "processingMode", "full");
-    if (processingMode.equalsIgnoreCase("full")) {
-      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
-    } else if (processingMode.equalsIgnoreCase("fast")) {
-      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
-    } else {
-      throw new ParserException("Illegal processingMode attribute in DuplicateFilter:" + processingMode);
-    }
-
-    return df;
-  }
-
-}
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<BooleanQuery fieldName="contents">
-  <Clause occurs="should">
-    <TermQuery>money</TermQuery>
-  </Clause>
-  <Clause occurs="must">
-    <TermQuery fieldName="date">19870408</TermQuery>
-  </Clause>
-  <Clause occurs="filter">
-    <!-- Filters to last document with this date -->
-    <DuplicateFilter fieldName="date" keepMode="last"/>
-  </Clause>
-</BooleanQuery>
@@ -186,14 +186,6 @@ public class TestParser extends LuceneTestCase {
     dumpResults("Cached filter", q, 5);
   }
 
-  public void testDuplicateFilterQueryXML() throws ParserException, IOException {
-    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
-    Assume.assumeTrue(leaves.size() == 1);
-    Query q = parse("DuplicateFilterQuery.xml");
-    int h = searcher.search(q, 1000).totalHits;
-    assertEquals("DuplicateFilterQuery should produce 1 result ", 1, h);
-  }
-
   public void testNumericRangeQueryQueryXML() throws ParserException, IOException {
     Query q = parse("NumericRangeQueryQuery.xml");
     dumpResults("NumericRangeQuery", q, 5);
@@ -1,234 +0,0 @@
-package org.apache.lucene.sandbox.queries;
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.BitsFilteredDocIdSet;
-import org.apache.lucene.search.DocIdSet;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Filter;
-import org.apache.lucene.util.BitDocIdSet;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.SparseFixedBitSet;
-
-/**
- * Filter to remove duplicate values from search results.
- * <p>
- * WARNING: for this to work correctly, you may have to wrap
- * your reader as it cannot current deduplicate across different
- * index segments.
- *
- * @see SlowCompositeReaderWrapper
- */
-public class DuplicateFilter extends Filter {
-  // TODO: make duplicate filter aware of ReaderContext such that we can
-  // filter duplicates across segments
-
-  /**
-   * KeepMode determines which document id to consider as the master, all others being
-   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
-   */
-  public enum KeepMode {
-    KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
-  }
-
-  private KeepMode keepMode;
-
-  /**
-   * "Full" processing mode starts by setting all bits to false and only setting bits
-   * for documents that contain the given field and are identified as none-duplicates.
-   * <p>
-   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
-   * given field. This approach avoids the need to read DocsEnum for terms that are seen
-   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
-   * faster approach , the downside is that bitsets produced will include bits set for
-   * documents that do not actually contain the field given.
-   */
-
-  public enum ProcessingMode {
-    PM_FULL_VALIDATION, PM_FAST_INVALIDATION
-  }
-
-  private ProcessingMode processingMode;
-
-  private String fieldName;
-
-  public DuplicateFilter(String fieldName) {
-    this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
-  }
-
-  public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
-    this.fieldName = fieldName;
-    this.keepMode = keepMode;
-    this.processingMode = processingMode;
-  }
-
-  @Override
-  public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
-    if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
-      return fastBits(context.reader(), acceptDocs);
-    } else {
-      return correctBits(context.reader(), acceptDocs);
-    }
-  }
-
-  private DocIdSet correctBits(LeafReader reader, Bits acceptDocs) throws IOException {
-    SparseFixedBitSet bits = new SparseFixedBitSet(reader.maxDoc()); //assume all are INvalid
-    Terms terms = reader.fields().terms(fieldName);
-
-    if (terms != null) {
-      TermsEnum termsEnum = terms.iterator();
-      PostingsEnum docs = null;
-      while (true) {
-        BytesRef currTerm = termsEnum.next();
-        if (currTerm == null) {
-          break;
-        } else {
-          docs = termsEnum.postings(docs, PostingsEnum.NONE);
-          int doc = docs.nextDoc();
-          if (doc != DocIdSetIterator.NO_MORE_DOCS) {
-            if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
-              bits.set(doc);
-            } else {
-              int lastDoc = doc;
-              while (true) {
-                lastDoc = doc;
-                doc = docs.nextDoc();
-                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-                  break;
-                }
-              }
-              bits.set(lastDoc);
-            }
-          }
-        }
-      }
-    }
-    return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits, bits.approximateCardinality()), acceptDocs);
-  }
-
-  private DocIdSet fastBits(LeafReader reader, Bits acceptDocs) throws IOException {
-    FixedBitSet bits = new FixedBitSet(reader.maxDoc());
-    bits.set(0, reader.maxDoc()); //assume all are valid
-    Terms terms = reader.fields().terms(fieldName);
-
-    if (terms != null) {
-      TermsEnum termsEnum = terms.iterator();
-      PostingsEnum docs = null;
-      while (true) {
-        BytesRef currTerm = termsEnum.next();
-        if (currTerm == null) {
-          break;
-        } else {
-          if (termsEnum.docFreq() > 1) {
-            // unset potential duplicates
-            docs = termsEnum.postings(docs, PostingsEnum.NONE);
-            int doc = docs.nextDoc();
-            if (doc != DocIdSetIterator.NO_MORE_DOCS) {
-              if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
-                doc = docs.nextDoc();
-              }
-            }
-
-            int lastDoc = -1;
-            while (true) {
-              lastDoc = doc;
-              bits.clear(lastDoc);
-              doc = docs.nextDoc();
-              if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-                break;
-              }
-            }
-
-            if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
-              // restore the last bit
-              bits.set(lastDoc);
-            }
-          }
-        }
-      }
-    }
-
-    return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bits), acceptDocs);
-  }
-
-  public String getFieldName() {
-    return fieldName;
-  }
-
-  public void setFieldName(String fieldName) {
-    this.fieldName = fieldName;
-  }
-
-  public KeepMode getKeepMode() {
-    return keepMode;
-  }
-
-  public void setKeepMode(KeepMode keepMode) {
-    this.keepMode = keepMode;
-  }
-
-  @Override
-  public boolean equals(Object obj) {
-    if (this == obj) {
-      return true;
-    }
-    if (super.equals(obj) == false) {
-      return false;
-    }
-
-    DuplicateFilter other = (DuplicateFilter) obj;
-    return keepMode == other.keepMode &&
-        processingMode == other.processingMode &&
-        fieldName != null && fieldName.equals(other.fieldName);
-  }
-
-  @Override
-  public String toString(String field) {
-    return "DuplicateFilter(" +
-        "fieldName=" + fieldName + "," +
-        "keepMode=" + (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE ? "first" : "last") + "," +
-        "processingMode=" + (processingMode == ProcessingMode.PM_FAST_INVALIDATION ? "fast" : "full") +
-        ")";
-  }
-
-  @Override
-  public int hashCode() {
-    int hash = super.hashCode();
-    hash = 31 * hash + keepMode.hashCode();
-    hash = 31 * hash + processingMode.hashCode();
-    hash = 31 * hash + fieldName.hashCode();
-    return hash;
-  }
-
-  public ProcessingMode getProcessingMode() {
-    return processingMode;
-  }
-
-  public void setProcessingMode(ProcessingMode processingMode) {
-    this.processingMode = processingMode;
-  }
-}
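The class javadoc above warns that the filter cannot deduplicate across index segments and points at SlowCompositeReaderWrapper. A minimal sketch of that workaround, assuming a Lucene 5.x multi-segment DirectoryReader named reader (illustration only, not part of this commit):

// View the whole index as a single leaf so DuplicateFilter sees every
// occurrence of a key; "slow" because per-segment structures are emulated.
LeafReader leaf = SlowCompositeReaderWrapper.wrap(reader);
IndexSearcher searcher = new IndexSearcher(leaf);

The test below sidesteps the same limitation differently, by calling writer.forceMerge(1) so the index has only one segment (see its LUCENE-2348 comment).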
@@ -1,196 +0,0 @@
-package org.apache.lucene.sandbox.queries;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.HashSet;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.*;
-import org.apache.lucene.search.BooleanClause.Occur;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
-
-public class DuplicateFilterTest extends LuceneTestCase {
-  private static final String KEY_FIELD = "url";
-  private Directory directory;
-  private IndexReader reader;
-  TermQuery tq = new TermQuery(new Term("text", "lucene"));
-  private IndexSearcher searcher;
-  Analyzer analyzer;
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    directory = newDirectory();
-    analyzer = new MockAnalyzer(random());
-    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));
-
-    //Add series of docs with filterable fields : url, text and dates flags
-    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
-    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
-    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
-    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
-    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
-    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
-    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
-    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
-
-    // Until we fix LUCENE-2348, the index must
-    // have only 1 segment:
-    writer.forceMerge(1);
-
-    reader = writer.getReader();
-    writer.close();
-    searcher = newSearcher(reader);
-
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    IOUtils.close(reader, directory, analyzer);
-    super.tearDown();
-  }
-
-  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
-    Document doc = new Document();
-    doc.add(newStringField(KEY_FIELD, url, Field.Store.YES));
-    doc.add(newTextField("text", text, Field.Store.YES));
-    doc.add(newTextField("date", date, Field.Store.YES));
-    writer.addDocument(doc);
-  }
-
-  public void testDefaultFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    HashSet<String> results = new HashSet<>();
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      assertFalse("No duplicate urls should be returned", results.contains(url));
-      results.add(url);
-    }
-  }
-
-  public void testNoFilter() throws Throwable {
-    HashSet<String> results = new HashSet<>();
-    ScoreDoc[] hits = searcher.search(tq, 1000).scoreDocs;
-    assertTrue("Default searching should have found some matches", hits.length > 0);
-    boolean dupsFound = false;
-
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      if (!dupsFound)
-        dupsFound = results.contains(url);
-      results.add(url);
-    }
-    assertTrue("Default searching should have found duplicate urls", dupsFound);
-  }
-
-  public void testFastFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
-    HashSet<String> results = new HashSet<>();
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches", hits.length > 0);
-
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      assertFalse("No duplicate urls should be returned", results.contains(url));
-      results.add(url);
-    }
-    assertEquals("Two urls found", 2, results.size());
-  }
-
-  public void testKeepsLastFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches", hits.length > 0);
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      PostingsEnum td = TestUtil.docs(random(), reader,
-          KEY_FIELD,
-          new BytesRef(url),
-          null,
-          0);
-
-      int lastDoc = 0;
-      while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-        lastDoc = td.docID();
-      }
-      assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
-    }
-  }
-
-
-  public void testKeepsFirstFilter() throws Throwable {
-    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
-    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
-    Query query = new BooleanQuery.Builder()
-        .add(tq, Occur.MUST)
-        .add(df, Occur.FILTER)
-        .build();
-    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches", hits.length > 0);
-    for (ScoreDoc hit : hits) {
-      StoredDocument d = searcher.doc(hit.doc);
-      String url = d.get(KEY_FIELD);
-      PostingsEnum td = TestUtil.docs(random(), reader,
-          KEY_FIELD,
-          new BytesRef(url),
-          null,
-          0);
-
-      int lastDoc = 0;
-      td.nextDoc();
-      lastDoc = td.docID();
-      assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
-    }
-  }
-
-
-}