mirror of https://github.com/apache/lucene.git
Added new DuplicateFilter functionality to filter documents sharing a field value (e.g. primary key/url)
Also includes a JUnit test and XML query support.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@581426 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f3119614e6
commit
62fa7b4b82
|
@ -0,0 +1,245 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermDocs;
|
||||||
|
import org.apache.lucene.index.TermEnum;
|
||||||
|
|
||||||
|
public class DuplicateFilter extends Filter
|
||||||
|
{
|
||||||
|
|
||||||
|
String fieldName;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* KeepMode determines which document id to consider as the master, all others being
|
||||||
|
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
|
||||||
|
*/
|
||||||
|
int keepMode=KM_USE_FIRST_OCCURRENCE;
|
||||||
|
public static final int KM_USE_FIRST_OCCURRENCE=1;
|
||||||
|
public static final int KM_USE_LAST_OCCURRENCE=2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* "Full" processing mode starts by setting all bits to false and only setting bits
|
||||||
|
* for documents that contain the given field and are identified as none-duplicates.
|
||||||
|
|
||||||
|
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
|
||||||
|
* given field. This approach avoids the need to read TermDocs for terms that are seen
|
||||||
|
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
|
||||||
|
* faster approach , the downside is that bitsets produced will include bits set for
|
||||||
|
* documents that do not actually contain the field given.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
int processingMode=PM_FULL_VALIDATION;
|
||||||
|
public static final int PM_FULL_VALIDATION=1;
|
||||||
|
public static final int PM_FAST_INVALIDATION=2;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public DuplicateFilter(String fieldName)
|
||||||
|
{
|
||||||
|
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
|
||||||
|
{
|
||||||
|
this.fieldName = fieldName;
|
||||||
|
this.keepMode = keepMode;
|
||||||
|
this.processingMode = processingMode;
|
||||||
|
}
|
||||||
|
|
||||||
|
public BitSet bits(IndexReader reader) throws IOException
|
||||||
|
{
|
||||||
|
if(processingMode==PM_FAST_INVALIDATION)
|
||||||
|
{
|
||||||
|
return fastBits(reader);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return correctBits(reader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private BitSet correctBits(IndexReader reader) throws IOException
|
||||||
|
{
|
||||||
|
|
||||||
|
BitSet bits=new BitSet(reader.maxDoc()); //assume all are INvalid
|
||||||
|
Term startTerm=new Term(fieldName,"");
|
||||||
|
TermEnum te = reader.terms(startTerm);
|
||||||
|
if(te!=null)
|
||||||
|
{
|
||||||
|
Term currTerm=te.term();
|
||||||
|
while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
|
||||||
|
{
|
||||||
|
int lastDoc=-1;
|
||||||
|
//set non duplicates
|
||||||
|
TermDocs td = reader.termDocs(currTerm);
|
||||||
|
if(td.next())
|
||||||
|
{
|
||||||
|
if(keepMode==KM_USE_FIRST_OCCURRENCE)
|
||||||
|
{
|
||||||
|
bits.set(td.doc());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
do
|
||||||
|
{
|
||||||
|
lastDoc=td.doc();
|
||||||
|
}while(td.next());
|
||||||
|
bits.set(lastDoc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(!te.next())
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
currTerm=te.term();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bits;
|
||||||
|
}
|
||||||
|
|
||||||
|
private BitSet fastBits(IndexReader reader) throws IOException
|
||||||
|
{
|
||||||
|
|
||||||
|
BitSet bits=new BitSet(reader.maxDoc());
|
||||||
|
bits.set(0,reader.maxDoc()); //assume all are valid
|
||||||
|
Term startTerm=new Term(fieldName,"");
|
||||||
|
TermEnum te = reader.terms(startTerm);
|
||||||
|
if(te!=null)
|
||||||
|
{
|
||||||
|
Term currTerm=te.term();
|
||||||
|
|
||||||
|
while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
|
||||||
|
{
|
||||||
|
if(te.docFreq()>1)
|
||||||
|
{
|
||||||
|
int lastDoc=-1;
|
||||||
|
//unset potential duplicates
|
||||||
|
TermDocs td = reader.termDocs(currTerm);
|
||||||
|
td.next();
|
||||||
|
if(keepMode==KM_USE_FIRST_OCCURRENCE)
|
||||||
|
{
|
||||||
|
td.next();
|
||||||
|
}
|
||||||
|
do
|
||||||
|
{
|
||||||
|
lastDoc=td.doc();
|
||||||
|
bits.set(lastDoc,false);
|
||||||
|
}while(td.next());
|
||||||
|
if(keepMode==KM_USE_LAST_OCCURRENCE)
|
||||||
|
{
|
||||||
|
//restore the last bit
|
||||||
|
bits.set(lastDoc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(!te.next())
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
currTerm=te.term();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bits;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param args
|
||||||
|
* @throws IOException
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
public static void main(String[] args) throws Exception
|
||||||
|
{
|
||||||
|
IndexReader r=IndexReader.open("/indexes/personCentricAnon");
|
||||||
|
// IndexReader r=IndexReader.open("/indexes/enron");
|
||||||
|
long start=System.currentTimeMillis();
|
||||||
|
// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
|
||||||
|
// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
|
||||||
|
DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
|
||||||
|
// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
|
||||||
|
// df.setProcessingMode(PM_SLOW_VALIDATION);
|
||||||
|
BitSet b = df.bits(r);
|
||||||
|
long end=System.currentTimeMillis()-start;
|
||||||
|
System.out.println(b.cardinality()+" in "+end+" ms ");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getFieldName()
|
||||||
|
{
|
||||||
|
return fieldName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setFieldName(String fieldName)
|
||||||
|
{
|
||||||
|
this.fieldName = fieldName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getKeepMode()
|
||||||
|
{
|
||||||
|
return keepMode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setKeepMode(int keepMode)
|
||||||
|
{
|
||||||
|
this.keepMode = keepMode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean equals(Object obj)
|
||||||
|
{
|
||||||
|
if(this == obj)
|
||||||
|
return true;
|
||||||
|
if((obj == null) || (obj.getClass() != this.getClass()))
|
||||||
|
return false;
|
||||||
|
DuplicateFilter other = (DuplicateFilter)obj;
|
||||||
|
return keepMode == other.keepMode &&
|
||||||
|
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public int hashCode()
|
||||||
|
{
|
||||||
|
int hash = 217;
|
||||||
|
hash = 31 * hash + keepMode;
|
||||||
|
hash = 31 * hash + fieldName.hashCode();
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getProcessingMode()
|
||||||
|
{
|
||||||
|
return processingMode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setProcessingMode(int processingMode)
|
||||||
|
{
|
||||||
|
this.processingMode = processingMode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,165 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermDocs;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
|
||||||
|
public class DuplicateFilterTest extends TestCase
|
||||||
|
{
|
||||||
|
private static final String KEY_FIELD = "url";
|
||||||
|
private RAMDirectory directory;
|
||||||
|
private IndexReader reader;
|
||||||
|
TermQuery tq=new TermQuery(new Term("text","lucene"));
|
||||||
|
private IndexSearcher searcher;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception
|
||||||
|
{
|
||||||
|
directory = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true);
|
||||||
|
|
||||||
|
//Add series of docs with filterable fields : url, text and dates flags
|
||||||
|
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
|
||||||
|
addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
|
||||||
|
addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
|
||||||
|
addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
|
||||||
|
addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
|
||||||
|
addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
|
||||||
|
addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
|
||||||
|
addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
|
||||||
|
|
||||||
|
writer.close();
|
||||||
|
reader=IndexReader.open(directory);
|
||||||
|
searcher =new IndexSearcher(reader);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void tearDown() throws Exception
|
||||||
|
{
|
||||||
|
reader.close();
|
||||||
|
searcher.close();
|
||||||
|
directory.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addDoc(IndexWriter writer, String url, String text, String date) throws IOException
|
||||||
|
{
|
||||||
|
Document doc=new Document();
|
||||||
|
doc.add(new Field(KEY_FIELD,url,Field.Store.YES,Field.Index.UN_TOKENIZED));
|
||||||
|
doc.add(new Field("text",text,Field.Store.YES,Field.Index.TOKENIZED));
|
||||||
|
doc.add(new Field("date",date,Field.Store.YES,Field.Index.TOKENIZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDefaultFilter() throws Throwable
|
||||||
|
{
|
||||||
|
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||||
|
HashSet results=new HashSet();
|
||||||
|
Hits h = searcher.search(tq,df);
|
||||||
|
for(int i=0;i<h.length();i++)
|
||||||
|
{
|
||||||
|
Document d=h.doc(i);
|
||||||
|
String url=d.get(KEY_FIELD);
|
||||||
|
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||||
|
results.add(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public void testNoFilter() throws Throwable
|
||||||
|
{
|
||||||
|
HashSet results=new HashSet();
|
||||||
|
Hits h = searcher.search(tq);
|
||||||
|
assertTrue("Default searching should have found some matches",h.length()>0);
|
||||||
|
boolean dupsFound=false;
|
||||||
|
for(int i=0;i<h.length();i++)
|
||||||
|
{
|
||||||
|
Document d=h.doc(i);
|
||||||
|
String url=d.get(KEY_FIELD);
|
||||||
|
if(!dupsFound)
|
||||||
|
dupsFound=results.contains(url);
|
||||||
|
results.add(url);
|
||||||
|
}
|
||||||
|
assertTrue("Default searching should have found duplicate urls",dupsFound);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFastFilter() throws Throwable
|
||||||
|
{
|
||||||
|
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||||
|
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||||
|
HashSet results=new HashSet();
|
||||||
|
Hits h = searcher.search(tq,df);
|
||||||
|
assertTrue("Filtered searching should have found some matches",h.length()>0);
|
||||||
|
for(int i=0;i<h.length();i++)
|
||||||
|
{
|
||||||
|
Document d=h.doc(i);
|
||||||
|
String url=d.get(KEY_FIELD);
|
||||||
|
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||||
|
results.add(url);
|
||||||
|
}
|
||||||
|
assertEquals("Two urls found",2, results.size());
|
||||||
|
}
|
||||||
|
public void testKeepsLastFilter() throws Throwable
|
||||||
|
{
|
||||||
|
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||||
|
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||||
|
Hits h = searcher.search(tq,df);
|
||||||
|
assertTrue("Filtered searching should have found some matches",h.length()>0);
|
||||||
|
for(int i=0;i<h.length();i++)
|
||||||
|
{
|
||||||
|
Document d=h.doc(i);
|
||||||
|
String url=d.get(KEY_FIELD);
|
||||||
|
TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
|
||||||
|
int lastDoc=0;
|
||||||
|
while(td.next())
|
||||||
|
{
|
||||||
|
lastDoc=td.doc();
|
||||||
|
}
|
||||||
|
assertEquals("Duplicate urls should return last doc",lastDoc, h.id((i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testKeepsFirstFilter() throws Throwable
|
||||||
|
{
|
||||||
|
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||||
|
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||||
|
Hits h = searcher.search(tq,df);
|
||||||
|
assertTrue("Filtered searching should have found some matches",h.length()>0);
|
||||||
|
for(int i=0;i<h.length();i++)
|
||||||
|
{
|
||||||
|
Document d=h.doc(i);
|
||||||
|
String url=d.get(KEY_FIELD);
|
||||||
|
TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
|
||||||
|
int lastDoc=0;
|
||||||
|
td.next();
|
||||||
|
lastDoc=td.doc();
|
||||||
|
assertEquals("Duplicate urls should return first doc",lastDoc, h.id((i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -4,6 +4,7 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.queryParser.QueryParser;
|
import org.apache.lucene.queryParser.QueryParser;
|
||||||
import org.apache.lucene.xmlparser.builders.BooleanFilterBuilder;
|
import org.apache.lucene.xmlparser.builders.BooleanFilterBuilder;
|
||||||
import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
|
import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
|
||||||
|
import org.apache.lucene.xmlparser.builders.DuplicateFilterBuilder;
|
||||||
import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
|
import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
|
||||||
import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
|
import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
|
||||||
import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;
|
import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;
|
||||||
|
@ -31,6 +32,7 @@ public class CorePlusExtensionsParser extends CoreParser
|
||||||
super(analyzer, parser);
|
super(analyzer, parser);
|
||||||
filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
|
filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
|
||||||
filterFactory.addBuilder("BooleanFilter",new BooleanFilterBuilder(filterFactory));
|
filterFactory.addBuilder("BooleanFilter",new BooleanFilterBuilder(filterFactory));
|
||||||
|
filterFactory.addBuilder("DuplicateFilter",new DuplicateFilterBuilder());
|
||||||
String fields[]={"contents"};
|
String fields[]={"contents"};
|
||||||
queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
|
queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
|
||||||
queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
|
queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
|
||||||
|
|
|
@ -0,0 +1,75 @@
|
||||||
|
/*
|
||||||
|
* Created on 25-Jan-2006
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.xmlparser.builders;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
import org.apache.lucene.search.BooleanFilter;
|
||||||
|
import org.apache.lucene.search.DuplicateFilter;
|
||||||
|
import org.apache.lucene.search.Filter;
|
||||||
|
import org.apache.lucene.search.FilterClause;
|
||||||
|
import org.apache.lucene.xmlparser.DOMUtils;
|
||||||
|
import org.apache.lucene.xmlparser.FilterBuilder;
|
||||||
|
import org.apache.lucene.xmlparser.ParserException;
|
||||||
|
import org.w3c.dom.Element;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* @author maharwood
|
||||||
|
*/
|
||||||
|
public class DuplicateFilterBuilder implements FilterBuilder {
|
||||||
|
|
||||||
|
|
||||||
|
public Filter getFilter(Element e) throws ParserException {
|
||||||
|
String fieldName=DOMUtils.getAttributeWithInheritanceOrFail(e,"fieldName");
|
||||||
|
DuplicateFilter df=new DuplicateFilter(fieldName);
|
||||||
|
String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
|
||||||
|
if(keepMode.equalsIgnoreCase("first"))
|
||||||
|
{
|
||||||
|
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if(keepMode.equalsIgnoreCase("last"))
|
||||||
|
{
|
||||||
|
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new ParserException("Illegal keepMode attribute in DuplicateFilter:"+keepMode);
|
||||||
|
}
|
||||||
|
String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
|
||||||
|
if(processingMode.equalsIgnoreCase("full"))
|
||||||
|
{
|
||||||
|
df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if(processingMode.equalsIgnoreCase("fast"))
|
||||||
|
{
|
||||||
|
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new ParserException("Illegal processingMode attribute in DuplicateFilter:"+processingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
return df;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,18 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<FilteredQuery>
|
||||||
|
<Query>
|
||||||
|
<BooleanQuery fieldName="contents">
|
||||||
|
<Clause occurs="should">
|
||||||
|
<TermQuery>money</TermQuery>
|
||||||
|
</Clause>
|
||||||
|
<Clause occurs="must">
|
||||||
|
<TermQuery fieldName="date">19870408</TermQuery>
|
||||||
|
</Clause>
|
||||||
|
</BooleanQuery>
|
||||||
|
</Query>
|
||||||
|
<Filter>
|
||||||
|
<!-- Filters to last document with this date -->
|
||||||
|
<DuplicateFilter fieldName="date" keepMode="last"/>
|
||||||
|
</Filter>
|
||||||
|
|
||||||
|
</FilteredQuery>
|
|
@ -173,6 +173,12 @@ public class TestParser extends TestCase {
|
||||||
Query q=parse("CachedFilter.xml");
|
Query q=parse("CachedFilter.xml");
|
||||||
dumpResults("Cached filter", q, 5);
|
dumpResults("Cached filter", q, 5);
|
||||||
}
|
}
|
||||||
|
public void testDuplicateFilterQueryXML() throws ParserException, IOException
|
||||||
|
{
|
||||||
|
Query q=parse("DuplicateFilterQuery.xml");
|
||||||
|
Hits h = searcher.search(q);
|
||||||
|
assertEquals("DuplicateFilterQuery should produce 1 result ", 1,h.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue