Added new DuplicateFilter functionality to filter out documents that share a field value (e.g. a primary key or URL)

Also includes a JUnit test and XML query parser support

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@581426 13f79535-47bb-0310-9956-ffa450edef68
Mark Harwood 2007-10-02 22:56:46 +00:00
parent f3119614e6
commit 62fa7b4b82
6 changed files with 511 additions and 0 deletions
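For context, a minimal usage sketch (not part of this commit) of how the new filter is applied; the index path and the "text"/"url" field names are illustrative assumptions:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.DuplicateFilter;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;

public class DuplicateFilterUsage
{
    public static void main(String[] args) throws Exception
    {
        // Open a searcher over an existing index (path is a placeholder)
        IndexSearcher searcher = new IndexSearcher("/path/to/index");

        // Match documents containing "lucene" in the "text" field
        TermQuery query = new TermQuery(new Term("text", "lucene"));

        // Keep only one document per distinct value of the "url" key field
        DuplicateFilter df = new DuplicateFilter("url");

        Hits hits = searcher.search(query, df);
        for (int i = 0; i < hits.length(); i++)
        {
            System.out.println(hits.doc(i).get("url") + " : " + hits.doc(i).get("text"));
        }
        searcher.close();
    }
}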

DuplicateFilter.java (new file, package org.apache.lucene.search)

@@ -0,0 +1,245 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
public class DuplicateFilter extends Filter
{
String fieldName;
/**
* KeepMode determines which document id to consider as the master, all others being
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
*/
int keepMode=KM_USE_FIRST_OCCURRENCE;
public static final int KM_USE_FIRST_OCCURRENCE=1;
public static final int KM_USE_LAST_OCCURRENCE=2;
/**
* "Full" processing mode starts by setting all bits to false and only setting bits
* for documents that contain the given field and are identified as none-duplicates.
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
* given field. This approach avoids the need to read TermDocs for terms that are seen
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
* faster approach , the downside is that bitsets produced will include bits set for
* documents that do not actually contain the field given.
*
*/
int processingMode=PM_FULL_VALIDATION;
public static final int PM_FULL_VALIDATION=1;
public static final int PM_FAST_INVALIDATION=2;
public DuplicateFilter(String fieldName)
{
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
}
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
{
this.fieldName = fieldName;
this.keepMode = keepMode;
this.processingMode = processingMode;
}
public BitSet bits(IndexReader reader) throws IOException
{
if(processingMode==PM_FAST_INVALIDATION)
{
return fastBits(reader);
}
else
{
return correctBits(reader);
}
}
private BitSet correctBits(IndexReader reader) throws IOException
{
BitSet bits=new BitSet(reader.maxDoc()); //assume all are INvalid
Term startTerm=new Term(fieldName,"");
TermEnum te = reader.terms(startTerm);
if(te!=null)
{
Term currTerm=te.term();
while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
{
int lastDoc=-1;
//set non duplicates
TermDocs td = reader.termDocs(currTerm);
if(td.next())
{
if(keepMode==KM_USE_FIRST_OCCURRENCE)
{
bits.set(td.doc());
}
else
{
do
{
lastDoc=td.doc();
}while(td.next());
bits.set(lastDoc);
}
}
if(!te.next())
{
break;
}
currTerm=te.term();
}
}
return bits;
}
private BitSet fastBits(IndexReader reader) throws IOException
{
BitSet bits=new BitSet(reader.maxDoc());
bits.set(0,reader.maxDoc()); //assume all are valid
Term startTerm=new Term(fieldName,"");
TermEnum te = reader.terms(startTerm);
if(te!=null)
{
Term currTerm=te.term();
while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
{
if(te.docFreq()>1)
{
int lastDoc=-1;
//unset potential duplicates
TermDocs td = reader.termDocs(currTerm);
td.next();
if(keepMode==KM_USE_FIRST_OCCURRENCE)
{
td.next();
}
do
{
lastDoc=td.doc();
bits.set(lastDoc,false);
}while(td.next());
if(keepMode==KM_USE_LAST_OCCURRENCE)
{
//restore the last bit
bits.set(lastDoc);
}
}
if(!te.next())
{
break;
}
currTerm=te.term();
}
}
return bits;
}
/**
* Simple command-line timing harness for the filter (uses hard-coded local index paths).
*/
public static void main(String[] args) throws Exception
{
IndexReader r=IndexReader.open("/indexes/personCentricAnon");
// IndexReader r=IndexReader.open("/indexes/enron");
long start=System.currentTimeMillis();
// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
// df.setProcessingMode(PM_SLOW_VALIDATION);
BitSet b = df.bits(r);
long end=System.currentTimeMillis()-start;
System.out.println(b.cardinality()+" in "+end+" ms ");
}
public String getFieldName()
{
return fieldName;
}
public void setFieldName(String fieldName)
{
this.fieldName = fieldName;
}
public int getKeepMode()
{
return keepMode;
}
public void setKeepMode(int keepMode)
{
this.keepMode = keepMode;
}
public boolean equals(Object obj)
{
if(this == obj)
return true;
if((obj == null) || (obj.getClass() != this.getClass()))
return false;
DuplicateFilter other = (DuplicateFilter)obj;
return keepMode == other.keepMode &&
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
}
public int hashCode()
{
int hash = 217;
hash = 31 * hash + keepMode;
hash = 31 * hash + fieldName.hashCode();
return hash;
}
public int getProcessingMode()
{
return processingMode;
}
public void setProcessingMode(int processingMode)
{
this.processingMode = processingMode;
}
}
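A hedged sketch of how the keep/processing modes above might be chosen (not part of the commit; the "url" field name is illustrative). KM_USE_LAST_OCCURRENCE with PM_FAST_INVALIDATION favours speed, while KM_USE_FIRST_OCCURRENCE with PM_FULL_VALIDATION favours an exact bitset:

import org.apache.lucene.search.DuplicateFilter;

public class DuplicateFilterModes
{
    public static DuplicateFilter newestPerUrl()
    {
        // Keep the last (most recently indexed) document for each url value.
        // PM_FAST_INVALIDATION skips TermDocs reads for unique terms but may
        // leave bits set for documents that lack the "url" field entirely.
        return new DuplicateFilter("url",
                DuplicateFilter.KM_USE_LAST_OCCURRENCE,
                DuplicateFilter.PM_FAST_INVALIDATION);
    }

    public static DuplicateFilter firstPerUrl()
    {
        // Keep the first occurrence of each url value and only set bits for
        // documents that actually contain the "url" field (full validation).
        return new DuplicateFilter("url",
                DuplicateFilter.KM_USE_FIRST_OCCURRENCE,
                DuplicateFilter.PM_FULL_VALIDATION);
    }
}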

DuplicateFilterTest.java (new file, package org.apache.lucene.search)

@@ -0,0 +1,165 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.RAMDirectory;
public class DuplicateFilterTest extends TestCase
{
private static final String KEY_FIELD = "url";
private RAMDirectory directory;
private IndexReader reader;
TermQuery tq=new TermQuery(new Term("text","lucene"));
private IndexSearcher searcher;
protected void setUp() throws Exception
{
directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true);
//Add a series of docs with filterable fields: url, text and date
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
writer.close();
reader=IndexReader.open(directory);
searcher =new IndexSearcher(reader);
}
protected void tearDown() throws Exception
{
reader.close();
searcher.close();
directory.close();
}
private void addDoc(IndexWriter writer, String url, String text, String date) throws IOException
{
Document doc=new Document();
doc.add(new Field(KEY_FIELD,url,Field.Store.YES,Field.Index.UN_TOKENIZED));
doc.add(new Field("text",text,Field.Store.YES,Field.Index.TOKENIZED));
doc.add(new Field("date",date,Field.Store.YES,Field.Index.TOKENIZED));
writer.addDocument(doc);
}
public void testDefaultFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
HashSet results=new HashSet();
Hits h = searcher.search(tq,df);
for(int i=0;i<h.length();i++)
{
Document d=h.doc(i);
String url=d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned",results.contains(url));
results.add(url);
}
}
public void testNoFilter() throws Throwable
{
HashSet results=new HashSet();
Hits h = searcher.search(tq);
assertTrue("Default searching should have found some matches",h.length()>0);
boolean dupsFound=false;
for(int i=0;i<h.length();i++)
{
Document d=h.doc(i);
String url=d.get(KEY_FIELD);
if(!dupsFound)
dupsFound=results.contains(url);
results.add(url);
}
assertTrue("Default searching should have found duplicate urls",dupsFound);
}
public void testFastFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
HashSet results=new HashSet();
Hits h = searcher.search(tq,df);
assertTrue("Filtered searching should have found some matches",h.length()>0);
for(int i=0;i<h.length();i++)
{
Document d=h.doc(i);
String url=d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned",results.contains(url));
results.add(url);
}
assertEquals("Two urls found",2, results.size());
}
public void testKeepsLastFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
Hits h = searcher.search(tq,df);
assertTrue("Filtered searching should have found some matches",h.length()>0);
for(int i=0;i<h.length();i++)
{
Document d=h.doc(i);
String url=d.get(KEY_FIELD);
TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
int lastDoc=0;
while(td.next())
{
lastDoc=td.doc();
}
assertEquals("Duplicate urls should return last doc",lastDoc, h.id((i)));
}
}
public void testKeepsFirstFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
Hits h = searcher.search(tq,df);
assertTrue("Filtered searching should have found some matches",h.length()>0);
for(int i=0;i<h.length();i++)
{
Document d=h.doc(i);
String url=d.get(KEY_FIELD);
TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
int lastDoc=0;
td.next();
lastDoc=td.doc();
assertEquals("Duplicate urls should return first doc",lastDoc, h.id((i)));
}
}
}

CorePlusExtensionsParser.java (modified)

@@ -4,6 +4,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.xmlparser.builders.BooleanFilterBuilder;
import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
import org.apache.lucene.xmlparser.builders.DuplicateFilterBuilder;
import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;
@@ -31,6 +32,7 @@ public class CorePlusExtensionsParser extends CoreParser
super(analyzer, parser);
filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
filterFactory.addBuilder("BooleanFilter",new BooleanFilterBuilder(filterFactory));
filterFactory.addBuilder("DuplicateFilter",new DuplicateFilterBuilder());
String fields[]={"contents"};
queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));

DuplicateFilterBuilder.java (new file, package org.apache.lucene.xmlparser.builders)

@@ -0,0 +1,75 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.DuplicateFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilder;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @author maharwood
*/
public class DuplicateFilterBuilder implements FilterBuilder {
public Filter getFilter(Element e) throws ParserException {
String fieldName=DOMUtils.getAttributeWithInheritanceOrFail(e,"fieldName");
DuplicateFilter df=new DuplicateFilter(fieldName);
String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
if(keepMode.equalsIgnoreCase("first"))
{
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
}
else
if(keepMode.equalsIgnoreCase("last"))
{
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
}
else
{
throw new ParserException("Illegal keepMode attribute in DuplicateFilter:"+keepMode);
}
String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
if(processingMode.equalsIgnoreCase("full"))
{
df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
}
else
if(processingMode.equalsIgnoreCase("fast"))
{
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
}
else
{
throw new ParserException("Illegal processingMode attribute in DuplicateFilter:"+processingMode);
}
return df;
}
}

DuplicateFilterQuery.xml (new file)

@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<FilteredQuery>
<Query>
<BooleanQuery fieldName="contents">
<Clause occurs="should">
<TermQuery>money</TermQuery>
</Clause>
<Clause occurs="must">
<TermQuery fieldName="date">19870408</TermQuery>
</Clause>
</BooleanQuery>
</Query>
<Filter>
<!-- Keep only the last document indexed with each date value -->
<DuplicateFilter fieldName="date" keepMode="last"/>
</Filter>
</FilteredQuery>
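A hedged sketch (not part of this commit) of feeding an XML query such as the one above through CorePlusExtensionsParser, which now recognises the DuplicateFilter element; it assumes CoreParser's parse(InputStream) entry point and illustrative index/file paths:

import java.io.FileInputStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.CorePlusExtensionsParser;

public class XmlDuplicateFilterDemo
{
    public static void main(String[] args) throws Exception
    {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        CorePlusExtensionsParser parser = new CorePlusExtensionsParser(
                analyzer, new QueryParser("contents", analyzer));

        // Build the FilteredQuery (including the DuplicateFilter) from the XML file
        Query q = parser.parse(new FileInputStream("DuplicateFilterQuery.xml"));

        IndexSearcher searcher = new IndexSearcher("/path/to/index");
        Hits hits = searcher.search(q);
        System.out.println(hits.length() + " hit(s)");
        searcher.close();
    }
}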

TestParser.java (modified)

@@ -173,6 +173,12 @@ public class TestParser extends TestCase {
Query q=parse("CachedFilter.xml");
dumpResults("Cached filter", q, 5);
}
public void testDuplicateFilterQueryXML() throws ParserException, IOException
{
Query q=parse("DuplicateFilterQuery.xml");
Hits h = searcher.search(q);
assertEquals("DuplicateFilterQuery should produce 1 result ", 1,h.length());
}