mirror of https://github.com/apache/lucene.git
Added new DuplicateFilter functionality to filter documents sharing a field value (e.g. primary key/url)
Also includes Junit test and XML Query support git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@581426 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f3119614e6
commit
62fa7b4b82
|
@ -0,0 +1,245 @@
|
|||
package org.apache.lucene.search;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.IOException;
|
||||
import java.util.BitSet;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
|
||||
public class DuplicateFilter extends Filter
|
||||
{
|
||||
|
||||
String fieldName;
|
||||
|
||||
/**
|
||||
* KeepMode determines which document id to consider as the master, all others being
|
||||
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
|
||||
*/
|
||||
int keepMode=KM_USE_FIRST_OCCURRENCE;
|
||||
public static final int KM_USE_FIRST_OCCURRENCE=1;
|
||||
public static final int KM_USE_LAST_OCCURRENCE=2;
|
||||
|
||||
/**
|
||||
* "Full" processing mode starts by setting all bits to false and only setting bits
|
||||
* for documents that contain the given field and are identified as none-duplicates.
|
||||
|
||||
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
|
||||
* given field. This approach avoids the need to read TermDocs for terms that are seen
|
||||
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
|
||||
* faster approach , the downside is that bitsets produced will include bits set for
|
||||
* documents that do not actually contain the field given.
|
||||
*
|
||||
*/
|
||||
int processingMode=PM_FULL_VALIDATION;
|
||||
public static final int PM_FULL_VALIDATION=1;
|
||||
public static final int PM_FAST_INVALIDATION=2;
|
||||
|
||||
|
||||
|
||||
public DuplicateFilter(String fieldName)
|
||||
{
|
||||
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
|
||||
}
|
||||
|
||||
|
||||
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
|
||||
{
|
||||
this.fieldName = fieldName;
|
||||
this.keepMode = keepMode;
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
|
||||
public BitSet bits(IndexReader reader) throws IOException
|
||||
{
|
||||
if(processingMode==PM_FAST_INVALIDATION)
|
||||
{
|
||||
return fastBits(reader);
|
||||
}
|
||||
else
|
||||
{
|
||||
return correctBits(reader);
|
||||
}
|
||||
}
|
||||
|
||||
private BitSet correctBits(IndexReader reader) throws IOException
|
||||
{
|
||||
|
||||
BitSet bits=new BitSet(reader.maxDoc()); //assume all are INvalid
|
||||
Term startTerm=new Term(fieldName,"");
|
||||
TermEnum te = reader.terms(startTerm);
|
||||
if(te!=null)
|
||||
{
|
||||
Term currTerm=te.term();
|
||||
while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
|
||||
{
|
||||
int lastDoc=-1;
|
||||
//set non duplicates
|
||||
TermDocs td = reader.termDocs(currTerm);
|
||||
if(td.next())
|
||||
{
|
||||
if(keepMode==KM_USE_FIRST_OCCURRENCE)
|
||||
{
|
||||
bits.set(td.doc());
|
||||
}
|
||||
else
|
||||
{
|
||||
do
|
||||
{
|
||||
lastDoc=td.doc();
|
||||
}while(td.next());
|
||||
bits.set(lastDoc);
|
||||
}
|
||||
}
|
||||
if(!te.next())
|
||||
{
|
||||
break;
|
||||
}
|
||||
currTerm=te.term();
|
||||
}
|
||||
}
|
||||
return bits;
|
||||
}
|
||||
|
||||
private BitSet fastBits(IndexReader reader) throws IOException
|
||||
{
|
||||
|
||||
BitSet bits=new BitSet(reader.maxDoc());
|
||||
bits.set(0,reader.maxDoc()); //assume all are valid
|
||||
Term startTerm=new Term(fieldName,"");
|
||||
TermEnum te = reader.terms(startTerm);
|
||||
if(te!=null)
|
||||
{
|
||||
Term currTerm=te.term();
|
||||
|
||||
while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
|
||||
{
|
||||
if(te.docFreq()>1)
|
||||
{
|
||||
int lastDoc=-1;
|
||||
//unset potential duplicates
|
||||
TermDocs td = reader.termDocs(currTerm);
|
||||
td.next();
|
||||
if(keepMode==KM_USE_FIRST_OCCURRENCE)
|
||||
{
|
||||
td.next();
|
||||
}
|
||||
do
|
||||
{
|
||||
lastDoc=td.doc();
|
||||
bits.set(lastDoc,false);
|
||||
}while(td.next());
|
||||
if(keepMode==KM_USE_LAST_OCCURRENCE)
|
||||
{
|
||||
//restore the last bit
|
||||
bits.set(lastDoc);
|
||||
}
|
||||
}
|
||||
if(!te.next())
|
||||
{
|
||||
break;
|
||||
}
|
||||
currTerm=te.term();
|
||||
}
|
||||
}
|
||||
return bits;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param args
|
||||
* @throws IOException
|
||||
* @throws Exception
|
||||
*/
|
||||
public static void main(String[] args) throws Exception
|
||||
{
|
||||
IndexReader r=IndexReader.open("/indexes/personCentricAnon");
|
||||
// IndexReader r=IndexReader.open("/indexes/enron");
|
||||
long start=System.currentTimeMillis();
|
||||
// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
|
||||
// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
|
||||
DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
|
||||
// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
|
||||
// df.setProcessingMode(PM_SLOW_VALIDATION);
|
||||
BitSet b = df.bits(r);
|
||||
long end=System.currentTimeMillis()-start;
|
||||
System.out.println(b.cardinality()+" in "+end+" ms ");
|
||||
|
||||
}
|
||||
|
||||
|
||||
public String getFieldName()
|
||||
{
|
||||
return fieldName;
|
||||
}
|
||||
|
||||
|
||||
public void setFieldName(String fieldName)
|
||||
{
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
|
||||
public int getKeepMode()
|
||||
{
|
||||
return keepMode;
|
||||
}
|
||||
|
||||
|
||||
public void setKeepMode(int keepMode)
|
||||
{
|
||||
this.keepMode = keepMode;
|
||||
}
|
||||
|
||||
|
||||
public boolean equals(Object obj)
|
||||
{
|
||||
if(this == obj)
|
||||
return true;
|
||||
if((obj == null) || (obj.getClass() != this.getClass()))
|
||||
return false;
|
||||
DuplicateFilter other = (DuplicateFilter)obj;
|
||||
return keepMode == other.keepMode &&
|
||||
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
|
||||
}
|
||||
|
||||
|
||||
|
||||
public int hashCode()
|
||||
{
|
||||
int hash = 217;
|
||||
hash = 31 * hash + keepMode;
|
||||
hash = 31 * hash + fieldName.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
|
||||
public int getProcessingMode()
|
||||
{
|
||||
return processingMode;
|
||||
}
|
||||
|
||||
|
||||
public void setProcessingMode(int processingMode)
|
||||
{
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
||||
public class DuplicateFilterTest extends TestCase
|
||||
{
|
||||
private static final String KEY_FIELD = "url";
|
||||
private RAMDirectory directory;
|
||||
private IndexReader reader;
|
||||
TermQuery tq=new TermQuery(new Term("text","lucene"));
|
||||
private IndexSearcher searcher;
|
||||
|
||||
protected void setUp() throws Exception
|
||||
{
|
||||
directory = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true);
|
||||
|
||||
//Add series of docs with filterable fields : url, text and dates flags
|
||||
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
|
||||
addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
|
||||
addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
|
||||
addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
|
||||
addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
|
||||
addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
|
||||
addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
|
||||
addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
|
||||
|
||||
writer.close();
|
||||
reader=IndexReader.open(directory);
|
||||
searcher =new IndexSearcher(reader);
|
||||
|
||||
}
|
||||
|
||||
protected void tearDown() throws Exception
|
||||
{
|
||||
reader.close();
|
||||
searcher.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
private void addDoc(IndexWriter writer, String url, String text, String date) throws IOException
|
||||
{
|
||||
Document doc=new Document();
|
||||
doc.add(new Field(KEY_FIELD,url,Field.Store.YES,Field.Index.UN_TOKENIZED));
|
||||
doc.add(new Field("text",text,Field.Store.YES,Field.Index.TOKENIZED));
|
||||
doc.add(new Field("date",date,Field.Store.YES,Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
public void testDefaultFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
HashSet results=new HashSet();
|
||||
Hits h = searcher.search(tq,df);
|
||||
for(int i=0;i<h.length();i++)
|
||||
{
|
||||
Document d=h.doc(i);
|
||||
String url=d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
}
|
||||
public void testNoFilter() throws Throwable
|
||||
{
|
||||
HashSet results=new HashSet();
|
||||
Hits h = searcher.search(tq);
|
||||
assertTrue("Default searching should have found some matches",h.length()>0);
|
||||
boolean dupsFound=false;
|
||||
for(int i=0;i<h.length();i++)
|
||||
{
|
||||
Document d=h.doc(i);
|
||||
String url=d.get(KEY_FIELD);
|
||||
if(!dupsFound)
|
||||
dupsFound=results.contains(url);
|
||||
results.add(url);
|
||||
}
|
||||
assertTrue("Default searching should have found duplicate urls",dupsFound);
|
||||
}
|
||||
|
||||
public void testFastFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||
HashSet results=new HashSet();
|
||||
Hits h = searcher.search(tq,df);
|
||||
assertTrue("Filtered searching should have found some matches",h.length()>0);
|
||||
for(int i=0;i<h.length();i++)
|
||||
{
|
||||
Document d=h.doc(i);
|
||||
String url=d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
assertEquals("Two urls found",2, results.size());
|
||||
}
|
||||
public void testKeepsLastFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||
Hits h = searcher.search(tq,df);
|
||||
assertTrue("Filtered searching should have found some matches",h.length()>0);
|
||||
for(int i=0;i<h.length();i++)
|
||||
{
|
||||
Document d=h.doc(i);
|
||||
String url=d.get(KEY_FIELD);
|
||||
TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
|
||||
int lastDoc=0;
|
||||
while(td.next())
|
||||
{
|
||||
lastDoc=td.doc();
|
||||
}
|
||||
assertEquals("Duplicate urls should return last doc",lastDoc, h.id((i)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testKeepsFirstFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||
Hits h = searcher.search(tq,df);
|
||||
assertTrue("Filtered searching should have found some matches",h.length()>0);
|
||||
for(int i=0;i<h.length();i++)
|
||||
{
|
||||
Document d=h.doc(i);
|
||||
String url=d.get(KEY_FIELD);
|
||||
TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
|
||||
int lastDoc=0;
|
||||
td.next();
|
||||
lastDoc=td.doc();
|
||||
assertEquals("Duplicate urls should return first doc",lastDoc, h.id((i)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -4,6 +4,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.xmlparser.builders.BooleanFilterBuilder;
|
||||
import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
|
||||
import org.apache.lucene.xmlparser.builders.DuplicateFilterBuilder;
|
||||
import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
|
||||
import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
|
||||
import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;
|
||||
|
@ -31,6 +32,7 @@ public class CorePlusExtensionsParser extends CoreParser
|
|||
super(analyzer, parser);
|
||||
filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
|
||||
filterFactory.addBuilder("BooleanFilter",new BooleanFilterBuilder(filterFactory));
|
||||
filterFactory.addBuilder("DuplicateFilter",new DuplicateFilterBuilder());
|
||||
String fields[]={"contents"};
|
||||
queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
|
||||
queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
* Created on 25-Jan-2006
|
||||
*/
|
||||
package org.apache.lucene.xmlparser.builders;
|
||||
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanFilter;
|
||||
import org.apache.lucene.search.DuplicateFilter;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.FilterClause;
|
||||
import org.apache.lucene.xmlparser.DOMUtils;
|
||||
import org.apache.lucene.xmlparser.FilterBuilder;
|
||||
import org.apache.lucene.xmlparser.ParserException;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
* @author maharwood
|
||||
*/
|
||||
public class DuplicateFilterBuilder implements FilterBuilder {
|
||||
|
||||
|
||||
public Filter getFilter(Element e) throws ParserException {
|
||||
String fieldName=DOMUtils.getAttributeWithInheritanceOrFail(e,"fieldName");
|
||||
DuplicateFilter df=new DuplicateFilter(fieldName);
|
||||
String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
|
||||
if(keepMode.equalsIgnoreCase("first"))
|
||||
{
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||
}
|
||||
else
|
||||
if(keepMode.equalsIgnoreCase("last"))
|
||||
{
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ParserException("Illegal keepMode attribute in DuplicateFilter:"+keepMode);
|
||||
}
|
||||
String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
|
||||
if(processingMode.equalsIgnoreCase("full"))
|
||||
{
|
||||
df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
|
||||
}
|
||||
else
|
||||
if(processingMode.equalsIgnoreCase("fast"))
|
||||
{
|
||||
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ParserException("Illegal processingMode attribute in DuplicateFilter:"+processingMode);
|
||||
}
|
||||
|
||||
return df;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<FilteredQuery>
|
||||
<Query>
|
||||
<BooleanQuery fieldName="contents">
|
||||
<Clause occurs="should">
|
||||
<TermQuery>money</TermQuery>
|
||||
</Clause>
|
||||
<Clause occurs="must">
|
||||
<TermQuery fieldName="date">19870408</TermQuery>
|
||||
</Clause>
|
||||
</BooleanQuery>
|
||||
</Query>
|
||||
<Filter>
|
||||
<!-- Keeps only the last document for each distinct value of the "date" field -->
|
||||
<DuplicateFilter fieldName="date" keepMode="last"/>
|
||||
</Filter>
|
||||
|
||||
</FilteredQuery>
|
|
@ -173,6 +173,12 @@ public class TestParser extends TestCase {
|
|||
Query q=parse("CachedFilter.xml");
|
||||
dumpResults("Cached filter", q, 5);
|
||||
}
|
||||
public void testDuplicateFilterQueryXML() throws ParserException, IOException
|
||||
{
|
||||
Query q=parse("DuplicateFilterQuery.xml");
|
||||
Hits h = searcher.search(q);
|
||||
assertEquals("DuplicateFilterQuery should produce 1 result ", 1,h.length());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue