Added new DuplicateFilter functionality to filter documents sharing a field value (e.g. primary key/url)

Also includes JUnit test and XML Query support. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@581426 13f79535-47bb-0310-9956-ffa450edef68
2007-10-02 22:56:46 +00:00 · 2007-10-02 22:56:46 +00:00 · 62fa7b4b82
parent f3119614e6
commit 62fa7b4b82
6 changed files with 511 additions and 0 deletions
--- a/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
+++ b/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
@ -0,0 +1,245 @@
 package org.apache.lucene.search;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.util.BitSet;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.TermEnum;
 /**
 * Filter that removes "duplicate" documents, i.e. documents sharing a term in a given
 * field (e.g. a primary key or url). For each term of the field a single document id is
 * kept (first or last occurrence, see the KM_ constants); all other documents containing
 * that term are filtered out.
 */
 public class DuplicateFilter extends Filter
 {
 	/** KeepMode: the first document listed for a term is the master, later ones are duplicates. */
 	public static final int KM_USE_FIRST_OCCURRENCE=1;
 	/** KeepMode: the last document listed for a term is the master, earlier ones are duplicates. */
 	public static final int KM_USE_LAST_OCCURRENCE=2;
 	/** ProcessingMode: start with no bits set and set one bit per term of the field. */
 	public static final int PM_FULL_VALIDATION=1;
 	/** ProcessingMode: start with all bits set and clear the duplicates only. */
 	public static final int PM_FAST_INVALIDATION=2;
 	String fieldName;
 	/**
 	 * KeepMode determines which document id to consider as the master, all others being 
 	 * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
 	 * Any value other than KM_USE_FIRST_OCCURRENCE is treated as "last occurrence".
 	 */
 	int keepMode=KM_USE_LAST_OCCURRENCE;
 	/**
 	 * "Full" processing mode starts by setting all bits to false and only setting bits
 	 * for documents that contain the given field and are identified as non-duplicates. 
 	 * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
 	 * given field. This approach avoids the need to read TermDocs for terms that are seen 
 	 * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially 
 	 * faster approach, the downside is that bitsets produced will include bits set for 
 	 * documents that do not actually contain the field given.
 	 */
 	int processingMode=PM_FULL_VALIDATION;
 	/**
 	 * Creates a filter that keeps the LAST occurrence of each value and uses full validation.
 	 * @param fieldName the field whose terms identify duplicates
 	 */
 	public DuplicateFilter(String fieldName)
 	{
 		this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
 	}
 	/**
 	 * @param fieldName the field whose terms identify duplicates
 	 * @param keepMode one of the KM_ constants
 	 * @param processingMode one of the PM_ constants
 	 */
 	public DuplicateFilter(String fieldName, int keepMode, int processingMode)
 	{
 		this.fieldName = fieldName;
 		this.keepMode = keepMode;
 		this.processingMode = processingMode;
 	}
 	/**
 	 * Computes the set of permitted documents for the given reader.
 	 * @param reader the index to examine
 	 * @return a BitSet with a bit set for every document that passes the filter
 	 * @throws IOException if reading terms or postings fails
 	 */
 	public BitSet bits(IndexReader reader) throws IOException
 	{
 		if(processingMode==PM_FAST_INVALIDATION)
 		{
 			return fastBits(reader);
 		}
 		return correctBits(reader);
 	}
 	/**
 	 * Full validation: only the master document of each term of the field gets its bit set.
 	 */
 	private BitSet correctBits(IndexReader reader) throws IOException
 	{
 		BitSet bits=new BitSet(reader.maxDoc()); //assume all are INvalid
 		Term startTerm=new Term(fieldName,"");
 		TermEnum te = reader.terms(startTerm);
 		if(te==null)
 		{
 			return bits;
 		}
 		try
 		{
 			Term currTerm=te.term();
 			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
 			{
 				//set non duplicates
 				TermDocs td = reader.termDocs(currTerm);
 				try
 				{
 					if(td.next())
 					{
 						int masterDoc=td.doc();
 						if(keepMode!=KM_USE_FIRST_OCCURRENCE)
 						{
 							//walk to the end of the postings to find the last occurrence
 							while(td.next())
 							{
 								masterDoc=td.doc();
 							}
 						}
 						bits.set(masterDoc);
 					}
 				}
 				finally
 				{
 					td.close();
 				}
 				if(!te.next())
 				{
 					break;
 				}
 				currTerm=te.term();
 			}
 		}
 		finally
 		{
 			te.close();
 		}
 		return bits;
 	}
 	/**
 	 * Fast invalidation: every bit starts set and only duplicates are cleared. Terms whose
 	 * document frequency is 1 are skipped without reading their postings.
 	 */
 	private BitSet fastBits(IndexReader reader) throws IOException
 	{
 		BitSet bits=new BitSet(reader.maxDoc());
 		bits.set(0,reader.maxDoc()); //assume all are valid
 		Term startTerm=new Term(fieldName,"");
 		TermEnum te = reader.terms(startTerm);
 		if(te==null)
 		{
 			return bits;
 		}
 		try
 		{
 			Term currTerm=te.term();
 			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
 			{
 				if(te.docFreq()>1)
 				{
 					//unset potential duplicates
 					TermDocs td = reader.termDocs(currTerm);
 					try
 					{
 						//docFreq may count deleted docs, so the postings can hold fewer than
 						//two entries; every doc id is therefore read only after a successful next()
 						if(td.next())
 						{
 							if(keepMode==KM_USE_FIRST_OCCURRENCE)
 							{
 								//the first doc stays set, every later one is a duplicate
 								while(td.next())
 								{
 									bits.clear(td.doc());
 								}
 							}
 							else
 							{
 								//a doc becomes a duplicate as soon as a later one turns up,
 								//which leaves only the last occurrence set
 								int lastDoc=td.doc();
 								while(td.next())
 								{
 									bits.clear(lastDoc);
 									lastDoc=td.doc();
 								}
 							}
 						}
 					}
 					finally
 					{
 						td.close();
 					}
 				}
 				if(!te.next())
 				{
 					break;
 				}
 				currTerm=te.term();
 			}
 		}
 		finally
 		{
 			te.close();
 		}
 		return bits;
 	}
 	/**
 	 * Ad-hoc timing harness: runs a keep-last/fast filter over an index and prints the
 	 * number of surviving documents and the elapsed time.
 	 * @param args optional: args[0] is the index path (default "/indexes/personCentricAnon"),
 	 *             args[1] is the field name (default "vehicle.vrm")
 	 * @throws Exception if the index cannot be opened or read
 	 */
 	public static void main(String[] args) throws Exception
 	{
 		String indexPath="/indexes/personCentricAnon";
 		String field="vehicle.vrm";
 		if(args!=null)
 		{
 			if(args.length>0)
 			{
 				indexPath=args[0];
 			}
 			if(args.length>1)
 			{
 				field=args[1];
 			}
 		}
 		IndexReader r=IndexReader.open(indexPath);
 		try
 		{
 			long start=System.currentTimeMillis();
 			DuplicateFilter df = new DuplicateFilter(field,KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
 			BitSet b = df.bits(r);
 			long end=System.currentTimeMillis()-start;
 			System.out.println(b.cardinality()+" in "+end+" ms ");
 		}
 		finally
 		{
 			r.close();
 		}
 	}
 	public String getFieldName()
 	{
 		return fieldName;
 	}
 	public void setFieldName(String fieldName)
 	{
 		this.fieldName = fieldName;
 	}
 	public int getKeepMode()
 	{
 		return keepMode;
 	}
 	public void setKeepMode(int keepMode)
 	{
 		this.keepMode = keepMode;
 	}
 	public int getProcessingMode()
 	{
 		return processingMode;
 	}
 	public void setProcessingMode(int processingMode)
 	{
 		this.processingMode = processingMode;
 	}
 	/**
 	 * Two filters are equal when they are of the same class and agree on field name,
 	 * keep mode and processing mode (the two processing modes yield different bit sets).
 	 */
 	public boolean equals(Object obj)
 	{
 		if(this == obj)
 			return true;
 		if((obj == null) || (obj.getClass() != this.getClass()))
 			return false;
 		DuplicateFilter other = (DuplicateFilter)obj;
 		return keepMode == other.keepMode &&
 			processingMode == other.processingMode &&
 			(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
 	}
 	/** Hash over the same three properties that equals compares; tolerates a null field name. */
 	public int hashCode()
 	{
 		int hash = 217;
 		hash = 31 * hash + keepMode;
 		hash = 31 * hash + processingMode;
 		hash = 31 * hash + (fieldName == null ? 0 : fieldName.hashCode());
 		return hash;
 	}
 }
--- a/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java
+++ b/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java
@ -0,0 +1,165 @@
 package org.apache.lucene.search;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.util.HashSet;
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.store.RAMDirectory;
 /**
 * Tests DuplicateFilter against a small in-memory index in which several documents
 * share the same value in the "url" key field.
 */
 public class DuplicateFilterTest extends TestCase
 {
 	private static final String KEY_FIELD = "url";
 	private RAMDirectory directory;
 	private IndexReader reader;
 	TermQuery tq=new TermQuery(new Term("text","lucene"));
 	private IndexSearcher searcher;
 	protected void setUp() throws Exception
 	{
 		directory = new RAMDirectory();
 		IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true);
 		//Add series of docs with filterable fields : url, text and dates  flags
 		addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
 		addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
 		addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
 		addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
 		addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
 		addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
 		addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
 		addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
 		writer.close();
 		reader=IndexReader.open(directory);
 		searcher =new IndexSearcher(reader);
 	}
 	protected void tearDown() throws Exception
 	{
 		//release the searcher before the reader it was built on
 		searcher.close();
 		reader.close();
 		directory.close();
 	}
 	private void addDoc(IndexWriter writer, String url, String text, String date) throws IOException
 	{
 		Document doc=new Document();
 		doc.add(new Field(KEY_FIELD,url,Field.Store.YES,Field.Index.UN_TOKENIZED));
 		doc.add(new Field("text",text,Field.Store.YES,Field.Index.TOKENIZED));
 		doc.add(new Field("date",date,Field.Store.YES,Field.Index.TOKENIZED));
 		writer.addDocument(doc);
 	}
 	/**
 	 * Collects the key field value of every hit, failing as soon as a value repeats.
 	 * @return the distinct key field values seen
 	 */
 	private HashSet assertNoDuplicateUrls(Hits h) throws IOException
 	{
 		HashSet results=new HashSet();
 		for(int i=0;i<h.length();i++)
 		{
 			Document d=h.doc(i);
 			String url=d.get(KEY_FIELD);
 			assertFalse("No duplicate urls should be returned",results.contains(url));
 			results.add(url);
 		}
 		return results;
 	}
 	/**
 	 * Looks up the first or last document id listed for the given key field value.
 	 */
 	private int masterDocFor(String url, boolean useLast) throws IOException
 	{
 		TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
 		try
 		{
 			assertTrue("Key term should have at least one document",td.next());
 			int doc=td.doc();
 			if(useLast)
 			{
 				while(td.next())
 				{
 					doc=td.doc();
 				}
 			}
 			return doc;
 		}
 		finally
 		{
 			td.close();
 		}
 	}
 	public void testDefaultFilter() throws Throwable
 	{
 		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
 		Hits h = searcher.search(tq,df);
 		//without this check the duplicate scan below would pass on an empty result
 		assertTrue("Filtered searching should have found some matches",h.length()>0);
 		assertNoDuplicateUrls(h);
 	}
 	public void testNoFilter() throws Throwable
 	{
 		HashSet results=new HashSet();
 		Hits h = searcher.search(tq);
 		assertTrue("Default searching should have found some matches",h.length()>0);
 		boolean dupsFound=false;
 		for(int i=0;i<h.length();i++)
 		{
 			Document d=h.doc(i);
 			String url=d.get(KEY_FIELD);
 			if(!dupsFound)
 				dupsFound=results.contains(url);
 			results.add(url);
 		}
 		assertTrue("Default searching should have found duplicate urls",dupsFound);
 	}
 	public void testFastFilter() throws Throwable
 	{
 		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
 		df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
 		Hits h = searcher.search(tq,df);
 		assertTrue("Filtered searching should have found some matches",h.length()>0);
 		HashSet results=assertNoDuplicateUrls(h);
 		assertEquals("Two urls found",2, results.size());
 	}
 	public void testKeepsLastFilter() throws Throwable
 	{
 		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
 		df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
 		Hits h = searcher.search(tq,df);
 		assertTrue("Filtered searching should have found some matches",h.length()>0);
 		for(int i=0;i<h.length();i++)
 		{
 			Document d=h.doc(i);
 			String url=d.get(KEY_FIELD);
 			int lastDoc=masterDocFor(url,true);
 			assertEquals("Duplicate urls should return last doc",lastDoc, h.id((i)));
 		}
 	}
 	public void testKeepsFirstFilter() throws Throwable
 	{
 		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
 		df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
 		Hits h = searcher.search(tq,df);
 		assertTrue("Filtered searching should have found some matches",h.length()>0);
 		for(int i=0;i<h.length();i++)
 		{
 			Document d=h.doc(i);
 			String url=d.get(KEY_FIELD);
 			int firstDoc=masterDocFor(url,false);
 			assertEquals("Duplicate urls should return first doc",firstDoc, h.id((i)));
 		}
 	}
 }
--- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/CorePlusExtensionsParser.java
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/CorePlusExtensionsParser.java
@ -4,6 +4,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.xmlparser.builders.BooleanFilterBuilder;
 import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
 import org.apache.lucene.xmlparser.builders.DuplicateFilterBuilder;
 import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
 import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
 import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;
@ -31,6 +32,7 @@ public class CorePlusExtensionsParser extends CoreParser
 		super(analyzer, parser);
 		filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
 		filterFactory.addBuilder("BooleanFilter",new BooleanFilterBuilder(filterFactory));
 		filterFactory.addBuilder("DuplicateFilter",new DuplicateFilterBuilder());
 		String fields[]={"contents"};
 		queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
 		queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
--- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
@ -0,0 +1,75 @@
 /*
 * Created on 25-Jan-2006
 */
 package org.apache.lucene.xmlparser.builders;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanFilter;
 import org.apache.lucene.search.DuplicateFilter;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.FilterClause;
 import org.apache.lucene.xmlparser.DOMUtils;
 import org.apache.lucene.xmlparser.FilterBuilder;
 import org.apache.lucene.xmlparser.ParserException;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 * Builds a DuplicateFilter from an XML element. The element supplies a "fieldName"
 * attribute plus optional "keepMode" ("first" or "last", default "first") and
 * "processingMode" ("full" or "fast", default "full") attributes; attribute values are
 * matched case-insensitively.
 *
 * @author maharwood 
 */
 public class DuplicateFilterBuilder implements FilterBuilder {
 	/**
 	 * @param e the element describing the filter
 	 * @return the configured DuplicateFilter
 	 * @throws ParserException if keepMode or processingMode holds an unrecognised value
 	 *         (the fieldName lookup may also fail; see DOMUtils.getAttributeWithInheritanceOrFail)
 	 */
 	public Filter getFilter(Element e) throws ParserException {
 		String field=DOMUtils.getAttributeWithInheritanceOrFail(e,"fieldName");
 		DuplicateFilter filter=new DuplicateFilter(field);
 		filter.setKeepMode(parseKeepMode(DOMUtils.getAttribute(e,"keepMode","first")));
 		filter.setProcessingMode(parseProcessingMode(DOMUtils.getAttribute(e,"processingMode","full")));
 		return filter;
 	}
 	/** Maps the keepMode attribute text onto the matching DuplicateFilter KM_ constant. */
 	private static int parseKeepMode(String keepMode) throws ParserException {
 		if(keepMode.equalsIgnoreCase("first")) {
 			return DuplicateFilter.KM_USE_FIRST_OCCURRENCE;
 		}
 		if(keepMode.equalsIgnoreCase("last")) {
 			return DuplicateFilter.KM_USE_LAST_OCCURRENCE;
 		}
 		throw new ParserException("Illegal keepMode attribute in DuplicateFilter:"+keepMode);
 	}
 	/** Maps the processingMode attribute text onto the matching DuplicateFilter PM_ constant. */
 	private static int parseProcessingMode(String processingMode) throws ParserException {
 		if(processingMode.equalsIgnoreCase("full")) {
 			return DuplicateFilter.PM_FULL_VALIDATION;
 		}
 		if(processingMode.equalsIgnoreCase("fast")) {
 			return DuplicateFilter.PM_FAST_INVALIDATION;
 		}
 		throw new ParserException("Illegal processingMode attribute in DuplicateFilter:"+processingMode);
 	}
 }
--- a/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/DuplicateFilterQuery.xml
+++ b/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/DuplicateFilterQuery.xml
@ -0,0 +1,18 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <FilteredQuery>
 	<Query>
 		<BooleanQuery fieldName="contents">
 			<Clause occurs="should">
 				<TermQuery>money</TermQuery>
 			</Clause>
 			<Clause occurs="must">
 				<TermQuery fieldName="date">19870408</TermQuery>
 			</Clause>
 		</BooleanQuery>
 	</Query>	
 	<Filter>
 		<!-- Filters to last document with this date -->
 		<DuplicateFilter fieldName="date" keepMode="last"/>
 	</Filter>
 </FilteredQuery>
--- a/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
+++ b/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
@ -173,6 +173,12 @@ public class TestParser extends TestCase {
 			Query q=parse("CachedFilter.xml");
 			dumpResults("Cached filter", q, 5);
 	}
 	/**
 	 * Parses DuplicateFilterQuery.xml, runs it and expects exactly one hit.
 	 */
 	public void testDuplicateFilterQueryXML() throws ParserException, IOException
 	{
 			Query duplicateFilterQuery=parse("DuplicateFilterQuery.xml");
 			int hitCount=searcher.search(duplicateFilterQuery).length();
 			assertEquals("DuplicateFilterQuery should produce 1 result ", 1,hitCount);
 	}