From 62fa7b4b823edabdd97c15182ae6b11e7f3e2f9e Mon Sep 17 00:00:00 2001
From: Mark Harwood <mharwood@apache.org>
Date: Tue, 2 Oct 2007 22:56:46 +0000
Subject: [PATCH] Added new DuplicateFilter functionality to filter documents
 sharing a field value (e.g. primary key/url)

Also includes Junit test and XML Query support

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@581426 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/search/DuplicateFilter.java | 245 ++++++++++++++++++
 .../lucene/search/DuplicateFilterTest.java    | 165 ++++++++++++
 .../xmlparser/CorePlusExtensionsParser.java   |   2 +
 .../builders/DuplicateFilterBuilder.java      |  75 ++++++
 .../lucene/xmlparser/DuplicateFilterQuery.xml |  18 ++
 .../apache/lucene/xmlparser/TestParser.java   |   6 +
 6 files changed, 511 insertions(+)
 create mode 100644 contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
 create mode 100644 contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java
 create mode 100644 contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
 create mode 100644 contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/DuplicateFilterQuery.xml

diff --git a/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java b/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
new file mode 100644
index 00000000000..ba8dffde32d
--- /dev/null
+++ b/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
@@ -0,0 +1,245 @@
+package org.apache.lucene.search;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.BitSet;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+
+public class DuplicateFilter extends Filter
+{
+	
+	String fieldName;
+	
+	/**
+	 * KeepMode determines which document id to consider as the master, all others being 
+	 * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
+	 */
+	int keepMode=KM_USE_FIRST_OCCURRENCE;
+	public static final int KM_USE_FIRST_OCCURRENCE=1;
+	public static final int KM_USE_LAST_OCCURRENCE=2;
+	
+	/**
+	 * "Full" processing mode starts by setting all bits to false and only setting bits
+	 * for documents that contain the given field and are identified as none-duplicates. 
+
+	 * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
+	 * given field. This approach avoids the need to read TermDocs for terms that are seen 
+	 * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially 
+	 * faster approach , the downside is that bitsets produced will include bits set for 
+	 * documents that do not actually contain the field given.
+	 * 
+	 */
+	int processingMode=PM_FULL_VALIDATION;
+	public static final int PM_FULL_VALIDATION=1;
+	public static final int PM_FAST_INVALIDATION=2;
+	
+
+	
+	public DuplicateFilter(String fieldName)
+	{
+		this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
+	}
+	
+
+	public DuplicateFilter(String fieldName, int keepMode, int processingMode)
+	{
+		this.fieldName = fieldName;
+		this.keepMode = keepMode;
+		this.processingMode = processingMode;
+	}
+
+	public BitSet bits(IndexReader reader) throws IOException
+	{
+		if(processingMode==PM_FAST_INVALIDATION)
+		{
+			return fastBits(reader);
+		}
+		else
+		{
+			return correctBits(reader);
+		}
+	}
+	
+	private BitSet correctBits(IndexReader reader) throws IOException
+	{
+		
+		BitSet bits=new BitSet(reader.maxDoc()); //assume all are INvalid
+		Term startTerm=new Term(fieldName,"");
+		TermEnum te = reader.terms(startTerm);
+		if(te!=null)
+		{
+			Term currTerm=te.term();
+			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
+			{
+				int lastDoc=-1;
+				//set non duplicates
+				TermDocs td = reader.termDocs(currTerm);
+				if(td.next())
+				{
+					if(keepMode==KM_USE_FIRST_OCCURRENCE)
+					{
+						bits.set(td.doc());
+					}
+					else
+					{
+						do
+						{
+							lastDoc=td.doc();
+						}while(td.next());
+						bits.set(lastDoc);
+					}
+				}
+				if(!te.next())
+				{
+					break;
+				}
+				currTerm=te.term();
+			}
+		}
+		return bits;
+	}
+	
+	private BitSet fastBits(IndexReader reader) throws IOException
+	{
+		
+		BitSet bits=new BitSet(reader.maxDoc());
+		bits.set(0,reader.maxDoc()); //assume all are valid
+		Term startTerm=new Term(fieldName,"");
+		TermEnum te = reader.terms(startTerm);
+		if(te!=null)
+		{
+			Term currTerm=te.term();
+			
+			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
+			{
+				if(te.docFreq()>1)
+				{
+					int lastDoc=-1;
+					//unset potential duplicates
+					TermDocs td = reader.termDocs(currTerm);
+					td.next();
+					if(keepMode==KM_USE_FIRST_OCCURRENCE)
+					{
+						td.next();
+					}
+					do
+					{
+						lastDoc=td.doc();
+						bits.set(lastDoc,false);
+					}while(td.next());
+					if(keepMode==KM_USE_LAST_OCCURRENCE)
+					{
+						//restore the last bit
+						bits.set(lastDoc);
+					}					
+				}
+				if(!te.next())
+				{
+					break;
+				}
+				currTerm=te.term();
+			}
+		}
+		return bits;
+	}
+
+	/**
+	 * @param args
+	 * @throws IOException 
+	 * @throws Exception 
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		IndexReader r=IndexReader.open("/indexes/personCentricAnon");
+//		IndexReader r=IndexReader.open("/indexes/enron");
+		long start=System.currentTimeMillis();
+//		DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
+//		DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+		DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+//		DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
+//		df.setProcessingMode(PM_SLOW_VALIDATION);
+		BitSet b = df.bits(r);
+		long end=System.currentTimeMillis()-start;
+		System.out.println(b.cardinality()+" in "+end+" ms ");
+
+	}
+
+
+	public String getFieldName()
+	{
+		return fieldName;
+	}
+
+
+	public void setFieldName(String fieldName)
+	{
+		this.fieldName = fieldName;
+	}
+
+
+	public int getKeepMode()
+	{
+		return keepMode;
+	}
+
+
+	public void setKeepMode(int keepMode)
+	{
+		this.keepMode = keepMode;
+	}
+
+
+	public boolean equals(Object obj)
+	{
+		if(this == obj)
+			return true;
+		if((obj == null) || (obj.getClass() != this.getClass()))
+			return false;
+		DuplicateFilter other = (DuplicateFilter)obj;
+		return keepMode == other.keepMode &&
+			(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
+	}
+
+
+
+	public int hashCode()
+	{
+		int hash = 217;
+		hash = 31 * hash + keepMode;
+		hash = 31 * hash + fieldName.hashCode();
+		return hash;	
+	}
+
+
+	public int getProcessingMode()
+	{
+		return processingMode;
+	}
+
+
+	public void setProcessingMode(int processingMode)
+	{
+		this.processingMode = processingMode;
+	}
+	
+	
+
+}
diff --git a/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java b/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java
new file mode 100644
index 00000000000..f544c8f47f5
--- /dev/null
+++ b/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java
@@ -0,0 +1,165 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.store.RAMDirectory;
+
+public class DuplicateFilterTest extends TestCase
+{
+	private static final String KEY_FIELD = "url";
+	private RAMDirectory directory;
+	private IndexReader reader;
+	TermQuery tq=new TermQuery(new Term("text","lucene"));
+	private IndexSearcher searcher;
+
+	protected void setUp() throws Exception
+	{
+		directory = new RAMDirectory();
+		IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true);
+		
+		//Add series of docs with filterable fields : url, text and dates  flags
+		addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
+		addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
+		addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");		
+		addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
+		addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
+		addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
+		addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
+		addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
+		
+		writer.close();
+		reader=IndexReader.open(directory);			
+		searcher =new IndexSearcher(reader);
+		
+	}
+	
+	protected void tearDown() throws Exception
+	{
+		reader.close();
+		searcher.close();
+		directory.close();
+	}
+
+	private void addDoc(IndexWriter writer, String url, String text, String date) throws IOException
+	{
+		Document doc=new Document();
+		doc.add(new Field(KEY_FIELD,url,Field.Store.YES,Field.Index.UN_TOKENIZED));
+		doc.add(new Field("text",text,Field.Store.YES,Field.Index.TOKENIZED));
+		doc.add(new Field("date",date,Field.Store.YES,Field.Index.TOKENIZED));
+		writer.addDocument(doc);
+	}
+		
+	public void testDefaultFilter() throws Throwable
+	{
+		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);		
+		HashSet results=new HashSet();
+		Hits h = searcher.search(tq,df);
+		for(int i=0;i<h.length();i++)
+		{
+			Document d=h.doc(i);
+			String url=d.get(KEY_FIELD);
+			assertFalse("No duplicate urls should be returned",results.contains(url));
+			results.add(url);
+		}
+	}
+	public void testNoFilter() throws Throwable
+	{
+		HashSet results=new HashSet();
+		Hits h = searcher.search(tq);
+		assertTrue("Default searching should have found some matches",h.length()>0);
+		boolean dupsFound=false;
+		for(int i=0;i<h.length();i++)
+		{
+			Document d=h.doc(i);
+			String url=d.get(KEY_FIELD);
+			if(!dupsFound)
+				dupsFound=results.contains(url);
+			results.add(url);
+		}
+		assertTrue("Default searching should have found duplicate urls",dupsFound);
+	}
+	
+	public void testFastFilter() throws Throwable
+	{
+		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
+		df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
+		HashSet results=new HashSet();
+		Hits h = searcher.search(tq,df);
+		assertTrue("Filtered searching should have found some matches",h.length()>0);
+		for(int i=0;i<h.length();i++)
+		{
+			Document d=h.doc(i);
+			String url=d.get(KEY_FIELD);
+			assertFalse("No duplicate urls should be returned",results.contains(url));
+			results.add(url);
+		}
+		assertEquals("Two urls found",2, results.size());
+	}	
+	public void testKeepsLastFilter() throws Throwable
+	{
+		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
+		df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
+		Hits h = searcher.search(tq,df);
+		assertTrue("Filtered searching should have found some matches",h.length()>0);
+		for(int i=0;i<h.length();i++)
+		{
+			Document d=h.doc(i);
+			String url=d.get(KEY_FIELD);
+			TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
+			int lastDoc=0;
+			while(td.next())
+			{
+				lastDoc=td.doc();
+			}
+			assertEquals("Duplicate urls should return last doc",lastDoc, h.id((i)));
+		}
+	}	
+	
+	
+	public void testKeepsFirstFilter() throws Throwable
+	{
+		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
+		df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
+		Hits h = searcher.search(tq,df);
+		assertTrue("Filtered searching should have found some matches",h.length()>0);
+		for(int i=0;i<h.length();i++)
+		{
+			Document d=h.doc(i);
+			String url=d.get(KEY_FIELD);
+			TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
+			int lastDoc=0;
+			td.next();
+			lastDoc=td.doc();
+			assertEquals("Duplicate urls should return first doc",lastDoc, h.id((i)));
+		}
+	}	
+	
+	
+}
diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/CorePlusExtensionsParser.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/CorePlusExtensionsParser.java
index 1e97c9fc3a8..532cea8fa38 100644
--- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/CorePlusExtensionsParser.java
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/CorePlusExtensionsParser.java
@@ -4,6 +4,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.xmlparser.builders.BooleanFilterBuilder;
 import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
+import org.apache.lucene.xmlparser.builders.DuplicateFilterBuilder;
 import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
 import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
 import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;
@@ -31,6 +32,7 @@ public class CorePlusExtensionsParser extends CoreParser
 		super(analyzer, parser);
 		filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
 		filterFactory.addBuilder("BooleanFilter",new BooleanFilterBuilder(filterFactory));
+		filterFactory.addBuilder("DuplicateFilter",new DuplicateFilterBuilder());
 		String fields[]={"contents"};
 		queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
 		queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
new file mode 100644
index 00000000000..9833996fa6a
--- /dev/null
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
@@ -0,0 +1,75 @@
+/*
+ * Created on 25-Jan-2006
+ */
+package org.apache.lucene.xmlparser.builders;
+
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanFilter;
+import org.apache.lucene.search.DuplicateFilter;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.FilterClause;
+import org.apache.lucene.xmlparser.DOMUtils;
+import org.apache.lucene.xmlparser.FilterBuilder;
+import org.apache.lucene.xmlparser.ParserException;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @author maharwood 
+ */
+public class DuplicateFilterBuilder implements FilterBuilder {
+	
+
+	public Filter getFilter(Element e) throws ParserException {
+        String fieldName=DOMUtils.getAttributeWithInheritanceOrFail(e,"fieldName");
+		DuplicateFilter df=new DuplicateFilter(fieldName);
+		String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
+		if(keepMode.equalsIgnoreCase("first"))
+		{
+			df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
+		}
+		else
+			if(keepMode.equalsIgnoreCase("last"))
+			{
+				df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
+			}
+			else
+			{
+				throw new ParserException("Illegal keepMode attribute in DuplicateFilter:"+keepMode);
+			}
+		String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
+		if(processingMode.equalsIgnoreCase("full"))
+		{
+			df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
+		}
+		else
+			if(processingMode.equalsIgnoreCase("fast"))
+			{
+				df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
+			}
+			else
+			{
+				throw new ParserException("Illegal processingMode attribute in DuplicateFilter:"+processingMode);
+			}
+					
+		return df;
+	}
+
+}
diff --git a/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/DuplicateFilterQuery.xml b/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/DuplicateFilterQuery.xml
new file mode 100644
index 00000000000..7cb6886f83c
--- /dev/null
+++ b/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/DuplicateFilterQuery.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<FilteredQuery>
+	<Query>
+		<BooleanQuery fieldName="contents">
+			<Clause occurs="should">
+				<TermQuery>money</TermQuery>
+			</Clause>
+			<Clause occurs="must">
+				<TermQuery fieldName="date">19870408</TermQuery>
+			</Clause>
+		</BooleanQuery>
+	</Query>	
+	<Filter>
+		<!-- Filters to last document with this date -->
+		<DuplicateFilter fieldName="date" keepMode="last"/>
+	</Filter>
+	
+</FilteredQuery>
diff --git a/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java b/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
index c54794bad61..e55867fd360 100644
--- a/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
+++ b/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
@@ -173,6 +173,12 @@ public class TestParser extends TestCase {
 			Query q=parse("CachedFilter.xml");
 			dumpResults("Cached filter", q, 5);
 	}
+	public void testDuplicateFilterQueryXML() throws ParserException, IOException
+	{
+			Query q=parse("DuplicateFilterQuery.xml");
+			Hits h = searcher.search(q);
+			assertEquals("DuplicateFilterQuery should produce 1 result ", 1,h.length());
+	}