mirror of https://github.com/apache/lucene.git
Fixed a bug in FuzzyLikeThisQuery.java: a query term with no fuzzy variants caused the query-construction logic to exit its loop early, so no fuzzy variants were produced for any subsequent terms in the query string.
Added a JUnit test that recreates the problem conditions, plus a fix to FuzzyLikeThisQuery that resolves the issue. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@699512 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 0e708a21fe
commit ba6344a4d7
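The gist of the change below: when a query term produced zero fuzzy variants (for example "fernando" in the test's "fernando smith" query), the per-term loop used break and abandoned every remaining term; the fix only skips the ranking step for that term and keeps iterating. A minimal, self-contained sketch of that control-flow difference, in plain Java with no Lucene dependencies; variantsFor() and the hard-coded terms are illustrative stand-ins, not the actual FuzzyLikeThisQuery internals:

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class BreakVsSkipSketch {
    // Stand-in for the fuzzy-term enumeration: "smith" has variants, "fernando" has none.
    static List<String> variantsFor(String term) {
        return term.equals("smith") ? Arrays.asList("smyth", "smythe")
                                    : Collections.<String>emptyList();
    }

    public static void main(String[] args) {
        List<String> queryTerms = Arrays.asList("fernando", "smith");

        // Buggy behaviour: the first term has no variants, so break exits the
        // whole loop and "smith" is never expanded at all.
        for (String term : queryTerms) {
            List<String> variants = variantsFor(term);
            if (variants.isEmpty()) {
                break; // abandons all remaining query terms
            }
            System.out.println("buggy: " + term + " -> " + variants);
        }

        // Fixed behaviour: only the ranking/expansion step is skipped for the
        // variant-less term; later terms are still processed.
        for (String term : queryTerms) {
            List<String> variants = variantsFor(term);
            if (!variants.isEmpty()) {
                System.out.println("fixed: " + term + " -> " + variants);
            }
        }
    }
}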
FuzzyLikeThisQuery.java
@@ -141,26 +141,25 @@ public class FuzzyLikeThisQuery extends Query
                     }
                 }
                 while(fe.next());
-                if(numVariants==0)
+                if(numVariants>0)
                 {
-                    //no variants to rank here
-                    break;
+                    int avgDf=totalVariantDocFreqs/numVariants;
+                    if(df==0)//no direct match we can use as df for all variants
+                    {
+                        df=avgDf; //use avg df of all variants
+                    }
+
+                    // take the top variants (scored by edit distance) and reset the score
+                    // to include an IDF factor then add to the global queue for ranking
+                    // overall top query terms
+                    int size = variantsQ.size();
+                    for(int i = 0; i < size; i++)
+                    {
+                      ScoreTerm st = (ScoreTerm) variantsQ.pop();
+                      st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
+                      q.insert(st);
+                    }
                 }
-                int avgDf=totalVariantDocFreqs/numVariants;
-                if(df==0)//no direct match we can use as df for all variants
-                {
-                    df=avgDf; //use avg df of all variants
-                }
-
-                // take the top variants (scored by edit distance) and reset the score
-                // to include an IDF factor then add to the global queue for ranking overall top query terms
-                int size = variantsQ.size();
-                for(int i = 0; i < size; i++)
-                {
-                  ScoreTerm st = (ScoreTerm) variantsQ.pop();
-                  st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
-                  q.insert(st);
-                }
             }
         }
     }
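A note on the ranking step in the hunk above: each surviving variant's edit-distance similarity is squared and then multiplied by an IDF factor, so that a rare misspelling does not outrank a close match. A small sanity-check sketch of that arithmetic, assuming the classic DefaultSimilarity idf of 1 + ln(numDocs / (docFreq + 1)); the actual Similarity in use may differ, and the sample numbers are made up:

public class FuzzyScoreSketch {
    // Assumed idf, matching Lucene's classic DefaultSimilarity; not taken from this commit.
    static float idf(int docFreq, int numDocs) {
        return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
    }

    public static void main(String[] args) {
        int corpusNumDocs = 6;   // e.g. the six documents indexed by the test below
        int df = 2;              // doc freq of the direct match (or the variants' average)
        float editSim = 0.75f;   // hypothetical FuzzyTermEnum.difference() for one variant

        float score = (editSim * editSim) * idf(df, corpusNumDocs);
        System.out.println("variant score = " + score); // 0.5625 * (1 + ln 2), roughly 0.95
    }
}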
FuzzyLikeThisQueryTest.java (new file)
@@ -0,0 +1,114 @@
package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.RAMDirectory;

public class FuzzyLikeThisQueryTest extends TestCase
{
    private RAMDirectory directory;
    private IndexSearcher searcher;
    private Analyzer analyzer=new WhitespaceAnalyzer();

    protected void setUp() throws Exception
    {
        directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, analyzer, true, MaxFieldLength.UNLIMITED);

        //Add series of docs with misspelt names
        addDoc(writer, "jonathon smythe", "1");
        addDoc(writer, "jonathan smith", "2");
        addDoc(writer, "johnathon smyth", "3");
        addDoc(writer, "johnny smith", "4");
        addDoc(writer, "jonny smith", "5");
        addDoc(writer, "johnathon smythe", "6");

        writer.close();
        searcher=new IndexSearcher(directory);
    }

    private void addDoc(IndexWriter writer, String name, String id) throws IOException
    {
        Document doc=new Document();
        doc.add(new Field("name",name,Field.Store.YES,Field.Index.ANALYZED));
        doc.add(new Field("id",id,Field.Store.YES,Field.Index.ANALYZED));
        writer.addDocument(doc);
    }

    //Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
    public void testClosestEditDistanceMatchComesFirst() throws Throwable
    {
        FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
        flt.addTerms("smith", "name", 0.3f, 1);
        Query q=flt.rewrite(searcher.getIndexReader());
        HashSet queryTerms=new HashSet();
        q.extractTerms(queryTerms);
        assertTrue("Should have variant smythe",queryTerms.contains(new Term("name","smythe")));
        assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
        assertTrue("Should have variant smyth",queryTerms.contains(new Term("name","smyth")));
        TopDocs topDocs = searcher.search(flt, 1);
        ScoreDoc[] sd = topDocs.scoreDocs;
        assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
        Document doc=searcher.doc(sd[0].doc);
        assertEquals("Should match most similar not most rare variant", "2",doc.get("id"));
    }

    //Test multiple input words are having variants produced
    public void testMultiWord() throws Throwable
    {
        FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
        flt.addTerms("jonathin smoth", "name", 0.3f, 1);
        Query q=flt.rewrite(searcher.getIndexReader());
        HashSet queryTerms=new HashSet();
        q.extractTerms(queryTerms);
        assertTrue("Should have variant jonathan",queryTerms.contains(new Term("name","jonathan")));
        assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
        TopDocs topDocs = searcher.search(flt, 1);
        ScoreDoc[] sd = topDocs.scoreDocs;
        assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
        Document doc=searcher.doc(sd[0].doc);
        assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
    }

    //Test bug found when first query word does not match anything
    public void testNoMatchFirstWordBug() throws Throwable
    {
        FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
        flt.addTerms("fernando smith", "name", 0.3f, 1);
        Query q=flt.rewrite(searcher.getIndexReader());
        HashSet queryTerms=new HashSet();
        q.extractTerms(queryTerms);
        assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
        TopDocs topDocs = searcher.search(flt, 1);
        ScoreDoc[] sd = topDocs.scoreDocs;
        assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
        Document doc=searcher.doc(sd[0].doc);
        assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
    }
}