LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short

This commit is contained in:
Mike McCandless 2016-09-15 15:45:41 -04:00
parent bd9962aba6
commit 471f90cf82
6 changed files with 34 additions and 19 deletions

View File

@ -32,6 +32,9 @@ Bug Fixes
Improvements Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
edit distance, even if they are short terms (Mike McCandless)
Optimizations Optimizations
Other Other

View File

@ -350,7 +350,7 @@ public class FuzzyTermsEnum extends TermsEnum {
final int codePointCount = UnicodeUtil.codePointCount(term); final int codePointCount = UnicodeUtil.codePointCount(term);
final float similarity = 1.0f - ((float) ed / (float) final float similarity = 1.0f - ((float) ed / (float)
(Math.min(codePointCount, termLength))); (Math.min(codePointCount, termLength)));
if (similarity > minSimilarity) { if (minSimilarity == 0 || similarity > minSimilarity) {
boostAtt.setBoost((similarity - minSimilarity) * scale_factor); boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
//System.out.println(" yes"); //System.out.println(" yes");
return AcceptStatus.YES; return AcceptStatus.YES;

View File

@ -160,7 +160,9 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
for (final ScoreTerm st : scoreTerms) { for (final ScoreTerm st : scoreTerms) {
final Term term = new Term(query.field, st.bytes.toBytesRef()); final Term term = new Term(query.field, st.bytes.toBytesRef());
addClause(b, term, st.termState.docFreq(), st.boost, st.termState); // add to query // We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
// but truncate such boosts to 0.0f when building the query:
addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState); // add to query
} }
return build(b); return build(b);
} }

View File

@ -49,15 +49,16 @@ public class FuzzyTermOnShortTermsTest extends LuceneTestCase {
countHits(a, new String[]{"abcde"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1); countHits(a, new String[]{"abcde"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "abcde"), 2), 1); countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "abcde"), 2), 1);
//these don't // LUCENE-7439: these now work as well:
countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 0);
countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 0);
countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 0); countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 1);
countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 0); countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 1);
countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 1);
countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 0); countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 1);
countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 0); countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 1);
} }
private void countHits(Analyzer analyzer, String[] docs, Query q, int expected) throws Exception { private void countHits(Analyzer analyzer, String[] docs, Query q, int expected) throws Exception {

View File

@ -543,12 +543,10 @@ public class TestFuzzyQuery extends LuceneTestCase {
continue; continue;
} }
int ed = getDistance(term, queryTerm); int ed = getDistance(term, queryTerm);
if (Math.min(queryTerm.length(), term.length()) > ed) { float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length()); while (ed < 3) {
while (ed < 3) { expected[ed].add(new TermAndScore(term, score));
expected[ed].add(new TermAndScore(term, score)); ed++;
ed++;
}
} }
} }

View File

@ -16,9 +16,11 @@
*/ */
package org.apache.lucene.sandbox.queries; package org.apache.lucene.sandbox.queries;
import java.util.List;
import java.util.Arrays;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
@ -472,8 +474,17 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50); q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
hits = searcher.search(q, 10).scoreDocs; hits = searcher.search(q, 10).scoreDocs;
assertEquals(2, hits.length); assertEquals(2, hits.length);
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); // We cannot expect a particular order since both hits have a 0.0 score:
Set<String> actual = new HashSet<>();
actual.add(searcher.doc(hits[0].doc).get("field"));
actual.add(searcher.doc(hits[1].doc).get("field"));
Set<String> expected = new HashSet<>();
expected.add("test");
expected.add("foobar");
assertEquals(expected, actual);
reader.close(); reader.close();
index.close(); index.close();