LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short

This commit is contained in:
Mike McCandless 2016-09-15 15:45:41 -04:00
parent bd9962aba6
commit 471f90cf82
6 changed files with 34 additions and 19 deletions

View File

@ -32,6 +32,9 @@ Bug Fixes
Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
edit distance, even if they are short terms (Mike McCandless)
Optimizations
Other

View File

@ -350,7 +350,7 @@ public class FuzzyTermsEnum extends TermsEnum {
final int codePointCount = UnicodeUtil.codePointCount(term);
final float similarity = 1.0f - ((float) ed / (float)
(Math.min(codePointCount, termLength)));
if (similarity > minSimilarity) {
if (minSimilarity == 0 || similarity > minSimilarity) {
boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
//System.out.println(" yes");
return AcceptStatus.YES;

View File

@ -160,7 +160,9 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
for (final ScoreTerm st : scoreTerms) {
final Term term = new Term(query.field, st.bytes.toBytesRef());
addClause(b, term, st.termState.docFreq(), st.boost, st.termState); // add to query
// We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
// but truncate such boosts to 0.0f when building the query:
addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState); // add to query
}
return build(b);
}

View File

@ -49,15 +49,16 @@ public class FuzzyTermOnShortTermsTest extends LuceneTestCase {
countHits(a, new String[]{"abcde"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "abcde"), 2), 1);
//these don't
countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 0);
countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 0);
// LUCENE-7439: these now work as well:
countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 0);
countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 0);
countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 1);
countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 1);
countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 0);
countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 0);
countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 1);
countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1);
countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 1);
countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 1);
}
private void countHits(Analyzer analyzer, String[] docs, Query q, int expected) throws Exception {

View File

@ -543,14 +543,12 @@ public class TestFuzzyQuery extends LuceneTestCase {
continue;
}
int ed = getDistance(term, queryTerm);
if (Math.min(queryTerm.length(), term.length()) > ed) {
float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
while (ed < 3) {
expected[ed].add(new TermAndScore(term, score));
ed++;
}
}
}
for(int ed=0;ed<3;ed++) {
Collections.sort(expected[ed]);

View File

@ -16,9 +16,11 @@
*/
package org.apache.lucene.sandbox.queries;
import java.util.List;
import java.util.Arrays;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -472,8 +474,17 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
hits = searcher.search(q, 10).scoreDocs;
assertEquals(2, hits.length);
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
// We cannot expect a particular order since both hits have a 0.0 score:
Set<String> actual = new HashSet<>();
actual.add(searcher.doc(hits[0].doc).get("field"));
actual.add(searcher.doc(hits[1].doc).get("field"));
Set<String> expected = new HashSet<>();
expected.add("test");
expected.add("foobar");
assertEquals(expected, actual);
reader.close();
index.close();