From 471f90cf825ee3106fef1fa4c1094d0ca461e7fb Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Thu, 15 Sep 2016 15:45:41 -0400 Subject: [PATCH] LUCENE-7439: FuzzyQuery now matches all terms within the specified edit distance, even if they are short --- lucene/CHANGES.txt | 3 +++ .../apache/lucene/search/FuzzyTermsEnum.java | 2 +- .../apache/lucene/search/TopTermsRewrite.java | 4 +++- .../search/FuzzyTermOnShortTermsTest.java | 15 ++++++++------- .../apache/lucene/search/TestFuzzyQuery.java | 10 ++++------ .../sandbox/queries/TestSlowFuzzyQuery.java | 19 +++++++++++++++---- 6 files changed, 34 insertions(+), 19 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 964719a3445..522da2f0c2b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -32,6 +32,9 @@ Bug Fixes Improvements +* LUCENE-7439: FuzzyQuery now matches all terms within the specified + edit distance, even if they are short terms (Mike McCandless) + Optimizations Other diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index 66a64e14527..37f16b42f03 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -350,7 +350,7 @@ public class FuzzyTermsEnum extends TermsEnum { final int codePointCount = UnicodeUtil.codePointCount(term); final float similarity = 1.0f - ((float) ed / (float) (Math.min(codePointCount, termLength))); - if (similarity > minSimilarity) { + if (minSimilarity == 0 || similarity > minSimilarity) { boostAtt.setBoost((similarity - minSimilarity) * scale_factor); //System.out.println(" yes"); return AcceptStatus.YES; diff --git a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java index 013171d5a72..b75836e16b7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java 
+++ b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -160,7 +160,9 @@ public abstract class TopTermsRewrite extends TermCollectingRewrite { for (final ScoreTerm st : scoreTerms) { final Term term = new Term(query.field, st.bytes.toBytesRef()); - addClause(b, term, st.termState.docFreq(), st.boost, st.termState); // add to query + // We allow negative term scores (fuzzy query does this, for example) while collecting the terms, + // but truncate such boosts to 0.0f when building the query: + addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState); // add to query } return build(b); } diff --git a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java index 427888b5a49..faf4552586a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java +++ b/lucene/core/src/test/org/apache/lucene/search/FuzzyTermOnShortTermsTest.java @@ -49,15 +49,16 @@ public class FuzzyTermOnShortTermsTest extends LuceneTestCase { countHits(a, new String[]{"abcde"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1); countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "abcde"), 2), 1); - //these don't - countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 0); - countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 0); + // LUCENE-7439: these now work as well: - countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 0); - countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 0); + countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "a"), 1), 1); + countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "ab"), 1), 1); + + countHits(a, new String[]{"abc"}, new FuzzyQuery(new Term(FIELD, "a"), 2), 1); + countHits(a, new String[]{"a"}, new FuzzyQuery(new Term(FIELD, "abc"), 2), 1); - countHits(a, new 
String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 0); - countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 0); + countHits(a, new String[]{"abcd"}, new FuzzyQuery(new Term(FIELD, "ab"), 2), 1); + countHits(a, new String[]{"ab"}, new FuzzyQuery(new Term(FIELD, "abcd"), 2), 1); } private void countHits(Analyzer analyzer, String[] docs, Query q, int expected) throws Exception { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java index 1e90525d891..62e63eaa8c6 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -543,12 +543,10 @@ public class TestFuzzyQuery extends LuceneTestCase { continue; } int ed = getDistance(term, queryTerm); - if (Math.min(queryTerm.length(), term.length()) > ed) { - float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length()); - while (ed < 3) { - expected[ed].add(new TermAndScore(term, score)); - ed++; - } + float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length()); + while (ed < 3) { + expected[ed].add(new TermAndScore(term, score)); + ed++; } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java index 922213fdc74..3ff1f3bde99 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java @@ -16,9 +16,11 @@ */ package org.apache.lucene.sandbox.queries; -import java.util.List; -import java.util.Arrays; import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -472,8 
+474,17 @@ public class TestSlowFuzzyQuery extends LuceneTestCase { q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50); hits = searcher.search(q, 10).scoreDocs; assertEquals(2, hits.length); - assertEquals("test", searcher.doc(hits[0].doc).get("field")); - assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); + + // We cannot expect a particular order since both hits have a 0.0 score: + Set actual = new HashSet<>(); + actual.add(searcher.doc(hits[0].doc).get("field")); + actual.add(searcher.doc(hits[1].doc).get("field")); + + Set expected = new HashSet<>(); + expected.add("test"); + expected.add("foobar"); + + assertEquals(expected, actual); reader.close(); index.close();