From 6248e1451517ce5bba0a02ee4a0d3acc3042f836 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Fri, 27 Feb 2009 14:07:12 +0000 Subject: [PATCH] LUCENE-1548: fix distance normalization in LevenshteinDistance to not produce negative distances git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@748534 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/CHANGES.txt | 3 +++ .../search/spell/LevensteinDistance.java | 2 +- .../search/spell/TestLevenshteinDistance.java | 18 +++++++++--------- .../lucene/search/spell/TestSpellChecker.java | 14 +++++++++----- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 53f1af96d9a..5f02868c170 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -25,6 +25,9 @@ Bug fixes 4. LUCENE-1514: ShingleMatrixFilter#next(Token) easily throws a StackOverflowException due to recursive invocation. (Karl Wettin) + 5. LUCENE-1548: Fix distance normalization in LevenshteinDistance to + not produce negative distances (Thomas Morton via Mike McCandless) + New features 1. LUCENE-1470: Added TrieRangeQuery, a much faster implementation of diff --git a/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java b/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java index a17f5b0f424..4f8632632d6 100755 --- a/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java +++ b/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LevensteinDistance.java @@ -100,7 +100,7 @@ public final class LevensteinDistance implements StringDistance { // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts - return 1.0f - ((float) p[n] / Math.min(other.length(), sa.length)); + return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length)); } } diff --git a/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java b/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java index 695e3ee8142..14856685bf4 100644 --- a/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java +++ b/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java @@ -25,25 +25,25 @@ public class TestLevenshteinDistance extends TestCase { public void testGetDistance() { float d = sd.getDistance("al", "al"); - assertTrue(d == 1.0f); + assertEquals(d,1.0f,0.001); d = sd.getDistance("martha", "marhta"); - assertTrue(d > 0.66 && d <0.67); + assertEquals(d,0.6666,0.001); d = sd.getDistance("jones", "johnson"); - assertTrue(d > 0.199 && d < 0.201); + assertEquals(d,0.4285,0.001); d = sd.getDistance("abcvwxyz", "cabvwxyz"); - assertTrue(d > 0.749 && d < 0.751); + assertEquals(d,0.75,0.001); d = sd.getDistance("dwayne", "duane"); - assertTrue(d > 0.599 && d < 0.601); + assertEquals(d,0.666,0.001); d = sd.getDistance("dixon", "dicksonx"); - assertTrue(d > 0.199 && d < 0.201); + assertEquals(d,0.5,0.001); d = sd.getDistance("six", "ten"); - assertTrue(d == 0f); + assertEquals(d,0,0.001); float d1 = sd.getDistance("zac ephron", "zac efron"); float d2 = sd.getDistance("zac ephron", "kai ephron"); - assertTrue(d1 < d2); + assertEquals(d1,d2,0.001); d1 = sd.getDistance("brittney spears", "britney spears"); d2 = sd.getDistance("brittney spears", "brittney startzman"); - assertTrue(d1 > d2); + assertTrue(d1 > d2); } } diff --git a/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java index 2f3f83b6e18..b11b38a41d6 100755 --- a/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java +++ b/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java @@ -129,20 +129,23 @@ public class TestSpellChecker extends TestCase { assertEquals(similar[0], "five"); similar = spellChecker.suggestSimilar("ive", 2); - assertEquals(1, similar.length); + assertEquals(2, similar.length); assertEquals(similar[0], "five"); + assertEquals(similar[1], "nine"); similar = spellChecker.suggestSimilar("fives", 2); assertEquals(1, similar.length); assertEquals(similar[0], "five"); similar = spellChecker.suggestSimilar("fie", 2); + assertEquals(2, similar.length); + assertEquals(similar[0], "five"); + assertEquals(similar[1], "nine"); + + similar = spellChecker.suggestSimilar("fi", 2); assertEquals(1, similar.length); assertEquals(similar[0], "five"); - similar = spellChecker.suggestSimilar("fi", 2); - assertEquals(0, similar.length); - // test restraint to a field similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false); assertEquals(0, similar.length); // there isn't the term thousand in the field field1 @@ -151,8 +154,9 @@ public class TestSpellChecker extends TestCase { assertEquals(1, similar.length); // there is the term thousand in the field field2 similar = spellChecker.suggestSimilar("onety", 2); - assertEquals(1, similar.length); + assertEquals(2, similar.length); assertEquals(similar[0], "ninety"); + assertEquals(similar[1], "one"); try { similar = spellChecker.suggestSimilar("tousand", 10, r, null, false); } catch (NullPointerException e) {