LUCENE-1297 - Allow other string distance measures for the SpellChecker

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@669085 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2008-06-18 05:01:57 +00:00
parent 4adc3f7ba6
commit ce19aa28c0
8 changed files with 331 additions and 27 deletions

View File

@ -186,6 +186,9 @@ New features
16. LUCENE-1298: MoreLikeThis can now accept a custom Similarity (Grant Ingersoll) 16. LUCENE-1298: MoreLikeThis can now accept a custom Similarity (Grant Ingersoll)
17. LUCENE-1297: Allow other string distance measures for the SpellChecker
(Thomas Morton via Otis Gospodnetic)
Optimizations Optimizations
1. LUCENE-705: When building a compound file, use 1. LUCENE-705: When building a compound file, use

View File

@ -0,0 +1,112 @@
package org.apache.lucene.search.spell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
public class JaroWinklerDistance implements StringDistance {
private float threshold = 0.7f;
private int[] matches(String s1, String s2) {
String max, min;
if (s1.length() > s2.length()) {
max = s1;
min = s2;
} else {
max = s2;
min = s1;
}
int range = Math.max(max.length() / 2 - 1, 0);
int[] matchIndexes = new int[min.length()];
Arrays.fill(matchIndexes, -1);
boolean[] matchFlags = new boolean[max.length()];
int matches = 0;
for (int mi = 0; mi < min.length(); mi++) {
char c1 = min.charAt(mi);
for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max
.length()); xi < xn; xi++) {
if (!matchFlags[xi] && c1 == max.charAt(xi)) {
matchIndexes[mi] = xi;
matchFlags[xi] = true;
matches++;
break;
}
}
}
char[] ms1 = new char[matches];
char[] ms2 = new char[matches];
for (int i = 0, si = 0; i < min.length(); i++) {
if (matchIndexes[i] != -1) {
ms1[si] = min.charAt(i);
si++;
}
}
for (int i = 0, si = 0; i < max.length(); i++) {
if (matchFlags[i]) {
ms2[si] = max.charAt(i);
si++;
}
}
int transpositions = 0;
for (int mi = 0; mi < ms1.length; mi++) {
if (ms1[mi] != ms2[mi]) {
transpositions++;
}
}
int prefix = 0;
for (int mi = 0; mi < min.length(); mi++) {
if (s1.charAt(mi) == s2.charAt(mi)) {
prefix++;
} else {
break;
}
}
return new int[] { matches, transpositions / 2, prefix, max.length() };
}
public float getDistance(String s1, String s2) {
int[] mtp = matches(s1, s2);
float m = (float) mtp[0];
if (m == 0) {
return 0f;
}
float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
* (1 - j);
return jw;
}
/**
* Sets the threshold used to determine when Winkler bonus should be used.
* Set to a negative value to get the Jaro distance.
* @param threshold the new value of the threshold
*/
public void setThreshold(float threshold) {
this.threshold = threshold;
}
/**
* Returns the current value of the threshold used for adding the Winkler bonus.
* The deafult value is 0.7.
* @return the current value of the threshold
*/
public float getThreshold() {
return threshold;
}
}

View File

@ -76,6 +76,8 @@ public class SpellChecker {
// minimum score for hits generated by the spell checker query // minimum score for hits generated by the spell checker query
private float minScore = 0.5f; private float minScore = 0.5f;
private StringDistance sd;
/** /**
* Use the given directory as a spell checker index. The directory * Use the given directory as a spell checker index. The directory
@ -84,10 +86,15 @@ public class SpellChecker {
* @param spellIndex * @param spellIndex
* @throws IOException * @throws IOException
*/ */
public SpellChecker(Directory spellIndex) throws IOException { public SpellChecker(Directory spellIndex,StringDistance sd) throws IOException {
this.setSpellIndex(spellIndex); this.setSpellIndex(spellIndex);
this.setStringDistance(sd);
} }
public SpellChecker(Directory spellIndex) throws IOException {
this(spellIndex,new TRStringDistance());
}
/** /**
* Use a different index as the spell checker index or re-open * Use a different index as the spell checker index or re-open
* the existing index if <code>spellIndex</code> is the same value * the existing index if <code>spellIndex</code> is the same value
@ -108,6 +115,11 @@ public class SpellChecker {
} }
searcher = new IndexSearcher(this.spellIndex); searcher = new IndexSearcher(this.spellIndex);
} }
public void setStringDistance(StringDistance sd) {
this.sd = sd;
}
/** /**
* Sets the accuracy 0 &lt; minScore &lt; 1; default 0.5 * Sets the accuracy 0 &lt; minScore &lt; 1; default 0.5
@ -163,7 +175,6 @@ public class SpellChecker {
String field, boolean morePopular) throws IOException { String field, boolean morePopular) throws IOException {
float min = this.minScore; float min = this.minScore;
final TRStringDistance sd = new TRStringDistance(word);
final int lengthWord = word.length(); final int lengthWord = word.length();
final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
@ -217,9 +228,8 @@ public class SpellChecker {
continue; continue;
} }
// edit distance/normalize with the minScore word length // edit distance
sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math sugWord.score = sd.getDistance(word,sugWord.string);
.min(sugWord.string.length(), lengthWord));
if (sugWord.score < min) { if (sugWord.score < min) {
continue; continue;
} }

View File

@ -0,0 +1,35 @@
package org.apache.lucene.search.spell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Interface for string distances.
 * Despite the name, implementations report a similarity score: higher values
 * mean the two strings are more alike.
 */
public interface StringDistance {
/**
 * Returns a float between 0 and 1 based on how similar the specified strings are to one another.
 * Returning a value of 1 means the specified strings are identical and 0 means the
 * strings are maximally different.
 * @param s1 The first string.
 * @param s2 The second string.
 * @return a float between 0 and 1 based on how similar the specified strings are to one another.
 */
public float getDistance(String s1,String s2);
}

View File

@ -18,34 +18,28 @@ package org.apache.lucene.search.spell;
*/ */
/** /**
* Edit distance class. * Levenshtein edit distance class.
* Note: this class is not thread-safe.
*/ */
final class TRStringDistance { final class TRStringDistance implements StringDistance {
final char[] sa;
final int n;
int p[]; //'previous' cost array, horizontally
int d[]; // cost array, horizontally
int _d[]; //placeholder to assist in swapping p and d
/** /**
* Optimized to run a bit faster than the static getDistance(). * Optimized to run a bit faster than the static getDistance().
* In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster. * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
*/ */
public TRStringDistance (String target) { public TRStringDistance () {
sa = target.toCharArray();
n = sa.length;
p = new int[n+1]; //'previous' cost array, horizontally
d = new int[n+1]; // cost array, horizontally
} }
//***************************** //*****************************
// Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String) // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
//***************************** //*****************************
public final int getDistance (String other) { public float getDistance (String target, String other) {
char[] sa;
int n;
int p[]; //'previous' cost array, horizontally
int d[]; // cost array, horizontally
int _d[]; //placeholder to assist in swapping p and d
/* /*
The difference between this impl. and the previous is that, rather The difference between this impl. and the previous is that, rather
than creating and retaining a matrix of size s.length()+1 by t.length()+1, than creating and retaining a matrix of size s.length()+1 by t.length()+1,
@ -63,12 +57,17 @@ final class TRStringDistance {
cause an out of memory condition when calculating the LD over two very large strings. cause an out of memory condition when calculating the LD over two very large strings.
*/ */
sa = target.toCharArray();
n = sa.length;
p = new int[n+1];
d = new int[n+1];
final int m = other.length(); final int m = other.length();
if (n == 0) { if (n == 0) {
return m; return 1;
} else if (m == 0) { } else if (m == 0) {
return n; return 1;
} }
// indexes into strings s and t // indexes into strings s and t
@ -101,7 +100,7 @@ final class TRStringDistance {
// our last action in the above loop was to switch d and p, so p now // our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts // actually has the most recent cost counts
return p[n]; return 1.0f - ((float) p[n] / Math.min(other.length(), sa.length));
} }
} }

View File

@ -0,0 +1,49 @@
package org.apache.lucene.search.spell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
/**
 * Unit test for {@link JaroWinklerDistance}: checks exact scores for identical
 * and fully dissimilar strings, score ranges for a set of reference pairs, and
 * the relative ordering of competing suggestions.
 */
public class TestJaroWinklerDistance extends TestCase {

  private StringDistance sd = new JaroWinklerDistance();

  public void testGetDistance() {
    assertEquals("identical strings", 1.0f, sd.getDistance("al", "al"), 0.0f);

    assertScoreBetween("martha", "marhta", 0.961, 0.962);
    assertScoreBetween("jones", "johnson", 0.832, 0.833);
    assertScoreBetween("abcvwxyz", "cabvwxyz", 0.958, 0.959);
    assertScoreBetween("dwayne", "duane", 0.84, 0.841);
    assertScoreBetween("dixon", "dicksonx", 0.813, 0.814);

    assertEquals("no matching characters", 0f, sd.getDistance("fvie", "ten"), 0.0f);

    assertScoresHigher("zac ephron", "zac efron", "kai ephron");
    assertScoresHigher("brittney spears", "britney spears", "brittney startzman");
  }

  /**
   * Asserts that the score of the pair lies strictly between the two bounds,
   * reporting the actual score on failure.
   */
  private void assertScoreBetween(String s1, String s2, double low, double high) {
    float d = sd.getDistance(s1, s2);
    assertTrue("getDistance(\"" + s1 + "\", \"" + s2 + "\") was " + d
        + ", expected a value strictly between " + low + " and " + high,
        d > low && d < high);
  }

  /**
   * Asserts that <code>better</code> scores strictly higher against
   * <code>word</code> than <code>worse</code> does.
   */
  private void assertScoresHigher(String word, String better, String worse) {
    float d1 = sd.getDistance(word, better);
    float d2 = sd.getDistance(word, worse);
    assertTrue("expected \"" + better + "\" (" + d1 + ") to score higher than \""
        + worse + "\" (" + d2 + ") against \"" + word + "\"", d1 > d2);
  }
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.search.spell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
/**
 * Unit test for the Levenshtein based {@link StringDistance}
 * ({@link TRStringDistance}): checks exact scores for identical and fully
 * dissimilar strings, score ranges for a set of reference pairs, and the
 * relative ordering of competing suggestions.
 */
public class TestLevenshteinDistance extends TestCase {

  private StringDistance sd = new TRStringDistance();

  public void testGetDistance() {
    assertEquals("identical strings", 1.0f, sd.getDistance("al", "al"), 0.0f);

    assertScoreBetween("martha", "marhta", 0.66, 0.67);
    assertScoreBetween("jones", "johnson", 0.199, 0.201);
    assertScoreBetween("abcvwxyz", "cabvwxyz", 0.749, 0.751);
    assertScoreBetween("dwayne", "duane", 0.599, 0.601);
    assertScoreBetween("dixon", "dicksonx", 0.199, 0.201);

    assertEquals("every character differs", 0f, sd.getDistance("six", "ten"), 0.0f);

    // For this measure "kai ephron" is expected to outrank "zac efron".
    assertScoresHigher("zac ephron", "kai ephron", "zac efron");
    assertScoresHigher("brittney spears", "britney spears", "brittney startzman");
  }

  /**
   * Asserts that the score of the pair lies strictly between the two bounds,
   * reporting the actual score on failure.
   */
  private void assertScoreBetween(String s1, String s2, double low, double high) {
    float d = sd.getDistance(s1, s2);
    assertTrue("getDistance(\"" + s1 + "\", \"" + s2 + "\") was " + d
        + ", expected a value strictly between " + low + " and " + high,
        d > low && d < high);
  }

  /**
   * Asserts that <code>better</code> scores strictly higher against
   * <code>word</code> than <code>worse</code> does.
   */
  private void assertScoresHigher(String word, String better, String worse) {
    float higher = sd.getDistance(word, better);
    float lower = sd.getDistance(word, worse);
    assertTrue("expected \"" + better + "\" (" + higher + ") to score higher than \""
        + worse + "\" (" + lower + ") against \"" + word + "\"", higher > lower);
  }
}

View File

@ -43,7 +43,7 @@ public class TestSpellChecker extends TestCase {
protected void setUp() throws Exception { protected void setUp() throws Exception {
super.setUp(); super.setUp();
//create a user index //create a user index
userindex = new RAMDirectory(); userindex = new RAMDirectory();
IndexWriter writer = new IndexWriter(userindex, new SimpleAnalyzer(), true); IndexWriter writer = new IndexWriter(userindex, new SimpleAnalyzer(), true);
@ -75,6 +75,46 @@ public class TestSpellChecker extends TestCase {
assertEquals(num_field2, num_field1 + 1); assertEquals(num_field2, num_field1 + 1);
checkCommonSuggestions(r);
checkLevenshteinSuggestions(r);
spellChecker.setStringDistance(new JaroWinklerDistance());
spellChecker.setAccuracy(0.8f);
checkCommonSuggestions(r);
checkJaroWinklerSuggestions();
}
/**
 * Checks the suggestions that must hold for every string distance this test
 * configures on the spell checker: close misspellings of "five" suggest
 * "five", a word is never suggested for itself, and suggestions can be
 * restricted to the terms of a single field.
 *
 * @param r reader over the user index, used for the field restricted lookups
 * @throws IOException if the spell checker fails to read its index
 */
private void checkCommonSuggestions(IndexReader r) throws IOException {
  String[] similar = spellChecker.suggestSimilar("fvie", 2);
  assertTrue(similar.length > 0);
  assertEquals("five", similar[0]);

  similar = spellChecker.suggestSimilar("five", 2);
  if (similar.length > 0) {
    assertFalse(similar[0].equals("five")); // don't suggest a word for itself
  }

  similar = spellChecker.suggestSimilar("fiv", 2);
  assertTrue(similar.length > 0);
  assertEquals("five", similar[0]);

  similar = spellChecker.suggestSimilar("fives", 2);
  assertTrue(similar.length > 0);
  assertEquals("five", similar[0]);

  // The length check must follow the lookup it guards; it previously ran
  // before the "fie" query and therefore re-checked the "fives" result.
  similar = spellChecker.suggestSimilar("fie", 2);
  assertTrue(similar.length > 0);
  assertEquals("five", similar[0]);

  // test restraint to a field
  similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
  assertEquals(0, similar.length); // there isn't the term thousand in the field field1

  similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
  assertEquals(1, similar.length); // there is the term thousand in the field field2
}
private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
// test small word // test small word
String[] similar = spellChecker.suggestSimilar("fvie", 2); String[] similar = spellChecker.suggestSimilar("fvie", 2);
assertEquals(1, similar.length); assertEquals(1, similar.length);
@ -109,14 +149,21 @@ public class TestSpellChecker extends TestCase {
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false); similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
assertEquals(1, similar.length); // there is the term thousand in the field field2 assertEquals(1, similar.length); // there is the term thousand in the field field2
similar = spellChecker.suggestSimilar("onety", 2);
assertEquals(1, similar.length);
assertEquals(similar[0], "ninety");
try { try {
similar = spellChecker.suggestSimilar("tousand", 10, r, null, false); similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
} catch (NullPointerException e) { } catch (NullPointerException e) {
assertTrue("threw an NPE, and it shouldn't have", false); assertTrue("threw an NPE, and it shouldn't have", false);
} }
}
private void checkJaroWinklerSuggestions() throws IOException {
String[] similar = spellChecker.suggestSimilar("onety", 2);
assertEquals(2, similar.length);
assertEquals(similar[0], "one");
} }
private void addwords(IndexReader r, String field) throws IOException { private void addwords(IndexReader r, String field) throws IOException {