mirror of https://github.com/apache/lucene.git
LUCENE-1297 - Allow other string distance measures for the SpellChecker
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@669085 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4adc3f7ba6
commit
ce19aa28c0
|
@ -186,6 +186,9 @@ New features
|
|||
|
||||
16. LUCENE-1298: MoreLikeThis can now accept a custom Similarity (Grant Ingersoll)
|
||||
|
||||
17. LUCENE-1297: Allow other string distance measures for the SpellChecker
|
||||
(Thomas Morton via Otis Gospodnetic)
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-705: When building a compound file, use
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class JaroWinklerDistance implements StringDistance {
|
||||
|
||||
private float threshold = 0.7f;
|
||||
|
||||
private int[] matches(String s1, String s2) {
|
||||
String max, min;
|
||||
if (s1.length() > s2.length()) {
|
||||
max = s1;
|
||||
min = s2;
|
||||
} else {
|
||||
max = s2;
|
||||
min = s1;
|
||||
}
|
||||
int range = Math.max(max.length() / 2 - 1, 0);
|
||||
int[] matchIndexes = new int[min.length()];
|
||||
Arrays.fill(matchIndexes, -1);
|
||||
boolean[] matchFlags = new boolean[max.length()];
|
||||
int matches = 0;
|
||||
for (int mi = 0; mi < min.length(); mi++) {
|
||||
char c1 = min.charAt(mi);
|
||||
for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max
|
||||
.length()); xi < xn; xi++) {
|
||||
if (!matchFlags[xi] && c1 == max.charAt(xi)) {
|
||||
matchIndexes[mi] = xi;
|
||||
matchFlags[xi] = true;
|
||||
matches++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
char[] ms1 = new char[matches];
|
||||
char[] ms2 = new char[matches];
|
||||
for (int i = 0, si = 0; i < min.length(); i++) {
|
||||
if (matchIndexes[i] != -1) {
|
||||
ms1[si] = min.charAt(i);
|
||||
si++;
|
||||
}
|
||||
}
|
||||
for (int i = 0, si = 0; i < max.length(); i++) {
|
||||
if (matchFlags[i]) {
|
||||
ms2[si] = max.charAt(i);
|
||||
si++;
|
||||
}
|
||||
}
|
||||
int transpositions = 0;
|
||||
for (int mi = 0; mi < ms1.length; mi++) {
|
||||
if (ms1[mi] != ms2[mi]) {
|
||||
transpositions++;
|
||||
}
|
||||
}
|
||||
int prefix = 0;
|
||||
for (int mi = 0; mi < min.length(); mi++) {
|
||||
if (s1.charAt(mi) == s2.charAt(mi)) {
|
||||
prefix++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return new int[] { matches, transpositions / 2, prefix, max.length() };
|
||||
}
|
||||
|
||||
public float getDistance(String s1, String s2) {
|
||||
int[] mtp = matches(s1, s2);
|
||||
float m = (float) mtp[0];
|
||||
if (m == 0) {
|
||||
return 0f;
|
||||
}
|
||||
float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
|
||||
float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
|
||||
* (1 - j);
|
||||
return jw;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the threshold used to determine when Winkler bonus should be used.
|
||||
* Set to a negative value to get the Jaro distance.
|
||||
* @param threshold the new value of the threshold
|
||||
*/
|
||||
public void setThreshold(float threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current value of the threshold used for adding the Winkler bonus.
|
||||
* The deafult value is 0.7.
|
||||
* @return the current value of the threshold
|
||||
*/
|
||||
public float getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
}
|
|
@ -77,6 +77,8 @@ public class SpellChecker {
|
|||
// minimum score for hits generated by the spell checker query
|
||||
private float minScore = 0.5f;
|
||||
|
||||
private StringDistance sd;
|
||||
|
||||
/**
|
||||
* Use the given directory as a spell checker index. The directory
|
||||
* is created if it doesn't exist yet.
|
||||
|
@ -84,8 +86,13 @@ public class SpellChecker {
|
|||
* @param spellIndex
|
||||
* @throws IOException
|
||||
*/
|
||||
public SpellChecker(Directory spellIndex) throws IOException {
|
||||
public SpellChecker(Directory spellIndex,StringDistance sd) throws IOException {
|
||||
this.setSpellIndex(spellIndex);
|
||||
this.setStringDistance(sd);
|
||||
}
|
||||
|
||||
public SpellChecker(Directory spellIndex) throws IOException {
|
||||
this(spellIndex,new TRStringDistance());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -109,6 +116,11 @@ public class SpellChecker {
|
|||
searcher = new IndexSearcher(this.spellIndex);
|
||||
}
|
||||
|
||||
public void setStringDistance(StringDistance sd) {
|
||||
this.sd = sd;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Sets the accuracy 0 < minScore < 1; default 0.5
|
||||
*/
|
||||
|
@ -163,7 +175,6 @@ public class SpellChecker {
|
|||
String field, boolean morePopular) throws IOException {
|
||||
|
||||
float min = this.minScore;
|
||||
final TRStringDistance sd = new TRStringDistance(word);
|
||||
final int lengthWord = word.length();
|
||||
|
||||
final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
|
||||
|
@ -217,9 +228,8 @@ public class SpellChecker {
|
|||
continue;
|
||||
}
|
||||
|
||||
// edit distance/normalize with the minScore word length
|
||||
sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math
|
||||
.min(sugWord.string.length(), lengthWord));
|
||||
// edit distance
|
||||
sugWord.score = sd.getDistance(word,sugWord.string);
|
||||
if (sugWord.score < min) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Interface for string distances.
|
||||
*/
|
||||
public interface StringDistance {
|
||||
|
||||
/**
|
||||
* Returns a float between 0 and 1 based on how similar the specified strings are to one another.
|
||||
* Returning a value of 0 means the specified strings are identical and 1 means the
|
||||
* string are maximally different.
|
||||
* @param s1 The first string.
|
||||
* @param s2 The second string.
|
||||
* @return a float between 0 and 1 based on how similar the specified strings are to one another.
|
||||
*/
|
||||
public float getDistance(String s1,String s2);
|
||||
|
||||
}
|
|
@ -18,34 +18,28 @@ package org.apache.lucene.search.spell;
|
|||
*/
|
||||
|
||||
/**
|
||||
* Edit distance class.
|
||||
* Note: this class is not thread-safe.
|
||||
* Levenshtein edit distance class.
|
||||
*/
|
||||
final class TRStringDistance {
|
||||
|
||||
final char[] sa;
|
||||
final int n;
|
||||
int p[]; //'previous' cost array, horizontally
|
||||
int d[]; // cost array, horizontally
|
||||
int _d[]; //placeholder to assist in swapping p and d
|
||||
|
||||
final class TRStringDistance implements StringDistance {
|
||||
|
||||
/**
|
||||
* Optimized to run a bit faster than the static getDistance().
|
||||
* In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
|
||||
*/
|
||||
public TRStringDistance (String target) {
|
||||
sa = target.toCharArray();
|
||||
n = sa.length;
|
||||
p = new int[n+1]; //'previous' cost array, horizontally
|
||||
d = new int[n+1]; // cost array, horizontally
|
||||
public TRStringDistance () {
|
||||
}
|
||||
|
||||
|
||||
//*****************************
|
||||
// Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
|
||||
//*****************************
|
||||
public final int getDistance (String other) {
|
||||
public float getDistance (String target, String other) {
|
||||
char[] sa;
|
||||
int n;
|
||||
int p[]; //'previous' cost array, horizontally
|
||||
int d[]; // cost array, horizontally
|
||||
int _d[]; //placeholder to assist in swapping p and d
|
||||
|
||||
/*
|
||||
The difference between this impl. and the previous is that, rather
|
||||
than creating and retaining a matrix of size s.length()+1 by t.length()+1,
|
||||
|
@ -63,12 +57,17 @@ final class TRStringDistance {
|
|||
cause an out of memory condition when calculating the LD over two very large strings.
|
||||
*/
|
||||
|
||||
sa = target.toCharArray();
|
||||
n = sa.length;
|
||||
p = new int[n+1];
|
||||
d = new int[n+1];
|
||||
|
||||
final int m = other.length();
|
||||
|
||||
if (n == 0) {
|
||||
return m;
|
||||
return 1;
|
||||
} else if (m == 0) {
|
||||
return n;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// indexes into strings s and t
|
||||
|
@ -101,7 +100,7 @@ final class TRStringDistance {
|
|||
|
||||
// our last action in the above loop was to switch d and p, so p now
|
||||
// actually has the most recent cost counts
|
||||
return p[n];
|
||||
return 1.0f - ((float) p[n] / Math.min(other.length(), sa.length));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestJaroWinklerDistance extends TestCase {
|
||||
|
||||
private StringDistance sd = new JaroWinklerDistance();
|
||||
|
||||
public void testGetDistance() {
|
||||
float d = sd.getDistance("al", "al");
|
||||
assertTrue(d == 1.0f);
|
||||
d = sd.getDistance("martha", "marhta");
|
||||
assertTrue(d > 0.961 && d <0.962);
|
||||
d = sd.getDistance("jones", "johnson");
|
||||
assertTrue(d > 0.832 && d < 0.833);
|
||||
d = sd.getDistance("abcvwxyz", "cabvwxyz");
|
||||
assertTrue(d > 0.958 && d < 0.959);
|
||||
d = sd.getDistance("dwayne", "duane");
|
||||
assertTrue(d > 0.84 && d < 0.841);
|
||||
d = sd.getDistance("dixon", "dicksonx");
|
||||
assertTrue(d > 0.813 && d < 0.814);
|
||||
d = sd.getDistance("fvie", "ten");
|
||||
assertTrue(d == 0f);
|
||||
float d1 = sd.getDistance("zac ephron", "zac efron");
|
||||
float d2 = sd.getDistance("zac ephron", "kai ephron");
|
||||
assertTrue(d1 > d2);
|
||||
d1 = sd.getDistance("brittney spears", "britney spears");
|
||||
d2 = sd.getDistance("brittney spears", "brittney startzman");
|
||||
assertTrue(d1 > d2);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestLevenshteinDistance extends TestCase {
|
||||
|
||||
private StringDistance sd = new TRStringDistance();
|
||||
|
||||
public void testGetDistance() {
|
||||
float d = sd.getDistance("al", "al");
|
||||
assertTrue(d == 1.0f);
|
||||
d = sd.getDistance("martha", "marhta");
|
||||
assertTrue(d > 0.66 && d <0.67);
|
||||
d = sd.getDistance("jones", "johnson");
|
||||
assertTrue(d > 0.199 && d < 0.201);
|
||||
d = sd.getDistance("abcvwxyz", "cabvwxyz");
|
||||
assertTrue(d > 0.749 && d < 0.751);
|
||||
d = sd.getDistance("dwayne", "duane");
|
||||
assertTrue(d > 0.599 && d < 0.601);
|
||||
d = sd.getDistance("dixon", "dicksonx");
|
||||
assertTrue(d > 0.199 && d < 0.201);
|
||||
d = sd.getDistance("six", "ten");
|
||||
assertTrue(d == 0f);
|
||||
float d1 = sd.getDistance("zac ephron", "zac efron");
|
||||
float d2 = sd.getDistance("zac ephron", "kai ephron");
|
||||
assertTrue(d1 < d2);
|
||||
d1 = sd.getDistance("brittney spears", "britney spears");
|
||||
d2 = sd.getDistance("brittney spears", "brittney startzman");
|
||||
assertTrue(d1 > d2);
|
||||
}
|
||||
|
||||
}
|
|
@ -75,6 +75,46 @@ public class TestSpellChecker extends TestCase {
|
|||
|
||||
assertEquals(num_field2, num_field1 + 1);
|
||||
|
||||
checkCommonSuggestions(r);
|
||||
checkLevenshteinSuggestions(r);
|
||||
|
||||
spellChecker.setStringDistance(new JaroWinklerDistance());
|
||||
spellChecker.setAccuracy(0.8f);
|
||||
checkCommonSuggestions(r);
|
||||
checkJaroWinklerSuggestions();
|
||||
}
|
||||
|
||||
private void checkCommonSuggestions(IndexReader r) throws IOException {
|
||||
String[] similar = spellChecker.suggestSimilar("fvie", 2);
|
||||
assertTrue(similar.length > 0);
|
||||
assertEquals(similar[0], "five");
|
||||
|
||||
similar = spellChecker.suggestSimilar("five", 2);
|
||||
if (similar.length > 0) {
|
||||
assertFalse(similar[0].equals("five")); // don't suggest a word for itself
|
||||
}
|
||||
|
||||
similar = spellChecker.suggestSimilar("fiv", 2);
|
||||
assertTrue(similar.length > 0);
|
||||
assertEquals(similar[0], "five");
|
||||
|
||||
similar = spellChecker.suggestSimilar("fives", 2);
|
||||
assertTrue(similar.length > 0);
|
||||
assertEquals(similar[0], "five");
|
||||
|
||||
assertTrue(similar.length > 0);
|
||||
similar = spellChecker.suggestSimilar("fie", 2);
|
||||
assertEquals(similar[0], "five");
|
||||
|
||||
// test restraint to a field
|
||||
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
|
||||
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
|
||||
|
||||
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
|
||||
assertEquals(1, similar.length); // there is the term thousand in the field field2
|
||||
}
|
||||
|
||||
private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
|
||||
// test small word
|
||||
String[] similar = spellChecker.suggestSimilar("fvie", 2);
|
||||
assertEquals(1, similar.length);
|
||||
|
@ -110,13 +150,20 @@ public class TestSpellChecker extends TestCase {
|
|||
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
|
||||
assertEquals(1, similar.length); // there is the term thousand in the field field2
|
||||
|
||||
similar = spellChecker.suggestSimilar("onety", 2);
|
||||
assertEquals(1, similar.length);
|
||||
assertEquals(similar[0], "ninety");
|
||||
try {
|
||||
similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
|
||||
} catch (NullPointerException e) {
|
||||
assertTrue("threw an NPE, and it shouldn't have", false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void checkJaroWinklerSuggestions() throws IOException {
|
||||
String[] similar = spellChecker.suggestSimilar("onety", 2);
|
||||
assertEquals(2, similar.length);
|
||||
assertEquals(similar[0], "one");
|
||||
}
|
||||
|
||||
private void addwords(IndexReader r, String field) throws IOException {
|
||||
|
|
Loading…
Reference in New Issue