[LANG-684] Levenshtein Distance Within a Given Threshold; submitted by Eli Lindsey

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1136496 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Matthew Jason Benson 2011-06-16 15:52:09 +00:00
parent 734d218eb5
commit 0467966f4b
3 changed files with 245 additions and 0 deletions

View File

@ -287,6 +287,9 @@
<contributor>
<name>Rafal Krzewski</name>
</contributor>
<contributor>
<name>Eli Lindsey</name>
</contributor>
<contributor>
<name>Craig R. McClanahan</name>
</contributor>

View File

@ -19,6 +19,7 @@
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
@ -6137,6 +6138,168 @@ allows us to retain the previous cost counts as required by the algorithm (takin
return p[n];
}
/**
* <p>Find the Levenshtein distance between two Strings if it's less than or equal to a given
* threshold.</p>
*
* <p>This is the number of changes needed to change one String into
* another, where each change is a single character modification (deletion,
* insertion or substitution).</p>
*
* <p>This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield
* and Chas Emerick's implementation of the Levenshtein distance algorithm from
* <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
*
* <pre>
* StringUtils.getLevenshteinDistance(null, *, *) = IllegalArgumentException
* StringUtils.getLevenshteinDistance(*, null, *) = IllegalArgumentException
* StringUtils.getLevenshteinDistance(*, *, -1) = IllegalArgumentException
* StringUtils.getLevenshteinDistance("","", 0) = 0
* StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7
* StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7
* StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1
* StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
* StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
* StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
* StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
* </pre>
*
* @param s the first String, must not be null
* @param t the second String, must not be null
* @param threshold the target threshold, must not be negative
* @return result distance, or -1 if the distance would be greater than the threshold
* @throws IllegalArgumentException if either String input {@code null} or negative threshold
*/
public static int getLevenshteinDistance(CharSequence s, CharSequence t, int threshold) {
if(s == null || t == null) {
throw new IllegalArgumentException("String must not be null");
}
if(threshold < 0) {
throw new IllegalArgumentException("Threshold must not be negative");
}
/*
This implementation only computes the distance if it's less than or equal to the
threshold value, returning -1 if it's greater. The advantage is performance: unbounded
distance is O(nm), but a bound of k allows us to reduce it to O(km) time by only
computing a diagonal stripe of width 2k+1 of the cost table.
It is also possible to use this to compute the unbounded Levenshtein distance by starting
the threshold at 1 and doubling each time until the distance is found; this is O(dm), where
d is the distance.
One subtlety comes from needing to ignore entries on the border of our stripe
eg.
p[] = |#|#|#|*
d[] = *|#|#|#|
We must ignore the entry to the left of the leftmost member
We must ignore the entry above the rightmost member
Another subtlety comes from our stripe running off the matrix if the strings aren't
of the same size. Since string s is always swapped to be the shorter of the two,
the stripe will always run off to the upper right instead of the lower left of the matrix.
As a concrete example, suppose s is of length 5, t is of length 7, and our threshold is 1.
In this case we're going to walk a stripe of length 3. The matrix would look like so:
1 2 3 4 5
1 |#|#| | | |
2 |#|#|#| | |
3 | |#|#|#| |
4 | | |#|#|#|
5 | | | |#|#|
6 | | | | |#|
7 | | | | | |
Note how the stripe leads off the table as there is no possible way to turn a string of length 5
into one of length 7 in edit distance of 1.
Additionally, this implementation decreases memory usage by using two
single-dimensional arrays and swapping them back and forth instead of allocating
an entire n by m matrix. This requires a few minor changes, such as immediately returning
when it's detected that the stripe has run off the matrix and initially filling the arrays with
large values so that entries we don't compute are ignored.
See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion.
*/
int n = s.length(); // length of s
int m = t.length(); // length of t
// if one string is empty, the edit distance is necessarily the length of the other
if(n == 0) {
return m <= threshold? m : -1;
} else if(m == 0) {
return n <= threshold? n : -1;
}
if(n > m) {
// swap the two strings to consume less memory
CharSequence tmp = s;
s = t;
t = tmp;
n = m;
m = t.length();
}
int p[] = new int[n+1]; // 'previous' cost array, horizontally
int d[] = new int[n+1]; // cost array, horizontally
int _d[]; // placeholder to assist in swapping p and d
// fill in starting table values
int boundary = Math.min(n, threshold) + 1;
for(int i = 0; i < boundary; i++) {
p[i] = i;
}
// these fills ensure that the value above the rightmost entry of our
// stripe will be ignored in following loop iterations
Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
Arrays.fill(d, Integer.MAX_VALUE);
// iterates through t
for(int j = 1; j <= m; j++) {
char t_j = t.charAt(j-1); // jth character of t
d[0] = j;
// compute stripe indices, constrain to array size
int min = Math.max(1, j - threshold);
int max = Math.min(n, j + threshold);
// the stripe may lead off of the table if s and t are of different sizes
if(min > max) {
return -1;
}
// ignore entry left of leftmost
if(min > 1) {
d[min-1] = Integer.MAX_VALUE;
}
// iterates through [min, max] in s
for(int i = min; i <= max; i++) {
if(s.charAt(i-1) == t_j) {
// diagonally left and up
d[i] = p[i-1];
} else {
// 1 + minimum of cell to the left, to the top, diagonally left and up
d[i] = 1 + Math.min(Math.min(d[i-1], p[i]), p[i-1]);
}
}
// copy current distance counts to 'previous row' distance counts
_d = p;
p = d;
d = _d;
}
// if p[n] is greater than the threshold, there's no guarantee on it being the correct
// distance
if(p[n] <= threshold) {
return p[n];
} else {
return -1;
}
}
// startsWith
//-----------------------------------------------------------------------

View File

@ -1671,6 +1671,85 @@ public void testGetLevenshteinDistance_StringString() {
}
}
public void testGetLevenshteinDistance_StringStringInt() {
// empty strings
assertEquals(0, StringUtils.getLevenshteinDistance("", "", 0));
assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 8));
assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 7));
assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "", 6));
// unequal strings, zero threshold
assertEquals(-1, StringUtils.getLevenshteinDistance("b", "a", 0));
assertEquals(-1, StringUtils.getLevenshteinDistance("a", "b", 0));
// equal strings
assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 0));
assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 2));
// same length
assertEquals(-1, StringUtils.getLevenshteinDistance("aaa", "bbb", 2));
assertEquals(3, StringUtils.getLevenshteinDistance("aaa", "bbb", 3));
// big stripe
assertEquals(6, StringUtils.getLevenshteinDistance("aaaaaa", "b", 10));
// distance less than threshold
assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 8));
assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 4));
// distance equal to threshold
assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 7));
assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 3));
// distance greater than threshold
assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 2));
assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 2));
assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "b", 6));
// stripe runs off array, strings not similar
assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 1));
assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 1));
// stripe runs off array, strings are similar
assertEquals(-1, StringUtils.getLevenshteinDistance("12345", "1234567", 1));
assertEquals(-1, StringUtils.getLevenshteinDistance("1234567", "12345", 1));
// old getLevenshteinDistance test cases
assertEquals(1, StringUtils.getLevenshteinDistance("frog", "fog",1) );
assertEquals(3, StringUtils.getLevenshteinDistance("fly", "ant",3) );
assertEquals(7, StringUtils.getLevenshteinDistance("elephant", "hippo",7) );
assertEquals(-1, StringUtils.getLevenshteinDistance("elephant", "hippo",6) );
assertEquals(7, StringUtils.getLevenshteinDistance("hippo", "elephant",7) );
assertEquals(-1, StringUtils.getLevenshteinDistance("hippo", "elephant",6) );
assertEquals(8, StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz",8) );
assertEquals(8, StringUtils.getLevenshteinDistance("zzzzzzzz", "hippo",8) );
assertEquals(1, StringUtils.getLevenshteinDistance("hello", "hallo",1) );
// exceptions
try {
@SuppressWarnings("unused")
int d = StringUtils.getLevenshteinDistance("a", null, 0);
fail("expecting IllegalArgumentException");
} catch (IllegalArgumentException ex) {
// empty
}
try {
@SuppressWarnings("unused")
int d = StringUtils.getLevenshteinDistance(null, "a", 0);
fail("expecting IllegalArgumentException");
} catch (IllegalArgumentException ex) {
// empty
}
try {
@SuppressWarnings("unused")
int d = StringUtils.getLevenshteinDistance("a", "a", -1);
fail("expecting IllegalArgumentException");
} catch (IllegalArgumentException ex) {
// empty
}
}
/**
* A sanity check for {@link StringUtils#EMPTY}.
*/