From 0467966f4bf88af42b3e948e84761dcc560adf33 Mon Sep 17 00:00:00 2001 From: Matthew Jason Benson Date: Thu, 16 Jun 2011 15:52:09 +0000 Subject: [PATCH] [LANG-684] Levenshtein Distance Within a Given Threshold; submitted by Eli Lindsey git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1136496 13f79535-47bb-0310-9956-ffa450edef68 --- pom.xml | 3 + .../org/apache/commons/lang3/StringUtils.java | 163 ++++++++++++++++++ .../apache/commons/lang3/StringUtilsTest.java | 79 +++++++++ 3 files changed, 245 insertions(+) diff --git a/pom.xml b/pom.xml index e133a7c40..184e779ee 100644 --- a/pom.xml +++ b/pom.xml @@ -287,6 +287,9 @@ Rafal Krzewski + + Eli Lindsey + Craig R. McClanahan diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index b66b91edd..64491d42c 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -19,6 +19,7 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Locale; @@ -6137,6 +6138,168 @@ allows us to retain the previous cost counts as required by the algorithm (takin return p[n]; } + /** + *

Find the Levenshtein distance between two Strings if it's less than or equal to a given + * threshold.

+ * + *

This is the number of changes needed to change one String into + * another, where each change is a single character modification (deletion, + * insertion or substitution).

+ * + *

This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield + * and Chas Emerick's implementation of the Levenshtein distance algorithm from + * http://www.merriampark.com/ld.htm

+ * + *
+     * StringUtils.getLevenshteinDistance(null, *, *)             = IllegalArgumentException
+     * StringUtils.getLevenshteinDistance(*, null, *)             = IllegalArgumentException
+     * StringUtils.getLevenshteinDistance(*, *, -1)               = IllegalArgumentException
+     * StringUtils.getLevenshteinDistance("","", 0)               = 0
+     * StringUtils.getLevenshteinDistance("aaapppp", "", 8)       = 7
+     * StringUtils.getLevenshteinDistance("aaapppp", "", 7)       = 7
+     * StringUtils.getLevenshteinDistance("aaapppp", "", 6))      = -1
+     * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
+     * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
+     * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
+     * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
+     * 
+ * + * @param s the first String, must not be null + * @param t the second String, must not be null + * @param threshold the target threshold, must not be negative + * @return result distance, or -1 if the distance would be greater than the threshold + * @throws IllegalArgumentException if either String input {@code null} or negative threshold + */ + public static int getLevenshteinDistance(CharSequence s, CharSequence t, int threshold) { + if(s == null || t == null) { + throw new IllegalArgumentException("String must not be null"); + } + if(threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + + /* + This implementation only computes the distance if it's less than or equal to the + threshold value, returning -1 if it's greater. The advantage is performance: unbounded + distance is O(nm), but a bound of k allows us to reduce it to O(km) time by only + computing a diagonal stripe of width 2k+1 of the cost table. + It is also possible to use this to compute the unbounded Levenshtein distance by starting + the threshold at 1 and doubling each time until the distance is found; this is O(dm), where + d is the distance. + + One subtlety comes from needing to ignore entries on the border of our stripe + eg. + p[] = |#|#|#|* + d[] = *|#|#|#| + We must ignore the entry to the left of the leftmost member + We must ignore the entry above the rightmost member + + Another subtlety comes from our stripe running off the matrix if the strings aren't + of the same size. Since string s is always swapped to be the shorter of the two, + the stripe will always run off to the upper right instead of the lower left of the matrix. + + As a concrete example, suppose s is of length 5, t is of length 7, and our threshold is 1. + In this case we're going to walk a stripe of length 3. The matrix would look like so: + + 1 2 3 4 5 + 1 |#|#| | | | + 2 |#|#|#| | | + 3 | |#|#|#| | + 4 | | |#|#|#| + 5 | | | |#|#| + 6 | | | | |#| + 7 | | | | | | + + Note how the stripe leads off the table as there is no possible way to turn a string of length 5 + into one of length 7 in edit distance of 1. + + Additionally, this implementation decreases memory usage by using two + single-dimensional arrays and swapping them back and forth instead of allocating + an entire n by m matrix. This requires a few minor changes, such as immediately returning + when it's detected that the stripe has run off the matrix and initially filling the arrays with + large values so that entries we don't compute are ignored. + + See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion. + */ + + int n = s.length(); // length of s + int m = t.length(); // length of t + + // if one string is empty, the edit distance is necessarily the length of the other + if(n == 0) { + return m <= threshold? m : -1; + } else if(m == 0) { + return n <= threshold? n : -1; + } + + if(n > m) { + // swap the two strings to consume less memory + CharSequence tmp = s; + s = t; + t = tmp; + n = m; + m = t.length(); + } + + int p[] = new int[n+1]; // 'previous' cost array, horizontally + int d[] = new int[n+1]; // cost array, horizontally + int _d[]; // placeholder to assist in swapping p and d + + // fill in starting table values + int boundary = Math.min(n, threshold) + 1; + for(int i = 0; i < boundary; i++) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); + Arrays.fill(d, Integer.MAX_VALUE); + + // iterates through t + for(int j = 1; j <= m; j++) { + char t_j = t.charAt(j-1); // jth character of t + d[0] = j; + + // compute stripe indices, constrain to array size + int min = Math.max(1, j - threshold); + int max = Math.min(n, j + threshold); + + // the stripe may lead off of the table if s and t are of different sizes + if(min > max) { + return -1; + } + + // ignore entry left of leftmost + if(min > 1) { + d[min-1] = Integer.MAX_VALUE; + } + + // iterates through [min, max] in s + for(int i = min; i <= max; i++) { + if(s.charAt(i-1) == t_j) { + // diagonally left and up + d[i] = p[i-1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally left and up + d[i] = 1 + Math.min(Math.min(d[i-1], p[i]), p[i-1]); + } + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // if p[n] is greater than the threshold, there's no guarantee on it being the correct + // distance + if(p[n] <= threshold) { + return p[n]; + } else { + return -1; + } + } + // startsWith //----------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index e8823dc5c..aaa02d5eb 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -1671,6 +1671,85 @@ public void testGetLevenshteinDistance_StringString() { } } + public void testGetLevenshteinDistance_StringStringInt() { + // empty strings + assertEquals(0, StringUtils.getLevenshteinDistance("", "", 0)); + assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 8)); + assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 7)); + assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "", 6)); + + // unequal strings, zero threshold + assertEquals(-1, StringUtils.getLevenshteinDistance("b", "a", 0)); + assertEquals(-1, StringUtils.getLevenshteinDistance("a", "b", 0)); + + // equal strings + assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 0)); + assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 2)); + + // same length + assertEquals(-1, StringUtils.getLevenshteinDistance("aaa", "bbb", 2)); + assertEquals(3, StringUtils.getLevenshteinDistance("aaa", "bbb", 3)); + + // big stripe + assertEquals(6, StringUtils.getLevenshteinDistance("aaaaaa", "b", 10)); + + // distance less than threshold + assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 8)); + assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 4)); + + // distance equal to threshold + assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 7)); + assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 3)); + + // distance greater than threshold + assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 2)); + assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 2)); + assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "b", 6)); + + // stripe runs off array, strings not similar + assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 1)); + assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 1)); + + // stripe runs off array, strings are similar + assertEquals(-1, StringUtils.getLevenshteinDistance("12345", "1234567", 1)); + assertEquals(-1, StringUtils.getLevenshteinDistance("1234567", "12345", 1)); + + // old getLevenshteinDistance test cases + assertEquals(1, StringUtils.getLevenshteinDistance("frog", "fog",1) ); + assertEquals(3, StringUtils.getLevenshteinDistance("fly", "ant",3) ); + assertEquals(7, StringUtils.getLevenshteinDistance("elephant", "hippo",7) ); + assertEquals(-1, StringUtils.getLevenshteinDistance("elephant", "hippo",6) ); + assertEquals(7, StringUtils.getLevenshteinDistance("hippo", "elephant",7) ); + assertEquals(-1, StringUtils.getLevenshteinDistance("hippo", "elephant",6) ); + assertEquals(8, StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz",8) ); + assertEquals(8, StringUtils.getLevenshteinDistance("zzzzzzzz", "hippo",8) ); + assertEquals(1, StringUtils.getLevenshteinDistance("hello", "hallo",1) ); + + // exceptions + try { + @SuppressWarnings("unused") + int d = StringUtils.getLevenshteinDistance("a", null, 0); + fail("expecting IllegalArgumentException"); + } catch (IllegalArgumentException ex) { + // empty + } + try { + @SuppressWarnings("unused") + int d = StringUtils.getLevenshteinDistance(null, "a", 0); + fail("expecting IllegalArgumentException"); + } catch (IllegalArgumentException ex) { + // empty + } + + try { + @SuppressWarnings("unused") + int d = StringUtils.getLevenshteinDistance("a", "a", -1); + fail("expecting IllegalArgumentException"); + } catch (IllegalArgumentException ex) { + // empty + } + } + /** * A sanity check for {@link StringUtils#EMPTY}. */