From 103b64a373256feae6ca85f2bf220e7694e48fa4 Mon Sep 17 00:00:00 2001 From: yufcuy Date: Tue, 13 Sep 2016 17:38:00 +0800 Subject: [PATCH] LANG-1277: StringUtils#getLevenshteinDistance reduce memory consumption (closes #189) --- .../org/apache/commons/lang3/StringUtils.java | 48 +++++-------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index d92604d9b..d06d60ceb 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -7737,15 +7737,11 @@ public static String getCommonPrefix(final String... strs) { * insertion or substitution).

* *

The previous implementation of the Levenshtein distance algorithm - * was from - * https://web.archive.org/web/20120604192456/http://www.merriampark.com/ld.htm

- * - *

Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.
- * This implementation of the Levenshtein distance algorithm - * is from + * was from * https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm

* + *

This implementation only needs one single-dimensional array of length s.length() + 1

+ * *
      * StringUtils.getLevenshteinDistance(null, *)             = IllegalArgumentException
      * StringUtils.getLevenshteinDistance(*, null)             = IllegalArgumentException
@@ -7773,20 +7769,8 @@ public static int getLevenshteinDistance(CharSequence s, CharSequence t) {
         }
 
         /*
-           The difference between this impl. and the previous is that, rather
-           than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
-           we maintain two single-dimensional arrays of length s.length() + 1.  The first, d,
-           is the 'current working' distance array that maintains the newest distance cost
-           counts as we iterate through the characters of String s.  Each time we increment
-           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
-           allows us to retain the previous cost counts as required by the algorithm (taking
-           the minimum of the cost count to the left, up one, and diagonally up and to the left
-           of the current cost count being calculated).  (Note that the arrays aren't really
-           copied anymore, just switched...this is clearly much better than cloning an array
-           or doing a System.arraycopy() each time  through the outer loop.)
-
-           Effectively, the difference between the two implementations is this one does not
-           cause an out of memory condition when calculating the LD over two very large strings.
+           This implementation uses two variables to record the previous cost counts,
+           so it uses less memory than the previous implementation.
          */
 
         int n = s.length(); // length of s
@@ -7807,16 +7791,14 @@ allows us to retain the previous cost counts as required by the algorithm (takin
             m = t.length();
         }
 
-        int p[] = new int[n + 1]; //'previous' cost array, horizontally
-        int d[] = new int[n + 1]; // cost array, horizontally
-        int _d[]; //placeholder to assist in swapping p and d
-
+        int p[] = new int[n + 1];
         // indexes into strings s and t
         int i; // iterates through s
         int j; // iterates through t
+        int upper_left;
+        int upper;
 
         char t_j; // jth character of t
-
         int cost; // cost
 
         for (i = 0; i <= n; i++) {
@@ -7824,23 +7806,19 @@ allows us to retain the previous cost counts as required by the algorithm (takin
         }
 
         for (j = 1; j <= m; j++) {
+        	upper_left = p[0];
             t_j = t.charAt(j - 1);
-            d[0] = j;
+            p[0] = j;
 
             for (i = 1; i <= n; i++) {
+            	upper = p[i];
                 cost = s.charAt(i - 1) == t_j ? 0 : 1;
                 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
-                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
+                p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upper_left + cost);
+                upper_left = upper;
             }
-
-            // copy current distance counts to 'previous row' distance counts
-            _d = p;
-            p = d;
-            d = _d;
         }
 
-        // our last action in the above loop was to switch d and p, so p now
-        // actually has the most recent cost counts
         return p[n];
     }