LANG-1277: StringUtils#getLevenshteinDistance reduce memory consumption (closes #189)
This commit is contained in:
parent
3433a94e25
commit
103b64a373
|
@ -7737,15 +7737,11 @@ public class StringUtils {
|
|||
* insertion or substitution).</p>
|
||||
*
|
||||
* <p>The previous implementation of the Levenshtein distance algorithm
|
||||
* was from <a href="https://web.archive.org/web/20120604192456/http://www.merriampark.com/ld.htm">
|
||||
* https://web.archive.org/web/20120604192456/http://www.merriampark.com/ld.htm</a></p>
|
||||
*
|
||||
* <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
|
||||
* which can occur when my Java implementation is used with very large strings.<br>
|
||||
* This implementation of the Levenshtein distance algorithm
|
||||
* is from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm">
|
||||
* was from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm">
|
||||
* https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p>
|
||||
*
|
||||
* <p>This implementation only needs one single-dimensional array of length s.length() + 1</p>
|
||||
*
|
||||
* <pre>
|
||||
* StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException
|
||||
* StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException
|
||||
|
@ -7773,20 +7769,8 @@ public class StringUtils {
|
|||
}
|
||||
|
||||
/*
|
||||
The difference between this impl. and the previous is that, rather
|
||||
than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
|
||||
we maintain two single-dimensional arrays of length s.length() + 1. The first, d,
|
||||
is the 'current working' distance array that maintains the newest distance cost
|
||||
counts as we iterate through the characters of String s. Each time we increment
|
||||
the index of String t we are comparing, d is copied to p, the second int[]. Doing so
|
||||
allows us to retain the previous cost counts as required by the algorithm (taking
|
||||
the minimum of the cost count to the left, up one, and diagonally up and to the left
|
||||
of the current cost count being calculated). (Note that the arrays aren't really
|
||||
copied anymore, just switched...this is clearly much better than cloning an array
|
||||
or doing a System.arraycopy() each time through the outer loop.)
|
||||
|
||||
Effectively, the difference between the two implementations is this one does not
|
||||
cause an out of memory condition when calculating the LD over two very large strings.
|
||||
This implementation uses two variables to record the previous cost counts,
|
||||
so this implementation uses less memory than the previous impl.
|
||||
*/
|
||||
|
||||
int n = s.length(); // length of s
|
||||
|
@ -7807,16 +7791,14 @@ public class StringUtils {
|
|||
m = t.length();
|
||||
}
|
||||
|
||||
int p[] = new int[n + 1]; //'previous' cost array, horizontally
|
||||
int d[] = new int[n + 1]; // cost array, horizontally
|
||||
int _d[]; //placeholder to assist in swapping p and d
|
||||
|
||||
int p[] = new int[n + 1];
|
||||
// indexes into strings s and t
|
||||
int i; // iterates through s
|
||||
int j; // iterates through t
|
||||
int upper_left;
|
||||
int upper;
|
||||
|
||||
char t_j; // jth character of t
|
||||
|
||||
int cost; // cost
|
||||
|
||||
for (i = 0; i <= n; i++) {
|
||||
|
@ -7824,23 +7806,19 @@ public class StringUtils {
|
|||
}
|
||||
|
||||
for (j = 1; j <= m; j++) {
|
||||
upper_left = p[0];
|
||||
t_j = t.charAt(j - 1);
|
||||
d[0] = j;
|
||||
p[0] = j;
|
||||
|
||||
for (i = 1; i <= n; i++) {
|
||||
upper = p[i];
|
||||
cost = s.charAt(i - 1) == t_j ? 0 : 1;
|
||||
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
|
||||
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
|
||||
p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upper_left + cost);
|
||||
upper_left = upper;
|
||||
}
|
||||
|
||||
// copy current distance counts to 'previous row' distance counts
|
||||
_d = p;
|
||||
p = d;
|
||||
d = _d;
|
||||
}
|
||||
|
||||
// our last action in the above loop was to switch d and p, so p now
|
||||
// actually has the most recent cost counts
|
||||
return p[n];
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue