LANG-1277: StringUtils#getLevenshteinDistance reduce memory consumption (closes #189)
This commit is contained in:
parent
3433a94e25
commit
103b64a373
|
@ -7737,15 +7737,11 @@ public class StringUtils {
|
||||||
* insertion or substitution).</p>
|
* insertion or substitution).</p>
|
||||||
*
|
*
|
||||||
* <p>The previous implementation of the Levenshtein distance algorithm
|
* <p>The previous implementation of the Levenshtein distance algorithm
|
||||||
* was from <a href="https://web.archive.org/web/20120604192456/http://www.merriampark.com/ld.htm">
|
* was from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm">
|
||||||
* https://web.archive.org/web/20120604192456/http://www.merriampark.com/ld.htm</a></p>
|
|
||||||
*
|
|
||||||
* <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
|
|
||||||
* which can occur when my Java implementation is used with very large strings.<br>
|
|
||||||
* This implementation of the Levenshtein distance algorithm
|
|
||||||
* is from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm">
|
|
||||||
* https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p>
|
* https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p>
|
||||||
*
|
*
|
||||||
|
* <p>This implementation only needs one single-dimensional array of length s.length() + 1.</p>
|
||||||
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException
|
* StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException
|
||||||
* StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException
|
* StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException
|
||||||
|
@ -7773,20 +7769,8 @@ public class StringUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
The difference between this impl. and the previous is that, rather
|
This implementation uses two variables to record the previous cost counts,
|
||||||
than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
|
so this implementation uses less memory than the previous implementation.
|
||||||
we maintain two single-dimensional arrays of length s.length() + 1. The first, d,
|
|
||||||
is the 'current working' distance array that maintains the newest distance cost
|
|
||||||
counts as we iterate through the characters of String s. Each time we increment
|
|
||||||
the index of String t we are comparing, d is copied to p, the second int[]. Doing so
|
|
||||||
allows us to retain the previous cost counts as required by the algorithm (taking
|
|
||||||
the minimum of the cost count to the left, up one, and diagonally up and to the left
|
|
||||||
of the current cost count being calculated). (Note that the arrays aren't really
|
|
||||||
copied anymore, just switched...this is clearly much better than cloning an array
|
|
||||||
or doing a System.arraycopy() each time through the outer loop.)
|
|
||||||
|
|
||||||
Effectively, the difference between the two implementations is this one does not
|
|
||||||
cause an out of memory condition when calculating the LD over two very large strings.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int n = s.length(); // length of s
|
int n = s.length(); // length of s
|
||||||
|
@ -7807,16 +7791,14 @@ public class StringUtils {
|
||||||
m = t.length();
|
m = t.length();
|
||||||
}
|
}
|
||||||
|
|
||||||
int p[] = new int[n + 1]; //'previous' cost array, horizontally
|
int p[] = new int[n + 1];
|
||||||
int d[] = new int[n + 1]; // cost array, horizontally
|
|
||||||
int _d[]; //placeholder to assist in swapping p and d
|
|
||||||
|
|
||||||
// indexes into strings s and t
|
// indexes into strings s and t
|
||||||
int i; // iterates through s
|
int i; // iterates through s
|
||||||
int j; // iterates through t
|
int j; // iterates through t
|
||||||
|
int upper_left;
|
||||||
|
int upper;
|
||||||
|
|
||||||
char t_j; // jth character of t
|
char t_j; // jth character of t
|
||||||
|
|
||||||
int cost; // cost
|
int cost; // cost
|
||||||
|
|
||||||
for (i = 0; i <= n; i++) {
|
for (i = 0; i <= n; i++) {
|
||||||
|
@ -7824,23 +7806,19 @@ public class StringUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (j = 1; j <= m; j++) {
|
for (j = 1; j <= m; j++) {
|
||||||
|
upper_left = p[0];
|
||||||
t_j = t.charAt(j - 1);
|
t_j = t.charAt(j - 1);
|
||||||
d[0] = j;
|
p[0] = j;
|
||||||
|
|
||||||
for (i = 1; i <= n; i++) {
|
for (i = 1; i <= n; i++) {
|
||||||
|
upper = p[i];
|
||||||
cost = s.charAt(i - 1) == t_j ? 0 : 1;
|
cost = s.charAt(i - 1) == t_j ? 0 : 1;
|
||||||
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
|
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
|
||||||
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
|
p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upper_left + cost);
|
||||||
|
upper_left = upper;
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy current distance counts to 'previous row' distance counts
|
|
||||||
_d = p;
|
|
||||||
p = d;
|
|
||||||
d = _d;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// our last action in the above loop was to switch d and p, so p now
|
|
||||||
// actually has the most recent cost counts
|
|
||||||
return p[n];
|
return p[n];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue