LANG-1199: Fix implementation of StringUtils.getJaroWinklerDistance()
Replace current implementation with Jaro-Winkler distance implementation taken from Apache Lucene.
This commit is contained in:
parent
43a9bab8c0
commit
c35a041022
|
@ -46,6 +46,7 @@ The <action> type attribute can be add,update,fix,remove.
|
|||
<body>
|
||||
|
||||
<release version="3.5" date="tba" description="tba">
|
||||
<action issue="LANG-1199" type="fix" dev="pschumacher" due-to="M. Steiger">Fix implementation of StringUtils.getJaroWinklerDistance()</action>
|
||||
<action issue="LANG-1244" type="fix" dev="pschumacher" due-to="jjbankert">Fix dead links in StringUtils.getLevenshteinDistance() javadoc</action>
|
||||
<action issue="LANG-1242" type="fix" dev="pschumacher" due-to="Neal Stewart">"\u2284":"⊄" mapping missing from EntityArrays#HTML40_EXTENDED_ESCAPE</action>
|
||||
<action issue="LANG-1243" type="update" dev="sebb">Simplify ArrayUtils removeElements by using new decrementAndGet() method</action>
|
||||
|
|
|
@ -7679,10 +7679,10 @@ public class StringUtils {
|
|||
* StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
|
||||
* StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
|
||||
* StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
|
||||
* StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91
|
||||
* StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93
|
||||
* StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94
|
||||
* StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9
|
||||
* StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.93
|
||||
* StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95
|
||||
* StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
|
||||
* StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
|
||||
* </pre>
|
||||
*
|
||||
* @param first the first String, must not be null
|
||||
|
@ -7698,62 +7698,70 @@ public class StringUtils {
|
|||
throw new IllegalArgumentException("Strings must not be null");
|
||||
}
|
||||
|
||||
final double jaro = score(first,second);
|
||||
final int cl = commonPrefixLength(first, second);
|
||||
final double matchScore = Math.round((jaro + (DEFAULT_SCALING_FACTOR * cl * (1.0 - jaro))) *100.0)/100.0;
|
||||
|
||||
return matchScore;
|
||||
int[] mtp = matches(first, second);
|
||||
double m = mtp[0];
|
||||
if (m == 0) {
|
||||
return 0D;
|
||||
}
|
||||
double j = ((m / first.length() + m / second.length() + (m - mtp[1]) / m)) / 3;
|
||||
double jw = j < 0.7D ? j : j + Math.min(DEFAULT_SCALING_FACTOR, 1D / mtp[3]) * mtp[2] * (1D - j);
|
||||
return Math.round(jw * 100.0D) / 100.0D;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method returns the Jaro score for string matching, i.e. the Jaro-Winkler score before the common-prefix scaling is applied.
|
||||
* @param first the first string to be matched
|
||||
* @param second the second string to be matched
|
||||
* @return matching score without scaling factor impact
|
||||
*/
|
||||
private static double score(final CharSequence first, final CharSequence second) {
|
||||
String shorter;
|
||||
String longer;
|
||||
|
||||
// Determine which String is longer.
|
||||
private static int[] matches(final CharSequence first, final CharSequence second) {
|
||||
CharSequence max, min;
|
||||
if (first.length() > second.length()) {
|
||||
longer = first.toString().toLowerCase();
|
||||
shorter = second.toString().toLowerCase();
|
||||
max = first;
|
||||
min = second;
|
||||
} else {
|
||||
longer = second.toString().toLowerCase();
|
||||
shorter = first.toString().toLowerCase();
|
||||
max = second;
|
||||
min = first;
|
||||
}
|
||||
|
||||
// Calculate the half length() distance of the shorter String.
|
||||
final int halflength = shorter.length() / 2 + 1;
|
||||
|
||||
// Find the set of matching characters between the shorter and longer strings. Note that
|
||||
// the set of matching characters may be different depending on the order of the strings.
|
||||
final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength);
|
||||
final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength);
|
||||
|
||||
// If one or both of the sets of common characters is empty, then
|
||||
// there is no similarity between the two strings.
|
||||
if (m1.length() == 0 || m2.length() == 0) {
|
||||
return 0.0;
|
||||
int range = Math.max(max.length() / 2 - 1, 0);
|
||||
int[] matchIndexes = new int[min.length()];
|
||||
Arrays.fill(matchIndexes, -1);
|
||||
boolean[] matchFlags = new boolean[max.length()];
|
||||
int matches = 0;
|
||||
for (int mi = 0; mi < min.length(); mi++) {
|
||||
char c1 = min.charAt(mi);
|
||||
for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
|
||||
if (!matchFlags[xi] && c1 == max.charAt(xi)) {
|
||||
matchIndexes[mi] = xi;
|
||||
matchFlags[xi] = true;
|
||||
matches++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the set of common characters is not the same size, then
|
||||
// there is no similarity between the two strings, either.
|
||||
if (m1.length() != m2.length()) {
|
||||
return 0.0;
|
||||
char[] ms1 = new char[matches];
|
||||
char[] ms2 = new char[matches];
|
||||
for (int i = 0, si = 0; i < min.length(); i++) {
|
||||
if (matchIndexes[i] != -1) {
|
||||
ms1[si] = min.charAt(i);
|
||||
si++;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate the number of transpositions between the two sets
|
||||
// of common characters.
|
||||
final int transpositions = transpositions(m1, m2);
|
||||
|
||||
// Calculate the distance.
|
||||
final double dist =
|
||||
(m1.length() / ((double)shorter.length()) +
|
||||
m2.length() / ((double)longer.length()) +
|
||||
(m1.length() - transpositions) / ((double)m1.length())) / 3.0;
|
||||
return dist;
|
||||
for (int i = 0, si = 0; i < max.length(); i++) {
|
||||
if (matchFlags[i]) {
|
||||
ms2[si] = max.charAt(i);
|
||||
si++;
|
||||
}
|
||||
}
|
||||
int transpositions = 0;
|
||||
for (int mi = 0; mi < ms1.length; mi++) {
|
||||
if (ms1[mi] != ms2[mi]) {
|
||||
transpositions++;
|
||||
}
|
||||
}
|
||||
int prefix = 0;
|
||||
for (int mi = 0; mi < min.length(); mi++) {
|
||||
if (first.charAt(mi) == second.charAt(mi)) {
|
||||
prefix++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return new int[] { matches, transpositions / 2, prefix, max.length() };
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -7835,67 +7843,6 @@ public class StringUtils {
|
|||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a set of matching characters between two strings.
|
||||
*
|
||||
* <p>Two characters from the first string and the second string are considered matching if the characters'
|
||||
* respective positions are no farther than the limit value.</p>
|
||||
*
|
||||
* @param first The first string.
|
||||
* @param second The second string.
|
||||
* @param limit The maximum distance to consider.
|
||||
* @return A string containing the set of common characters.
|
||||
*/
|
||||
private static String getSetOfMatchingCharacterWithin(final CharSequence first, final CharSequence second, final int limit) {
|
||||
final StringBuilder common = new StringBuilder();
|
||||
final StringBuilder copy = new StringBuilder(second);
|
||||
|
||||
for (int i = 0; i < first.length(); i++) {
|
||||
final char ch = first.charAt(i);
|
||||
boolean found = false;
|
||||
|
||||
// See if the character is within the limit positions away from the original position of that character.
|
||||
for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++) {
|
||||
if (copy.charAt(j) == ch) {
|
||||
found = true;
|
||||
common.append(ch);
|
||||
copy.setCharAt(j,'*');
|
||||
}
|
||||
}
|
||||
}
|
||||
return common.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of transpositions between two strings.
|
||||
* @param first The first string.
|
||||
* @param second The second string.
|
||||
* @return The number of transpositions between the two strings.
|
||||
*/
|
||||
private static int transpositions(final CharSequence first, final CharSequence second) {
|
||||
int transpositions = 0;
|
||||
for (int i = 0; i < first.length(); i++) {
|
||||
if (first.charAt(i) != second.charAt(i)) {
|
||||
transpositions++;
|
||||
}
|
||||
}
|
||||
return transpositions / 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of characters from the beginning of the strings that match exactly one-to-one,
|
||||
* up to a maximum of four (4) characters.
|
||||
* @param first The first string.
|
||||
* @param second The second string.
|
||||
* @return A number between 0 and 4.
|
||||
*/
|
||||
private static int commonPrefixLength(final CharSequence first, final CharSequence second) {
|
||||
final int result = getCommonPrefix(first.toString(), second.toString()).length();
|
||||
|
||||
// Limit the result to 4.
|
||||
return result > 4 ? 4 : result;
|
||||
}
|
||||
|
||||
// startsWith
|
||||
//-----------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -2182,10 +2182,12 @@ public class StringUtilsTest {
|
|||
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("frog", "fog"), 0.0d);
|
||||
assertEquals(0.0d, StringUtils.getJaroWinklerDistance("fly", "ant"), 0.0d);
|
||||
assertEquals(0.44d, StringUtils.getJaroWinklerDistance("elephant", "hippo"), 0.0d);
|
||||
assertEquals(0.91d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d);
|
||||
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
|
||||
assertEquals(0.94d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
|
||||
assertEquals(0.9d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
|
||||
assertEquals(0.84d, StringUtils.getJaroWinklerDistance("dwayne", "duane"), 0.0d);
|
||||
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d);
|
||||
assertEquals(0.95d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
|
||||
assertEquals(0.92d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
|
||||
assertEquals(0.88d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
|
||||
assertEquals(0.63d, StringUtils.getJaroWinklerDistance("Haus Ingeborg", "Ingeborg Esser"), 0.0d);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
|
|
Loading…
Reference in New Issue