LANG-1199: Fix implementation of StringUtils.getJaroWinklerDistance()

Replace current implementation with Jaro-Winkler distance implementation taken from Apache Lucene.
This commit is contained in:
pascalschumacher 2016-05-20 21:18:47 +02:00
parent 43a9bab8c0
commit c35a041022
3 changed files with 68 additions and 118 deletions

View File

@ -46,6 +46,7 @@ The <action> type attribute can be add,update,fix,remove.
<body> <body>
<release version="3.5" date="tba" description="tba"> <release version="3.5" date="tba" description="tba">
<action issue="LANG-1199" type="fix" dev="pschumacher" due-to="M. Steiger">Fix implementation of StringUtils.getJaroWinklerDistance()</action>
<action issue="LANG-1244" type="fix" dev="pschumacher" due-to="jjbankert">Fix dead links in StringUtils.getLevenshteinDistance() javadoc</action> <action issue="LANG-1244" type="fix" dev="pschumacher" due-to="jjbankert">Fix dead links in StringUtils.getLevenshteinDistance() javadoc</action>
<action issue="LANG-1242" type="fix" dev="pschumacher" due-to="Neal Stewart">"\u2284":"&nsub;" mapping missing from EntityArrays#HTML40_EXTENDED_ESCAPE</action> <action issue="LANG-1242" type="fix" dev="pschumacher" due-to="Neal Stewart">"\u2284":"&nsub;" mapping missing from EntityArrays#HTML40_EXTENDED_ESCAPE</action>
<action issue="LANG-1243" type="update" dev="sebb">Simplify ArrayUtils removeElements by using new decrementAndGet() method</action> <action issue="LANG-1243" type="update" dev="sebb">Simplify ArrayUtils removeElements by using new decrementAndGet() method</action>

View File

@ -7679,10 +7679,10 @@ else if (Math.abs(n - m) > threshold) {
* StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44 * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
* StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0 * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
* StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88 * StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
* StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91 * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.93
* StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.93 * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
* StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94 * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
* StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9 * StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
* </pre> * </pre>
* *
* @param first the first String, must not be null * @param first the first String, must not be null
@ -7698,62 +7698,70 @@ public static double getJaroWinklerDistance(final CharSequence first, final Char
throw new IllegalArgumentException("Strings must not be null"); throw new IllegalArgumentException("Strings must not be null");
} }
final double jaro = score(first,second); int[] mtp = matches(first, second);
final int cl = commonPrefixLength(first, second); double m = mtp[0];
final double matchScore = Math.round((jaro + (DEFAULT_SCALING_FACTOR * cl * (1.0 - jaro))) *100.0)/100.0; if (m == 0) {
return 0D;
return matchScore; }
double j = ((m / first.length() + m / second.length() + (m - mtp[1]) / m)) / 3;
double jw = j < 0.7D ? j : j + Math.min(DEFAULT_SCALING_FACTOR, 1D / mtp[3]) * mtp[2] * (1D - j);
return Math.round(jw * 100.0D) / 100.0D;
} }
/** private static int[] matches(final CharSequence first, final CharSequence second) {
* This method returns the Jaro-Winkler score for string matching. CharSequence max, min;
* @param first the first string to be matched
* @param second the second string to be matched
* @return matching score without scaling factor impact
*/
private static double score(final CharSequence first, final CharSequence second) {
String shorter;
String longer;
// Determine which String is longer.
if (first.length() > second.length()) { if (first.length() > second.length()) {
longer = first.toString().toLowerCase(); max = first;
shorter = second.toString().toLowerCase(); min = second;
} else { } else {
longer = second.toString().toLowerCase(); max = second;
shorter = first.toString().toLowerCase(); min = first;
} }
int range = Math.max(max.length() / 2 - 1, 0);
// Calculate the half length() distance of the shorter String. int[] matchIndexes = new int[min.length()];
final int halflength = shorter.length() / 2 + 1; Arrays.fill(matchIndexes, -1);
boolean[] matchFlags = new boolean[max.length()];
// Find the set of matching characters between the shorter and longer strings. Note that int matches = 0;
// the set of matching characters may be different depending on the order of the strings. for (int mi = 0; mi < min.length(); mi++) {
final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength); char c1 = min.charAt(mi);
final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength); for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
if (!matchFlags[xi] && c1 == max.charAt(xi)) {
// If one or both of the sets of common characters is empty, then matchIndexes[mi] = xi;
// there is no similarity between the two strings. matchFlags[xi] = true;
if (m1.length() == 0 || m2.length() == 0) { matches++;
return 0.0; break;
}
}
} }
char[] ms1 = new char[matches];
// If the set of common characters is not the same size, then char[] ms2 = new char[matches];
// there is no similarity between the two strings, either. for (int i = 0, si = 0; i < min.length(); i++) {
if (m1.length() != m2.length()) { if (matchIndexes[i] != -1) {
return 0.0; ms1[si] = min.charAt(i);
si++;
}
} }
for (int i = 0, si = 0; i < max.length(); i++) {
// Calculate the number of transposition between the two sets if (matchFlags[i]) {
// of common characters. ms2[si] = max.charAt(i);
final int transpositions = transpositions(m1, m2); si++;
}
// Calculate the distance. }
final double dist = int transpositions = 0;
(m1.length() / ((double)shorter.length()) + for (int mi = 0; mi < ms1.length; mi++) {
m2.length() / ((double)longer.length()) + if (ms1[mi] != ms2[mi]) {
(m1.length() - transpositions) / ((double)m1.length())) / 3.0; transpositions++;
return dist; }
}
int prefix = 0;
for (int mi = 0; mi < min.length(); mi++) {
if (first.charAt(mi) == second.charAt(mi)) {
prefix++;
} else {
break;
}
}
return new int[] { matches, transpositions / 2, prefix, max.length() };
} }
/** /**
@ -7835,67 +7843,6 @@ public static int getFuzzyDistance(final CharSequence term, final CharSequence q
return score; return score;
} }
/**
 * Collects the characters of {@code first} that have a partner in {@code second} close to the same position.
 *
 * <p>For the character at index {@code i} of {@code first}, the positions of {@code second} from
 * {@code max(0, i - limit)} (inclusive) up to {@code min(i + limit, second.length())} (exclusive) are
 * scanned from left to right. The first position holding the same character counts as a match; that
 * position is then overwritten with {@code '*'} in a working copy of {@code second} so that it is not
 * matched again by a later character.</p>
 *
 * <p>Note: because {@code '*'} is used as the "already used" marker, a literal {@code '*'} in
 * {@code first} also matches positions that were consumed earlier.</p>
 *
 * @param first The string whose characters are looked up, in order.
 * @param second The string that is searched for partners; it is not modified.
 * @param limit The size of the search window around each index.
 * @return A string containing the matched characters of {@code first}, in their original order.
 */
private static String getSetOfMatchingCharacterWithin(final CharSequence first, final CharSequence second, final int limit) {
    // Working copy of the second string; matched positions get blanked out with '*'.
    final StringBuilder remaining = new StringBuilder(second);
    final StringBuilder matched = new StringBuilder();
    final int firstLength = first.length();
    for (int pos = 0; pos < firstLength; pos++) {
        final char current = first.charAt(pos);
        final int windowStart = Math.max(0, pos - limit);
        final int windowEnd = Math.min(pos + limit, second.length());
        // Advance to the first position in the window that still holds the wanted character.
        int candidate = windowStart;
        while (candidate < windowEnd && remaining.charAt(candidate) != current) {
            candidate++;
        }
        if (candidate < windowEnd) {
            matched.append(current);
            remaining.setCharAt(candidate, '*');
        }
    }
    return matched.toString();
}
/**
 * Counts the transpositions between two strings.
 *
 * <p>The strings are compared index by index over the length of {@code first}; the number of
 * positions at which the characters differ is halved (integer division) to give the result.</p>
 *
 * @param first The first string; its length determines how many positions are compared.
 * @param second The second string; must be at least as long as {@code first}.
 * @return Half the number of differing positions, rounded down.
 */
private static int transpositions(final CharSequence first, final CharSequence second) {
    final int length = first.length();
    int mismatches = 0;
    int index = 0;
    while (index < length) {
        if (first.charAt(index) != second.charAt(index)) {
            mismatches++;
        }
        index++;
    }
    // Two mismatching positions make up one transposition.
    return mismatches / 2;
}
/**
 * Determines the length of the prefix shared by both strings, capped at four (4) characters.
 *
 * <p>Both arguments are converted with {@code toString()} and handed to {@code getCommonPrefix};
 * the length of the prefix it returns is limited to 4.</p>
 *
 * @param first The first string.
 * @param second The second string.
 * @return The length of the common prefix, never more than 4.
 */
private static int commonPrefixLength(final CharSequence first, final CharSequence second) {
    final String sharedPrefix = getCommonPrefix(first.toString(), second.toString());
    // Only the first four characters of the shared prefix are counted.
    return Math.min(sharedPrefix.length(), 4);
}
// startsWith // startsWith
//----------------------------------------------------------------------- //-----------------------------------------------------------------------

View File

@ -2182,10 +2182,12 @@ public void testGetJaroWinklerDistance_StringString() {
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("frog", "fog"), 0.0d); assertEquals(0.93d, StringUtils.getJaroWinklerDistance("frog", "fog"), 0.0d);
assertEquals(0.0d, StringUtils.getJaroWinklerDistance("fly", "ant"), 0.0d); assertEquals(0.0d, StringUtils.getJaroWinklerDistance("fly", "ant"), 0.0d);
assertEquals(0.44d, StringUtils.getJaroWinklerDistance("elephant", "hippo"), 0.0d); assertEquals(0.44d, StringUtils.getJaroWinklerDistance("elephant", "hippo"), 0.0d);
assertEquals(0.91d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d); assertEquals(0.84d, StringUtils.getJaroWinklerDistance("dwayne", "duane"), 0.0d);
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); assertEquals(0.93d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d);
assertEquals(0.94d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); assertEquals(0.95d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
assertEquals(0.9d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); assertEquals(0.92d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
assertEquals(0.88d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
assertEquals(0.63d, StringUtils.getJaroWinklerDistance("Haus Ingeborg", "Ingeborg Esser"), 0.0d);
} }
@Test(expected = IllegalArgumentException.class) @Test(expected = IllegalArgumentException.class)