LANG-944: Add the Jaro-Winkler string distance algorithm to StringUtils - partially applying the patch by Rekha Joshi as submitted for further refinement

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1560727 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Benedikt Ritter 2014-01-23 16:27:51 +00:00
parent 8cd2339a97
commit 7460a856f2
3 changed files with 239 additions and 0 deletions

View File

@ -22,6 +22,7 @@
<body>
<release version="3.3" date="TBA" description="Bugfix and Feature release">
<action issue="LANG-944" type="add" dev="britter" due-to="Rekha Joshi">Add the Jaro-Winkler string distance algorithm to StringUtils</action>
<action issue="LANG-936" type="fix" dev="bayard" due-to="Yaniv Kunda, Eli Lindsey">StringUtils.getLevenshteinDistance with too big of a threshold returns wrong result</action>
<action issue="LANG-943" type="fix" dev="kinow">Test DurationFormatUtilsTest.testEdgeDuration fails in JDK 1.6, 1.7 and 1.8, BRST time zone</action>
<action issue="LANG-613" type="fix" dev="mbenson">ConstructorUtils.getAccessibleConstructor() Does Not Check the Accessibility of Enclosing Classes</action>

View File

@ -17,6 +17,7 @@
package org.apache.commons.lang3;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Locale;
@ -6974,7 +6975,205 @@ public class StringUtils {
}
return -1;
}
/**
 * <p>Find the Jaro Winkler Distance which indicates the similarity score between two Strings.</p>
 *
 * <p>The Jaro measure is the weighted sum of percentage of matched characters from each String and
 * transposed characters. Winkler increased this measure for matching initial characters.</p>
 *
 * <p>The comparison is case-insensitive (both inputs are lower-cased with {@link Locale#ROOT}, so the
 * result does not depend on the default locale) and the result is rounded to two decimal places.
 * A higher value means the Strings are more similar.</p>
 *
 * <p>This implementation is based on the Jaro Winkler similarity algorithm
 * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a></p>
 *
 * <pre>
 * StringUtils.getJaroWinklerDistance(null, null)          = IllegalArgumentException
 * StringUtils.getJaroWinklerDistance("","")               = 0.0
 * StringUtils.getJaroWinklerDistance("","a")              = 0.0
 * StringUtils.getJaroWinklerDistance("aaapppp", "")       = 0.0
 * StringUtils.getJaroWinklerDistance("frog", "fog")       = 0.93
 * StringUtils.getJaroWinklerDistance("fly", "ant")        = 0.0
 * StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44
 * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
 * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
 * StringUtils.getJaroWinklerDistance("hello", "hallo")    = 0.88
 * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91
 * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.93
 * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94
 * StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA")    = 0.9
 * </pre>
 *
 * @param first the first String, must not be null
 * @param second the second String, must not be null
 * @return the similarity score, rounded to two decimal places
 * @throws IllegalArgumentException if either String input {@code null}
 */
public static double getJaroWinklerDistance(final CharSequence first, final CharSequence second) {
    // Winkler's weight for each character of the common prefix.
    final double scalingFactor = 0.1;

    if (first == null || second == null) {
        throw new IllegalArgumentException("Strings must not be null");
    }

    final double jaro = score(first, second);
    final int cl = commonPrefixLength(first, second);
    // Boost the Jaro score by the common prefix, then round to two decimal places.
    return Math.round((jaro + (scalingFactor * cl * (1.0 - jaro))) * 100.0) / 100.0;
}

/**
 * Computes the Jaro score of two Strings, ignoring case.
 *
 * @param first the first String, must not be null
 * @param second the second String, must not be null
 * @return matching score without scaling factor impact
 */
private static double score(final CharSequence first, final CharSequence second) {
    final String shorter;
    final String longer;

    // Determine which String is longer (judged on the lengths before lower-casing).
    if (first.length() > second.length()) {
        longer = first.toString().toLowerCase(Locale.ROOT);
        shorter = second.toString().toLowerCase(Locale.ROOT);
    } else {
        longer = second.toString().toLowerCase(Locale.ROOT);
        shorter = first.toString().toLowerCase(Locale.ROOT);
    }

    // Calculate the half length() distance of the shorter String.
    final int halflength = shorter.length() / 2 + 1;

    // Find the set of matching characters between the shorter and longer strings. Note that
    // the set of matching characters may be different depending on the order of the strings.
    final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength);
    final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength);

    // If one or both of the sets of common characters is empty, then
    // there is no similarity between the two strings.
    if (m1.length() == 0 || m2.length() == 0) {
        return 0.0;
    }

    // If the set of common characters is not the same size, then
    // there is no similarity between the two strings, either.
    if (m1.length() != m2.length()) {
        return 0.0;
    }

    // Calculate the number of transposition between the two sets
    // of common characters.
    final int transpositions = transpositions(m1, m2);

    // Calculate the distance.
    return (m1.length() / ((double) shorter.length()) +
            m2.length() / ((double) longer.length()) +
            (m1.length() - transpositions) / ((double) m1.length())) / 3.0;
}

/**
 * Gets a set of matching characters between two strings.
 *
 * <p>A character of {@code first} at index {@code i} matches the first not-yet-matched equal
 * character of {@code second} found at an index in the range
 * {@code [max(0, i - limit), min(i + limit, second.length()))}.</p>
 *
 * @param first The first string.
 * @param second The second string.
 * @param limit The maximum distance to consider.
 * @return A string containing the matched characters, in the order they occur in {@code first}.
 */
private static String getSetOfMatchingCharacterWithin(final CharSequence first, final CharSequence second,
        final int limit) {
    final StringBuilder common = new StringBuilder();
    // Working copy of second in which matched characters are blanked out so they match only once.
    final StringBuilder copy = new StringBuilder(second);

    for (int i = 0; i < first.length(); i++) {
        final char ch = first.charAt(i);
        final int end = Math.min(i + limit, second.length());

        // See if the character is within the limit positions away from the original position of that character.
        for (int j = Math.max(0, i - limit); j < end; j++) {
            if (copy.charAt(j) == ch) {
                common.append(ch);
                copy.setCharAt(j, '*');
                break;
            }
        }
    }
    return common.toString();
}

/**
 * Calculates the number of transposition between two strings.
 *
 * @param first The first string.
 * @param second The second string, must be at least as long as {@code first}.
 * @return The number of transposition between the two strings, i.e. half the number of
 *         positions at which they differ.
 */
private static int transpositions(final CharSequence first, final CharSequence second) {
    int mismatches = 0;
    for (int i = 0; i < first.length(); i++) {
        if (first.charAt(i) != second.charAt(i)) {
            mismatches++;
        }
    }
    return mismatches / 2;
}

/**
 * Calculates the number of characters from the beginning of the strings that match exactly one-to-one
 * (ignoring case), up to a maximum of four (4) characters.
 *
 * @param first The first string.
 * @param second The second string.
 * @return A number between 0 and 4.
 */
private static int commonPrefixLength(final CharSequence first, final CharSequence second) {
    final int maxPrefixLength = 4;
    final String a = first.toString().toLowerCase(Locale.ROOT);
    final String b = second.toString().toLowerCase(Locale.ROOT);

    // Bound the scan by the lengths AFTER lower-casing: lower-casing can change a String's
    // length, so the original lengths are not a safe bound for charAt().
    final int limit = Math.min(maxPrefixLength, Math.min(a.length(), b.length()));

    int result = 0;
    while (result < limit && a.charAt(result) == b.charAt(result)) {
        result++;
    }
    return result;
}
// startsWith
//-----------------------------------------------------------------------

View File

@ -1918,6 +1918,45 @@ public class StringUtilsTest {
// empty
}
}
@Test
public void testGetJaroWinklerDistance_StringString() {
    // Results are rounded to two decimal places, so exact comparison (delta 0.0) is intended.
    assertEquals(0.93d, StringUtils.getJaroWinklerDistance("frog", "fog"), 0.0d);
    assertEquals(0.0d, StringUtils.getJaroWinklerDistance("fly", "ant"), 0.0d);
    assertEquals(0.44d, StringUtils.getJaroWinklerDistance("elephant", "hippo"), 0.0d);
    assertEquals(0.91d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d);
    assertEquals(0.93d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
    assertEquals(0.94d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
    assertEquals(0.9d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);

    // empty inputs and inputs without any common character score zero
    assertEquals(0.0d, StringUtils.getJaroWinklerDistance("", ""), 0.0d);
    assertEquals(0.0d, StringUtils.getJaroWinklerDistance("", "a"), 0.0d);
    assertEquals(0.0d, StringUtils.getJaroWinklerDistance("aaapppp", ""), 0.0d);
    assertEquals(0.0d, StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz"), 0.0d);

    // the score does not depend on the argument order
    assertEquals(0.44d, StringUtils.getJaroWinklerDistance("hippo", "elephant"), 0.0d);
    assertEquals(0.88d, StringUtils.getJaroWinklerDistance("hello", "hallo"), 0.0d);

    // exceptions
    try {
        StringUtils.getJaroWinklerDistance(null, null);
        fail("expecting IllegalArgumentException");
    } catch (final IllegalArgumentException ex) {
        // empty
    }
    try {
        StringUtils.getJaroWinklerDistance(" ", null);
        fail("expecting IllegalArgumentException");
    } catch (final IllegalArgumentException ex) {
        // empty
    }
    try {
        StringUtils.getJaroWinklerDistance(null, "clear");
        fail("expecting IllegalArgumentException");
    } catch (final IllegalArgumentException ex) {
        // empty
    }
}
@Test
public void testGetLevenshteinDistance_StringStringInt() {