LANG-944: Add the Jaro-Winkler string distance algorithm to StringUtils - partially applying the patch by Rekha Joshi as submitted for further refinement
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1560727 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8cd2339a97
commit
7460a856f2
|
@ -22,6 +22,7 @@
|
|||
<body>
|
||||
|
||||
<release version="3.3" date="TBA" description="Bugfix and Feature release">
|
||||
<action issue="LANG-944" type="add" dev="britter" due-to="Rekha Joshi">Add the Jaro-Winkler string distance algorithm to StringUtils</action>
|
||||
<action issue="LANG-936" type="fix" dev="bayard" due-to="Yaniv Kunda, Eli Lindsey">StringUtils.getLevenshteinDistance with too big of a threshold returns wrong result</action>
|
||||
<action issue="LANG-943" type="fix" dev="kinow">Test DurationFormatUtilsTest.testEdgeDuration fails in JDK 1.6, 1.7 and 1.8, BRST time zone</action>
|
||||
<action issue="LANG-613" type="fix" dev="mbenson">ConstructorUtils.getAccessibleConstructor() Does Not Check the Accessibility of Enclosing Classes</action>
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.commons.lang3;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.ArrayList;
|
||||
|
@ -6975,6 +6976,204 @@ public class StringUtils {
|
|||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Find the Jaro Winkler Distance which indicates the similarity score between two Strings.</p>
|
||||
*
|
||||
* <p>The Jaro measure is the weighted sum of percentage of matched characters from each file and transposed characters.
|
||||
* Winkler increased this measure for matching initial characters</p>
|
||||
*
|
||||
* <p>This implementation is based on the Jaro Winkler similarity algorithm
|
||||
* from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a></p>
|
||||
*
|
||||
* <pre>
|
||||
* StringUtils.getJaroWinklerDistance(null, null) = IllegalArgumentException
|
||||
* StringUtils.getJaroWinklerDistance("","") = 0.0
|
||||
* StringUtils.getJaroWinklerDistance("","a") = 0.0
|
||||
* StringUtils.getJaroWinklerDistance("aaapppp", "") = 0.0
|
||||
* StringUtils.getJaroWinklerDistance("frog", "fog") = 0.93
|
||||
* StringUtils.getJaroWinklerDistance("fly", "ant") = 0.0
|
||||
* StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44
|
||||
* StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
|
||||
* StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
|
||||
* StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
|
||||
* StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91
|
||||
* StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93
|
||||
* StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94
|
||||
* StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9
|
||||
* </pre>
|
||||
*
|
||||
* @param s the first String, must not be null
|
||||
* @param t the second String, must not be null
|
||||
* @return result distance
|
||||
* @throws IllegalArgumentException if either String input {@code null}
|
||||
*/
|
||||
public static double getJaroWinklerDistance(CharSequence first, CharSequence second){
|
||||
double matchScore = 0.0;
|
||||
final double DEFAULT_SCALING_FACTOR = 0.1;
|
||||
|
||||
if (first == null || second == null)
|
||||
throw new IllegalArgumentException("Strings must not be null");
|
||||
|
||||
try {
|
||||
double jaro = score(first,second);
|
||||
int cl = commonPrefixLength(first, second);
|
||||
matchScore = Math.round((jaro + (DEFAULT_SCALING_FACTOR * cl * (1.0 - jaro))) *100.0)/100.0;
|
||||
//System.out.format("The score is %f for %s and %s ", matchScore,s1, s2);
|
||||
|
||||
return matchScore;
|
||||
|
||||
} catch (Exception e) {
|
||||
|
||||
}
|
||||
return matchScore;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method returns the jarowinkler score for string matching.
|
||||
* @param strings to be matched
|
||||
* @return matching score without scaling factor impact
|
||||
*/
|
||||
private static double score(CharSequence first, CharSequence second) {
|
||||
String shorter;
|
||||
String longer;
|
||||
|
||||
// Determine which String is longer.
|
||||
if (first.length() > second.length())
|
||||
{
|
||||
longer = first.toString().toLowerCase();
|
||||
shorter = second.toString().toLowerCase();
|
||||
}
|
||||
else
|
||||
{
|
||||
longer = second.toString().toLowerCase();
|
||||
shorter = first.toString().toLowerCase();
|
||||
}
|
||||
|
||||
// Calculate the half length() distance of the shorter String.
|
||||
int halflength = (shorter.length() / 2) + 1;
|
||||
|
||||
// Find the set of matching characters between the shorter and longer strings. Note that
|
||||
// the set of matching characters may be different depending on the order of the strings.
|
||||
String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength);
|
||||
String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength);
|
||||
|
||||
|
||||
// If one or both of the sets of common characters is empty, then
|
||||
// there is no similarity between the two strings.
|
||||
if (m1.length() == 0 || m2.length() == 0) return 0.0;
|
||||
|
||||
// If the set of common characters is not the same size, then
|
||||
// there is no similarity between the two strings, either.
|
||||
if (m1.length() != m2.length()) return 0.0;
|
||||
|
||||
// Calculate the number of transposition between the two sets
|
||||
// of common characters.
|
||||
int transpositions = transpositions(m1, m2);
|
||||
|
||||
// Calculate the distance.
|
||||
double dist =
|
||||
(m1.length() / ((double)shorter.length()) +
|
||||
m2.length() / ((double)longer.length()) +
|
||||
(m1.length() - transpositions) / ((double)m1.length())) / 3.0;
|
||||
return dist;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a set of matching characters between two strings.
|
||||
*
|
||||
* @param first The first string.
|
||||
* @param second The second string.
|
||||
* @param limit The maximum distance to consider.
|
||||
* @return A string contain the set of common characters.
|
||||
* @remarks Two characters from the first string and the second string are considered matching if the character's
|
||||
* respective positions are no farther than the limit value.
|
||||
*/
|
||||
private static String getSetOfMatchingCharacterWithin(CharSequence first, CharSequence second, int limit)
|
||||
{
|
||||
|
||||
StringBuilder common = new StringBuilder();
|
||||
StringBuilder copy = new StringBuilder(second);
|
||||
for (int i = 0; i < first.length(); i++)
|
||||
{
|
||||
char ch = first.charAt(i);
|
||||
boolean found = false;
|
||||
|
||||
// See if the character is within the limit positions away from the original position of that character.
|
||||
for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++)
|
||||
{
|
||||
if (copy.charAt(j) == ch)
|
||||
{
|
||||
found = true;
|
||||
common.append(ch);
|
||||
copy.setCharAt(j,'*');
|
||||
}
|
||||
}
|
||||
}
|
||||
return common.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of transposition between two strings.
|
||||
* @param first The first string.
|
||||
* @param second The second string.
|
||||
* @return The number of transposition between the two strings.
|
||||
*/
|
||||
private static int transpositions(CharSequence first, CharSequence second)
|
||||
{
|
||||
int transpositions = 0;
|
||||
for (int i = 0; i < first.length(); i++)
|
||||
{
|
||||
if (first.charAt(i) != second.charAt(i))
|
||||
{
|
||||
transpositions++;
|
||||
}
|
||||
}
|
||||
transpositions /= 2;
|
||||
return transpositions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of characters from the beginning of the strings that match exactly one-to-one,
|
||||
* up to a maximum of four (4) characters.
|
||||
* @param first The first string.
|
||||
* @param second The second string.
|
||||
* @return A number between 0 and 4.
|
||||
*/
|
||||
private static int commonPrefixLength(CharSequence first, CharSequence second)
|
||||
{
|
||||
String shorter;
|
||||
String longer;
|
||||
|
||||
// Determine which String is longer.
|
||||
if (first.length() > second.length())
|
||||
{
|
||||
longer = first.toString().toLowerCase();
|
||||
shorter = second.toString().toLowerCase();
|
||||
}
|
||||
else
|
||||
{
|
||||
longer = second.toString().toLowerCase();
|
||||
shorter = first.toString().toLowerCase();
|
||||
}
|
||||
|
||||
int result = 0;
|
||||
|
||||
// Iterate through the shorter string.
|
||||
for (int i = 0; i < shorter.length(); i++)
|
||||
{
|
||||
if (shorter.charAt(i) != longer.charAt(i))
|
||||
{
|
||||
break;
|
||||
}
|
||||
result++;
|
||||
}
|
||||
|
||||
// Limit the result to 4.
|
||||
return result > 4? 4: result;
|
||||
}
|
||||
|
||||
// startsWith
|
||||
//-----------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -1919,6 +1919,45 @@ public class StringUtilsTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetJaroWinklerDistance_StringString() {
|
||||
|
||||
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("frog", "fog"), 0.0d);
|
||||
assertEquals(0.0d, StringUtils.getJaroWinklerDistance("fly", "ant"), 0.0d);
|
||||
assertEquals(0.44d, StringUtils.getJaroWinklerDistance("elephant", "hippo"), 0.0d);
|
||||
assertEquals(0.91d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d);
|
||||
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
|
||||
assertEquals(0.94d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
|
||||
assertEquals(0.9d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
|
||||
// exceptions
|
||||
try {
|
||||
@SuppressWarnings("unused")
|
||||
final
|
||||
double d = StringUtils.getJaroWinklerDistance(null, null);
|
||||
fail("expecting IllegalArgumentException");
|
||||
} catch (final IllegalArgumentException ex) {
|
||||
// empty
|
||||
}
|
||||
|
||||
try {
|
||||
@SuppressWarnings("unused")
|
||||
final
|
||||
double d = StringUtils.getJaroWinklerDistance(" ", null);
|
||||
fail("expecting IllegalArgumentException");
|
||||
} catch (final IllegalArgumentException ex) {
|
||||
// empty
|
||||
}
|
||||
try {
|
||||
@SuppressWarnings("unused")
|
||||
final
|
||||
double d = StringUtils.getJaroWinklerDistance(null, "clear");
|
||||
fail("expecting IllegalArgumentException");
|
||||
} catch (final IllegalArgumentException ex) {
|
||||
// empty
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetLevenshteinDistance_StringStringInt() {
|
||||
// empty strings
|
||||
|
|
Loading…
Reference in New Issue