LANG-944: Add the Jaro-Winkler string distance algorithm to StringUtils - partially applying the patch by Rekha Joshi as submitted for further refinement
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1560727 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8cd2339a97
commit
7460a856f2
|
@ -22,6 +22,7 @@
|
||||||
<body>
|
<body>
|
||||||
|
|
||||||
<release version="3.3" date="TBA" description="Bugfix and Feature release">
|
<release version="3.3" date="TBA" description="Bugfix and Feature release">
|
||||||
|
<action issue="LANG-944" type="add" dev="britter" due-to="Rekha Joshi">Add the Jaro-Winkler string distance algorithm to StringUtils</action>
|
||||||
<action issue="LANG-936" type="fix" dev="bayard" due-to="Yaniv Kunda, Eli Lindsey">StringUtils.getLevenshteinDistance with too big of a threshold returns wrong result</action>
|
<action issue="LANG-936" type="fix" dev="bayard" due-to="Yaniv Kunda, Eli Lindsey">StringUtils.getLevenshteinDistance with too big of a threshold returns wrong result</action>
|
||||||
<action issue="LANG-943" type="fix" dev="kinow">Test DurationFormatUtilsTest.testEdgeDuration fails in JDK 1.6, 1.7 and 1.8, BRST time zone</action>
|
<action issue="LANG-943" type="fix" dev="kinow">Test DurationFormatUtilsTest.testEdgeDuration fails in JDK 1.6, 1.7 and 1.8, BRST time zone</action>
|
||||||
<action issue="LANG-613" type="fix" dev="mbenson">ConstructorUtils.getAccessibleConstructor() Does Not Check the Accessibility of Enclosing Classes</action>
|
<action issue="LANG-613" type="fix" dev="mbenson">ConstructorUtils.getAccessibleConstructor() Does Not Check the Accessibility of Enclosing Classes</action>
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.commons.lang3;
|
package org.apache.commons.lang3;
|
||||||
|
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -6975,6 +6976,204 @@ public class StringUtils {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>Find the Jaro Winkler Distance which indicates the similarity score between two Strings.</p>
|
||||||
|
*
|
||||||
|
* <p>The Jaro measure is the weighted sum of percentage of matched characters from each file and transposed characters.
|
||||||
|
* Winkler increased this measure for matching initial characters</p>
|
||||||
|
*
|
||||||
|
* <p>This implementation is based on the Jaro Winkler similarity algorithm
|
||||||
|
* from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a></p>
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* StringUtils.getJaroWinklerDistance(null, null) = IllegalArgumentException
|
||||||
|
* StringUtils.getJaroWinklerDistance("","") = 0.0
|
||||||
|
* StringUtils.getJaroWinklerDistance("","a") = 0.0
|
||||||
|
* StringUtils.getJaroWinklerDistance("aaapppp", "") = 0.0
|
||||||
|
* StringUtils.getJaroWinklerDistance("frog", "fog") = 0.93
|
||||||
|
* StringUtils.getJaroWinklerDistance("fly", "ant") = 0.0
|
||||||
|
* StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44
|
||||||
|
* StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44
|
||||||
|
* StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0
|
||||||
|
* StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88
|
||||||
|
* StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91
|
||||||
|
* StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93
|
||||||
|
* StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94
|
||||||
|
* StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @param s the first String, must not be null
|
||||||
|
* @param t the second String, must not be null
|
||||||
|
* @return result distance
|
||||||
|
* @throws IllegalArgumentException if either String input {@code null}
|
||||||
|
*/
|
||||||
|
public static double getJaroWinklerDistance(CharSequence first, CharSequence second){
|
||||||
|
double matchScore = 0.0;
|
||||||
|
final double DEFAULT_SCALING_FACTOR = 0.1;
|
||||||
|
|
||||||
|
if (first == null || second == null)
|
||||||
|
throw new IllegalArgumentException("Strings must not be null");
|
||||||
|
|
||||||
|
try {
|
||||||
|
double jaro = score(first,second);
|
||||||
|
int cl = commonPrefixLength(first, second);
|
||||||
|
matchScore = Math.round((jaro + (DEFAULT_SCALING_FACTOR * cl * (1.0 - jaro))) *100.0)/100.0;
|
||||||
|
//System.out.format("The score is %f for %s and %s ", matchScore,s1, s2);
|
||||||
|
|
||||||
|
return matchScore;
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
return matchScore;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method returns the jarowinkler score for string matching.
|
||||||
|
* @param strings to be matched
|
||||||
|
* @return matching score without scaling factor impact
|
||||||
|
*/
|
||||||
|
private static double score(CharSequence first, CharSequence second) {
|
||||||
|
String shorter;
|
||||||
|
String longer;
|
||||||
|
|
||||||
|
// Determine which String is longer.
|
||||||
|
if (first.length() > second.length())
|
||||||
|
{
|
||||||
|
longer = first.toString().toLowerCase();
|
||||||
|
shorter = second.toString().toLowerCase();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
longer = second.toString().toLowerCase();
|
||||||
|
shorter = first.toString().toLowerCase();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the half length() distance of the shorter String.
|
||||||
|
int halflength = (shorter.length() / 2) + 1;
|
||||||
|
|
||||||
|
// Find the set of matching characters between the shorter and longer strings. Note that
|
||||||
|
// the set of matching characters may be different depending on the order of the strings.
|
||||||
|
String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength);
|
||||||
|
String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength);
|
||||||
|
|
||||||
|
|
||||||
|
// If one or both of the sets of common characters is empty, then
|
||||||
|
// there is no similarity between the two strings.
|
||||||
|
if (m1.length() == 0 || m2.length() == 0) return 0.0;
|
||||||
|
|
||||||
|
// If the set of common characters is not the same size, then
|
||||||
|
// there is no similarity between the two strings, either.
|
||||||
|
if (m1.length() != m2.length()) return 0.0;
|
||||||
|
|
||||||
|
// Calculate the number of transposition between the two sets
|
||||||
|
// of common characters.
|
||||||
|
int transpositions = transpositions(m1, m2);
|
||||||
|
|
||||||
|
// Calculate the distance.
|
||||||
|
double dist =
|
||||||
|
(m1.length() / ((double)shorter.length()) +
|
||||||
|
m2.length() / ((double)longer.length()) +
|
||||||
|
(m1.length() - transpositions) / ((double)m1.length())) / 3.0;
|
||||||
|
return dist;
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a set of matching characters between two strings.
|
||||||
|
*
|
||||||
|
* @param first The first string.
|
||||||
|
* @param second The second string.
|
||||||
|
* @param limit The maximum distance to consider.
|
||||||
|
* @return A string contain the set of common characters.
|
||||||
|
* @remarks Two characters from the first string and the second string are considered matching if the character's
|
||||||
|
* respective positions are no farther than the limit value.
|
||||||
|
*/
|
||||||
|
private static String getSetOfMatchingCharacterWithin(CharSequence first, CharSequence second, int limit)
|
||||||
|
{
|
||||||
|
|
||||||
|
StringBuilder common = new StringBuilder();
|
||||||
|
StringBuilder copy = new StringBuilder(second);
|
||||||
|
for (int i = 0; i < first.length(); i++)
|
||||||
|
{
|
||||||
|
char ch = first.charAt(i);
|
||||||
|
boolean found = false;
|
||||||
|
|
||||||
|
// See if the character is within the limit positions away from the original position of that character.
|
||||||
|
for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++)
|
||||||
|
{
|
||||||
|
if (copy.charAt(j) == ch)
|
||||||
|
{
|
||||||
|
found = true;
|
||||||
|
common.append(ch);
|
||||||
|
copy.setCharAt(j,'*');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return common.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of transposition between two strings.
|
||||||
|
* @param first The first string.
|
||||||
|
* @param second The second string.
|
||||||
|
* @return The number of transposition between the two strings.
|
||||||
|
*/
|
||||||
|
private static int transpositions(CharSequence first, CharSequence second)
|
||||||
|
{
|
||||||
|
int transpositions = 0;
|
||||||
|
for (int i = 0; i < first.length(); i++)
|
||||||
|
{
|
||||||
|
if (first.charAt(i) != second.charAt(i))
|
||||||
|
{
|
||||||
|
transpositions++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
transpositions /= 2;
|
||||||
|
return transpositions;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of characters from the beginning of the strings that match exactly one-to-one,
|
||||||
|
* up to a maximum of four (4) characters.
|
||||||
|
* @param first The first string.
|
||||||
|
* @param second The second string.
|
||||||
|
* @return A number between 0 and 4.
|
||||||
|
*/
|
||||||
|
private static int commonPrefixLength(CharSequence first, CharSequence second)
|
||||||
|
{
|
||||||
|
String shorter;
|
||||||
|
String longer;
|
||||||
|
|
||||||
|
// Determine which String is longer.
|
||||||
|
if (first.length() > second.length())
|
||||||
|
{
|
||||||
|
longer = first.toString().toLowerCase();
|
||||||
|
shorter = second.toString().toLowerCase();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
longer = second.toString().toLowerCase();
|
||||||
|
shorter = first.toString().toLowerCase();
|
||||||
|
}
|
||||||
|
|
||||||
|
int result = 0;
|
||||||
|
|
||||||
|
// Iterate through the shorter string.
|
||||||
|
for (int i = 0; i < shorter.length(); i++)
|
||||||
|
{
|
||||||
|
if (shorter.charAt(i) != longer.charAt(i))
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Limit the result to 4.
|
||||||
|
return result > 4? 4: result;
|
||||||
|
}
|
||||||
|
|
||||||
// startsWith
|
// startsWith
|
||||||
//-----------------------------------------------------------------------
|
//-----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -1919,6 +1919,45 @@ public class StringUtilsTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetJaroWinklerDistance_StringString() {
|
||||||
|
|
||||||
|
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("frog", "fog"), 0.0d);
|
||||||
|
assertEquals(0.0d, StringUtils.getJaroWinklerDistance("fly", "ant"), 0.0d);
|
||||||
|
assertEquals(0.44d, StringUtils.getJaroWinklerDistance("elephant", "hippo"), 0.0d);
|
||||||
|
assertEquals(0.91d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d);
|
||||||
|
assertEquals(0.93d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
|
||||||
|
assertEquals(0.94d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
|
||||||
|
assertEquals(0.9d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
|
||||||
|
// exceptions
|
||||||
|
try {
|
||||||
|
@SuppressWarnings("unused")
|
||||||
|
final
|
||||||
|
double d = StringUtils.getJaroWinklerDistance(null, null);
|
||||||
|
fail("expecting IllegalArgumentException");
|
||||||
|
} catch (final IllegalArgumentException ex) {
|
||||||
|
// empty
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
@SuppressWarnings("unused")
|
||||||
|
final
|
||||||
|
double d = StringUtils.getJaroWinklerDistance(" ", null);
|
||||||
|
fail("expecting IllegalArgumentException");
|
||||||
|
} catch (final IllegalArgumentException ex) {
|
||||||
|
// empty
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
@SuppressWarnings("unused")
|
||||||
|
final
|
||||||
|
double d = StringUtils.getJaroWinklerDistance(null, "clear");
|
||||||
|
fail("expecting IllegalArgumentException");
|
||||||
|
} catch (final IllegalArgumentException ex) {
|
||||||
|
// empty
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetLevenshteinDistance_StringStringInt() {
|
public void testGetLevenshteinDistance_StringStringInt() {
|
||||||
// empty strings
|
// empty strings
|
||||||
|
|
Loading…
Reference in New Issue