LANG-1269: Wrong name or result of StringUtils#getJaroWinklerDistance (closes #198)
deprecate StringUtils#getJaroWinklerDistance and add StringUtils#getJaroWinklerSimilarity instead
This commit is contained in:
parent
1e5c2b8779
commit
a40b2a907a
|
@ -46,6 +46,7 @@ The <action> type attribute can be add,update,fix,remove.
|
|||
<body>
|
||||
|
||||
<release version="3.6" date="2016-MM-DD" description="TBD">
|
||||
<action issue="LANG-1269" type="fix" dev="paschuma">Wrong name or result of StringUtils#getJaroWinklerDistance</action>
|
||||
<action issue="LANG-1188" type="fix" dev="paschuma">StringUtils#join(T...): warning: [unchecked] Possible heap pollution from parameterized vararg type T</action>
|
||||
<action issue="LANG-1144" type="fix" dev="ggregory" due-to="Waldemar Maier, Gary Gregory">Multiple calls of org.apache.commons.lang3.concurrent.LazyInitializer.initialize() are possible.</action>
|
||||
<action issue="LANG-1276" type="fix" dev="pschumacher" due-to="Andy Klimczak">StrBuilder#replaceAll ArrayIndexOutOfBoundsException</action>
|
||||
|
|
|
@ -8009,7 +8009,9 @@ public class StringUtils {
|
|||
* @return result distance
|
||||
* @throws IllegalArgumentException if either String input is {@code null}
|
||||
* @since 3.3
|
||||
* @deprecated as of 3.6, due to a misleading name, use {@link #getJaroWinklerSimilarity(CharSequence, CharSequence)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public static double getJaroWinklerDistance(final CharSequence first, final CharSequence second) {
|
||||
final double DEFAULT_SCALING_FACTOR = 0.1;
|
||||
|
||||
|
@ -8027,6 +8029,55 @@ public class StringUtils {
|
|||
return Math.round(jw * 100.0D) / 100.0D;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Find the Jaro Winkler Similarity which indicates the similarity score between two Strings.</p>
|
||||
*
|
||||
* <p>The Jaro measure is the weighted sum of the percentage of matched characters from each String and the transposed characters.
|
||||
* Winkler increased this measure for matching initial characters.</p>
|
||||
*
|
||||
* <p>This implementation is based on the Jaro Winkler similarity algorithm
|
||||
* from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.</p>
|
||||
*
|
||||
* <pre>
|
||||
* StringUtils.getJaroWinklerSimilarity(null, null) = IllegalArgumentException
|
||||
* StringUtils.getJaroWinklerSimilarity("","") = 0.0
|
||||
* StringUtils.getJaroWinklerSimilarity("","a") = 0.0
|
||||
* StringUtils.getJaroWinklerSimilarity("aaapppp", "") = 0.0
|
||||
* StringUtils.getJaroWinklerSimilarity("frog", "fog") = 0.93
|
||||
* StringUtils.getJaroWinklerSimilarity("fly", "ant") = 0.0
|
||||
* StringUtils.getJaroWinklerSimilarity("elephant", "hippo") = 0.44
|
||||
* StringUtils.getJaroWinklerSimilarity("hippo", "elephant") = 0.44
|
||||
* StringUtils.getJaroWinklerSimilarity("hippo", "zzzzzzzz") = 0.0
|
||||
* StringUtils.getJaroWinklerSimilarity("hello", "hallo") = 0.88
|
||||
* StringUtils.getJaroWinklerSimilarity("ABC Corporation", "ABC Corp") = 0.93
|
||||
* StringUtils.getJaroWinklerSimilarity("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95
|
||||
* StringUtils.getJaroWinklerSimilarity("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
|
||||
* StringUtils.getJaroWinklerSimilarity("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
|
||||
* </pre>
|
||||
*
|
||||
* @param first the first String, must not be null
|
||||
* @param second the second String, must not be null
|
||||
* @return the similarity score of the two Strings, rounded to two decimal places
|
||||
* @throws IllegalArgumentException if either String input is {@code null}
|
||||
* @since 3.6
|
||||
*/
|
||||
public static double getJaroWinklerSimilarity(final CharSequence first, final CharSequence second) {
|
||||
final double DEFAULT_SCALING_FACTOR = 0.1;
|
||||
|
||||
if (first == null || second == null) {
|
||||
throw new IllegalArgumentException("Strings must not be null");
|
||||
}
|
||||
|
||||
int[] mtp = matches(first, second);
|
||||
double m = mtp[0];
|
||||
if (m == 0) {
|
||||
return 0D;
|
||||
}
|
||||
double j = ((m / first.length() + m / second.length() + (m - mtp[1]) / m)) / 3;
|
||||
double jw = j < 0.7D ? j : j + Math.min(DEFAULT_SCALING_FACTOR, 1D / mtp[3]) * mtp[2] * (1D - j);
|
||||
return Math.round(jw * 100.0D) / 100.0D;
|
||||
}
|
||||
|
||||
private static int[] matches(final CharSequence first, final CharSequence second) {
|
||||
CharSequence max, min;
|
||||
if (first.length() > second.length()) {
|
||||
|
|
|
@ -2384,6 +2384,34 @@ public class StringUtilsTest {
|
|||
StringUtils.getJaroWinklerDistance(null, "clear");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetJaroWinklerSimilarity_StringString() {
|
||||
assertEquals(0.93d, StringUtils.getJaroWinklerSimilarity("frog", "fog"), 0.0d);
|
||||
assertEquals(0.0d, StringUtils.getJaroWinklerSimilarity("fly", "ant"), 0.0d);
|
||||
assertEquals(0.44d, StringUtils.getJaroWinklerSimilarity("elephant", "hippo"), 0.0d);
|
||||
assertEquals(0.84d, StringUtils.getJaroWinklerSimilarity("dwayne", "duane"), 0.0d);
|
||||
assertEquals(0.93d, StringUtils.getJaroWinklerSimilarity("ABC Corporation", "ABC Corp"), 0.0d);
|
||||
assertEquals(0.95d, StringUtils.getJaroWinklerSimilarity("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d);
|
||||
assertEquals(0.92d, StringUtils.getJaroWinklerSimilarity("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d);
|
||||
assertEquals(0.88d, StringUtils.getJaroWinklerSimilarity("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d);
|
||||
assertEquals(0.63d, StringUtils.getJaroWinklerSimilarity("Haus Ingeborg", "Ingeborg Esser"), 0.0d);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testGetJaroWinklerSimilarity_NullNull() throws Exception {
|
||||
StringUtils.getJaroWinklerSimilarity(null, null);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testGetJaroWinklerSimilarity_StringNull() throws Exception {
|
||||
StringUtils.getJaroWinklerSimilarity(" ", null);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testGetJaroWinklerSimilarity_NullString() throws Exception {
|
||||
StringUtils.getJaroWinklerSimilarity(null, "clear");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetFuzzyDistance() throws Exception {
|
||||
assertEquals(0, StringUtils.getFuzzyDistance("", "", Locale.ENGLISH));
|
||||
|
|
Loading…
Reference in New Issue