diff --git a/src/changes/changes.xml b/src/changes/changes.xml index b73f6bcba..437d59bf9 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -46,6 +46,7 @@ The type attribute can be add,update,fix,remove. + Wrong name or result of StringUtils#getJaroWinklerDistance StringUtils#join(T...): warning: [unchecked] Possible heap pollution from parameterized vararg type T Multiple calls of org.apache.commons.lang3.concurrent.LazyInitializer.initialize() are possible. StrBuilder#replaceAll ArrayIndexOutOfBoundsException diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index f1431775a..a6c7c5625 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -8009,7 +8009,9 @@ else if (Math.abs(n - m) > threshold) { * @return result distance * @throws IllegalArgumentException if either String input {@code null} * @since 3.3 + * @deprecated as of 3.6, due to a misleading name, use {@link #getJaroWinklerSimilarity(CharSequence, CharSequence)} instead */ + @Deprecated public static double getJaroWinklerDistance(final CharSequence first, final CharSequence second) { final double DEFAULT_SCALING_FACTOR = 0.1; @@ -8027,6 +8029,55 @@ public static double getJaroWinklerDistance(final CharSequence first, final Char return Math.round(jw * 100.0D) / 100.0D; } + /** + *

Finds the Jaro-Winkler similarity, which indicates the similarity score between two Strings.

+ * + *

The Jaro measure is the weighted sum of the percentage of matched characters from each string and transposed characters. + * Winkler increased this measure for matching initial characters.

+ * + *

This implementation is based on the Jaro Winkler similarity algorithm + * from http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance.

+ * + *
+     * StringUtils.getJaroWinklerSimilarity(null, null)          = IllegalArgumentException
+     * StringUtils.getJaroWinklerSimilarity("","")               = 0.0
+     * StringUtils.getJaroWinklerSimilarity("","a")              = 0.0
+     * StringUtils.getJaroWinklerSimilarity("aaapppp", "")       = 0.0
+     * StringUtils.getJaroWinklerSimilarity("frog", "fog")       = 0.93
+     * StringUtils.getJaroWinklerSimilarity("fly", "ant")        = 0.0
+     * StringUtils.getJaroWinklerSimilarity("elephant", "hippo") = 0.44
+     * StringUtils.getJaroWinklerSimilarity("hippo", "elephant") = 0.44
+     * StringUtils.getJaroWinklerSimilarity("hippo", "zzzzzzzz") = 0.0
+     * StringUtils.getJaroWinklerSimilarity("hello", "hallo")    = 0.88
+     * StringUtils.getJaroWinklerSimilarity("ABC Corporation", "ABC Corp") = 0.93
+     * StringUtils.getJaroWinklerSimilarity("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95
+     * StringUtils.getJaroWinklerSimilarity("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
+     * StringUtils.getJaroWinklerSimilarity("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
+     * 
+ * + * @param first the first String, must not be null + * @param second the second String, must not be null + * @return result similarity + * @throws IllegalArgumentException if either String input {@code null} + * @since 3.6 + */ + public static double getJaroWinklerSimilarity(final CharSequence first, final CharSequence second) { + final double DEFAULT_SCALING_FACTOR = 0.1; + + if (first == null || second == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + int[] mtp = matches(first, second); + double m = mtp[0]; + if (m == 0) { + return 0D; + } + double j = ((m / first.length() + m / second.length() + (m - mtp[1]) / m)) / 3; + double jw = j < 0.7D ? j : j + Math.min(DEFAULT_SCALING_FACTOR, 1D / mtp[3]) * mtp[2] * (1D - j); + return Math.round(jw * 100.0D) / 100.0D; + } + private static int[] matches(final CharSequence first, final CharSequence second) { CharSequence max, min; if (first.length() > second.length()) { diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index 82fee9248..524bd8ddb 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -2384,6 +2384,34 @@ public void testGetJaroWinklerDistance_NullString() throws Exception { StringUtils.getJaroWinklerDistance(null, "clear"); } + @Test + public void testGetJaroWinklerSimilarity_StringString() { + assertEquals(0.93d, StringUtils.getJaroWinklerSimilarity("frog", "fog"), 0.0d); + assertEquals(0.0d, StringUtils.getJaroWinklerSimilarity("fly", "ant"), 0.0d); + assertEquals(0.44d, StringUtils.getJaroWinklerSimilarity("elephant", "hippo"), 0.0d); + assertEquals(0.84d, StringUtils.getJaroWinklerSimilarity("dwayne", "duane"), 0.0d); + assertEquals(0.93d, StringUtils.getJaroWinklerSimilarity("ABC Corporation", "ABC Corp"), 0.0d); + assertEquals(0.95d, StringUtils.getJaroWinklerSimilarity("D N H Enterprises Inc", 
"D & H Enterprises, Inc."), 0.0d); + assertEquals(0.92d, StringUtils.getJaroWinklerSimilarity("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); + assertEquals(0.88d, StringUtils.getJaroWinklerSimilarity("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); + assertEquals(0.63d, StringUtils.getJaroWinklerSimilarity("Haus Ingeborg", "Ingeborg Esser"), 0.0d); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetJaroWinklerSimilarity_NullNull() throws Exception { + StringUtils.getJaroWinklerSimilarity(null, null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetJaroWinklerSimilarity_StringNull() throws Exception { + StringUtils.getJaroWinklerSimilarity(" ", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetJaroWinklerSimilarity_NullString() throws Exception { + StringUtils.getJaroWinklerSimilarity(null, "clear"); + } + @Test public void testGetFuzzyDistance() throws Exception { assertEquals(0, StringUtils.getFuzzyDistance("", "", Locale.ENGLISH));