diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 2226f6d21..6c10abf87 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -46,6 +46,7 @@ The type attribute can be add,update,fix,remove. + Fix implementation of StringUtils.getJaroWinklerDistance() Fix dead links in StringUtils.getLevenshteinDistance() javadoc "\u2284":"⊄" mapping missing from EntityArrays#HTML40_EXTENDED_ESCAPE Simplify ArrayUtils removeElements by using new decrementAndGet() method diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index 81d824aea..d4fdcf257 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -7679,10 +7679,10 @@ else if (Math.abs(n - m) > threshold) { * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44 * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0 * StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88 - * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91 - * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93 - * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94 - * StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9 + * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.93 + * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95 + * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92 + * StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88 * * * @param first the first String, must not be null @@ -7698,62 +7698,70 @@ public static double getJaroWinklerDistance(final CharSequence first, final Char throw new IllegalArgumentException("Strings must not be null"); } - final double jaro = score(first,second); - final int cl = commonPrefixLength(first, second); - final double matchScore = Math.round((jaro + (DEFAULT_SCALING_FACTOR * cl * (1.0 - jaro))) *100.0)/100.0; - - return matchScore; + int[] mtp = matches(first, second); + double m = mtp[0]; + if (m == 0) { + return 0D; + } + double j = ((m / first.length() + m / second.length() + (m - mtp[1]) / m)) / 3; + double jw = j < 0.7D ? j : j + Math.min(DEFAULT_SCALING_FACTOR, 1D / mtp[3]) * mtp[2] * (1D - j); + return Math.round(jw * 100.0D) / 100.0D; } - /** - * This method returns the Jaro-Winkler score for string matching. - * @param first the first string to be matched - * @param second the second string to be machted - * @return matching score without scaling factor impact - */ - private static double score(final CharSequence first, final CharSequence second) { - String shorter; - String longer; - - // Determine which String is longer. + private static int[] matches(final CharSequence first, final CharSequence second) { + CharSequence max, min; if (first.length() > second.length()) { - longer = first.toString().toLowerCase(); - shorter = second.toString().toLowerCase(); + max = first; + min = second; } else { - longer = second.toString().toLowerCase(); - shorter = first.toString().toLowerCase(); + max = second; + min = first; } - - // Calculate the half length() distance of the shorter String. - final int halflength = shorter.length() / 2 + 1; - - // Find the set of matching characters between the shorter and longer strings. Note that - // the set of matching characters may be different depending on the order of the strings. - final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength); - final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength); - - // If one or both of the sets of common characters is empty, then - // there is no similarity between the two strings. - if (m1.length() == 0 || m2.length() == 0) { - return 0.0; + int range = Math.max(max.length() / 2 - 1, 0); + int[] matchIndexes = new int[min.length()]; + Arrays.fill(matchIndexes, -1); + boolean[] matchFlags = new boolean[max.length()]; + int matches = 0; + for (int mi = 0; mi < min.length(); mi++) { + char c1 = min.charAt(mi); + for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) { + if (!matchFlags[xi] && c1 == max.charAt(xi)) { + matchIndexes[mi] = xi; + matchFlags[xi] = true; + matches++; + break; + } + } } - - // If the set of common characters is not the same size, then - // there is no similarity between the two strings, either. - if (m1.length() != m2.length()) { - return 0.0; + char[] ms1 = new char[matches]; + char[] ms2 = new char[matches]; + for (int i = 0, si = 0; i < min.length(); i++) { + if (matchIndexes[i] != -1) { + ms1[si] = min.charAt(i); + si++; + } } - - // Calculate the number of transposition between the two sets - // of common characters. - final int transpositions = transpositions(m1, m2); - - // Calculate the distance. - final double dist = - (m1.length() / ((double)shorter.length()) + - m2.length() / ((double)longer.length()) + - (m1.length() - transpositions) / ((double)m1.length())) / 3.0; - return dist; + for (int i = 0, si = 0; i < max.length(); i++) { + if (matchFlags[i]) { + ms2[si] = max.charAt(i); + si++; + } + } + int transpositions = 0; + for (int mi = 0; mi < ms1.length; mi++) { + if (ms1[mi] != ms2[mi]) { + transpositions++; + } + } + int prefix = 0; + for (int mi = 0; mi < min.length(); mi++) { + if (first.charAt(mi) == second.charAt(mi)) { + prefix++; + } else { + break; + } + } + return new int[] { matches, transpositions / 2, prefix, max.length() }; } /** @@ -7835,67 +7843,6 @@ public static int getFuzzyDistance(final CharSequence term, final CharSequence q return score; } - /** - * Gets a set of matching characters between two strings. - * - *

- * - * @param first The first string. - * @param second The second string. - * @param limit The maximum distance to consider. - * @return A string contain the set of common characters. - */ - private static String getSetOfMatchingCharacterWithin(final CharSequence first, final CharSequence second, final int limit) { - final StringBuilder common = new StringBuilder(); - final StringBuilder copy = new StringBuilder(second); - - for (int i = 0; i < first.length(); i++) { - final char ch = first.charAt(i); - boolean found = false; - - // See if the character is within the limit positions away from the original position of that character. - for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++) { - if (copy.charAt(j) == ch) { - found = true; - common.append(ch); - copy.setCharAt(j,'*'); - } - } - } - return common.toString(); - } - - /** - * Calculates the number of transposition between two strings. - * @param first The first string. - * @param second The second string. - * @return The number of transposition between the two strings. - */ - private static int transpositions(final CharSequence first, final CharSequence second) { - int transpositions = 0; - for (int i = 0; i < first.length(); i++) { - if (first.charAt(i) != second.charAt(i)) { - transpositions++; - } - } - return transpositions / 2; - } - - /** - * Calculates the number of characters from the beginning of the strings that match exactly one-to-one, - * up to a maximum of four (4) characters. - * @param first The first string. - * @param second The second string. - * @return A number between 0 and 4. - */ - private static int commonPrefixLength(final CharSequence first, final CharSequence second) { - final int result = getCommonPrefix(first.toString(), second.toString()).length(); - - // Limit the result to 4. - return result > 4 ? 4 : result; - } - // startsWith //----------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index 1fa164ea4..6c9219380 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -2182,10 +2182,12 @@ public void testGetJaroWinklerDistance_StringString() { assertEquals(0.93d, StringUtils.getJaroWinklerDistance("frog", "fog"), 0.0d); assertEquals(0.0d, StringUtils.getJaroWinklerDistance("fly", "ant"), 0.0d); assertEquals(0.44d, StringUtils.getJaroWinklerDistance("elephant", "hippo"), 0.0d); - assertEquals(0.91d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d); - assertEquals(0.93d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); - assertEquals(0.94d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); - assertEquals(0.9d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); + assertEquals(0.84d, StringUtils.getJaroWinklerDistance("dwayne", "duane"), 0.0d); + assertEquals(0.93d, StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp"), 0.0d); + assertEquals(0.95d, StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); + assertEquals(0.92d, StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); + assertEquals(0.88d, StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); + assertEquals(0.63d, StringUtils.getJaroWinklerDistance("Haus Ingeborg", "Ingeborg Esser"), 0.0d); } @Test(expected = IllegalArgumentException.class)