From b78295b4faca2fd26024a26ef4638b0dce5d40a3 Mon Sep 17 00:00:00 2001 From: Benedikt Ritter Date: Wed, 7 May 2014 18:42:33 +0000 Subject: [PATCH] LANG-999: Add fuzzy String matching logic to StringUtils. This also closes #20 from github. Thanks to Ben Ripkens. git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1593112 13f79535-47bb-0310-9956-ffa450edef68 --- src/changes/changes.xml | 1 + .../org/apache/commons/lang3/StringUtils.java | 79 +++++++++++++++++++ .../apache/commons/lang3/StringUtilsTest.java | 31 ++++++++ 3 files changed, 111 insertions(+) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 0a29cd65b..e63d21950 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -22,6 +22,7 @@ + Add fuzzy String matching logic to StringUtils Add wrap (with String or char) to StringUtils Extend DurationFormatUtils#formatDurationISO default pattern to match #formatDurationHMS Fixing NumberUtils JAVADoc comments for max methods diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index 02fb3a4df..0d9f09a06 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -7072,6 +7072,85 @@ private static double score(final CharSequence first, final CharSequence second) return dist; } + /** + *

Determine the fuzzy score which indicates the similarity between two Strings.

+ * + *

This string matching algorithm is similar to the algorithms of editors such as Sublime Text, + * TextMate, Atom and others. One point is given for every matched character. A match that directly + * follows the previous match in the term yields two bonus points. A higher score indicates a higher similarity.

+ * + *
+     * StringUtils.getFuzzyDistance(null, null, null)                                    = IllegalArgumentException
+     * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)                              = 0
+     * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)                     = 0
+     * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)                         = 1
+     * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)                     = 1
+     * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)                    = 2
+     * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)                    = 4
+     * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
+     * 
+ * + * @param term a full term that should be matched against, must not be null + * @param query the query that will be matched against a term, must not be null + * @param locale This string matching logic is case insensitive. A locale is necessary to normalize + * both Strings to lower case. + * @return result score + * @throws IllegalArgumentException if either String input is {@code null} or the Locale input is {@code null} + * @since 3.4 + */ + public static int getFuzzyDistance(final CharSequence term, final CharSequence query, final Locale locale) { + if (term == null || query == null) { + throw new IllegalArgumentException("Strings must not be null"); + } else if (locale == null) { + throw new IllegalArgumentException("Locale must not be null"); + } + + // fuzzy logic is case insensitive. We normalize the Strings to lower + // case right from the start. Turning characters to lower case + // via Character.toLowerCase(char) is unfortunately insufficient + // as it does not accept a locale. + final String termLowerCase = term.toString().toLowerCase(locale); + final String queryLowerCase = query.toString().toLowerCase(locale); + + // the resulting score + int score = 0; + + // the position in the term which will be scanned next for potential + // query character matches + int termIndex = 0; + + // index of the previously matched character in the term + int previousMatchingCharacterIndex = Integer.MIN_VALUE; + + for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) { + char queryChar = queryLowerCase.charAt(queryIndex); + + boolean termCharacterMatchFound = false; + for (; termIndex < termLowerCase.length() && !termCharacterMatchFound; termIndex++) { + char termChar = termLowerCase.charAt(termIndex); + + if (queryChar == termChar) { + // simple character matches result in one point + score++; + + // subsequent character matches further improve + // the score. 
+ if (previousMatchingCharacterIndex + 1 == termIndex) { + score += 2; + } + + previousMatchingCharacterIndex = termIndex; + + // we can leave the nested loop. Every character in the + // query can match at most one character in the term. + termCharacterMatchFound = true; + } + } + } + + return score; + } + /** * Gets a set of matching characters between two strings. * diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index 974edc8dc..0341c9c48 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -2018,6 +2018,37 @@ public void testGetJaroWinklerDistance_NullString() throws Exception { StringUtils.getJaroWinklerDistance(null, "clear"); } + @Test + public void testGetFuzzyDistance() throws Exception { + assertEquals(0, StringUtils.getFuzzyDistance("", "", Locale.ENGLISH)); + assertEquals(0, StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH)); + assertEquals(1, StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH)); + assertEquals(1, StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH)); + assertEquals(2, StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH)); + assertEquals(4, StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH)); + assertEquals(3, StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH)); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_NullNullNull() throws Exception { + StringUtils.getFuzzyDistance(null, null, null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_StringNullLoclae() throws Exception { + StringUtils.getFuzzyDistance(" ", null, Locale.ENGLISH); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_NullStringLocale() throws Exception { + StringUtils.getFuzzyDistance(null, 
"clear", Locale.ENGLISH); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_StringStringNull() throws Exception { + StringUtils.getFuzzyDistance(" ", "clear", null); + } + /** * A sanity check for {@link StringUtils#EMPTY}. */