LANG-999: Add fuzzy String matching logic to StringUtils. This also closes #20 from github. Thanks to Ben Ripkens.

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1593112 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Benedikt Ritter 2014-05-07 18:42:33 +00:00
parent 287aba5b2e
commit b78295b4fa
3 changed files with 111 additions and 0 deletions

View File

@ -22,6 +22,7 @@
<body>
<release version="3.4" date="tba" description="tba">
<action issue="LANG-999" type="add" dev="britter" due-to="Ben Ripkens">Add fuzzy String matching logic to StringUtils</action>
<action issue="LANG-1006" type="update" dev="britter" due-to="Thiago Andrade">Add wrap (with String or char) to StringUtils</action>
<action issue="LANG-1005" type="update" dev="britter" due-to="Michael Osipov">Extend DurationFormatUtils#formatDurationISO default pattern to match #formatDurationHMS</action>
<action issue="LANG-1007" type="update" dev="britter" due-to="Thiago Andrade">Fixing NumberUtils JAVADoc comments for max methods</action>

View File

@ -7072,6 +7072,85 @@ private static double score(final CharSequence first, final CharSequence second)
return dist;
}
/**
* <p>Determine the fuzzy score which indicates the similarity between two Strings.</p>
*
* <p>This string matching algorithm is similar to the algorithms of editors such as Sublime Text,
* TextMate, Atom and others. One point is given for every matched character. Subsequent
* matches yield two bonus points. A higher score indicates a higher similarity.</p>
*
* <pre>
* StringUtils.getFuzzyDistance(null, null, null) = IllegalArgumentException
* StringUtils.getFuzzyDistance("", "", Locale.ENGLISH) = 0
* StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH) = 0
* StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH) = 1
* StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH) = 1
* StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH) = 2
* StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH) = 4
* StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
* </pre>
*
* @param term a full term that should be matched against, must not be null
* @param query the query that will be matched against a term, must not be null
* @param locale This string matching logic is case insensitive. A locale is necessary to normalize
* both Strings to lower case.
* @return result score
* @throws IllegalArgumentException if either String input {@code null} or Locale input {@code null}
* @since 3.4
*/
public static int getFuzzyDistance(final CharSequence term, final CharSequence query, final Locale locale) {
if (term == null || query == null) {
throw new IllegalArgumentException("Strings must not be null");
} else if (locale == null) {
throw new IllegalArgumentException("Locale must not be null");
}
// fuzzy logic is case insensitive. We normalize the Strings to lower
// case right from the start. Turning characters to lower case
// via Character.toLowerCase(char) is unfortunately insufficient
// as it does not accept a locale.
final String termLowerCase = term.toString().toLowerCase(locale);
final String queryLowerCase = query.toString().toLowerCase(locale);
// the resulting score
int score = 0;
// the position in the term which will be scanned next for potential
// query character matches
int termIndex = 0;
// index of the previously matched character in the term
int previousMatchingCharacterIndex = Integer.MIN_VALUE;
for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
char queryChar = queryLowerCase.charAt(queryIndex);
boolean termCharacterMatchFound = false;
for (; termIndex < termLowerCase.length() && !termCharacterMatchFound; termIndex++) {
char termChar = termLowerCase.charAt(termIndex);
if (queryChar == termChar) {
// simple character matches result in one point
score++;
// subsequent character matches further improve
// the score.
if (previousMatchingCharacterIndex + 1 == termIndex) {
score += 2;
}
previousMatchingCharacterIndex = termIndex;
// we can leave the nested loop. Every character in the
// query can match at most one character in the term.
termCharacterMatchFound = true;
}
}
}
return score;
}
/**
* Gets a set of matching characters between two strings.
*

View File

@ -2018,6 +2018,37 @@ public void testGetJaroWinklerDistance_NullString() throws Exception {
StringUtils.getJaroWinklerDistance(null, "clear");
}
@Test
public void testGetFuzzyDistance() throws Exception {
assertEquals(0, StringUtils.getFuzzyDistance("", "", Locale.ENGLISH));
assertEquals(0, StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH));
assertEquals(1, StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH));
assertEquals(1, StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH));
assertEquals(2, StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH));
assertEquals(4, StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH));
assertEquals(3, StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH));
}
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_NullNullNull() throws Exception {
StringUtils.getFuzzyDistance(null, null, null);
}
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
StringUtils.getFuzzyDistance(" ", null, Locale.ENGLISH);
}
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_NullStringLocale() throws Exception {
StringUtils.getFuzzyDistance(null, "clear", Locale.ENGLISH);
}
@Test(expected = IllegalArgumentException.class)
public void testGetFuzzyDistance_StringStringNull() throws Exception {
StringUtils.getFuzzyDistance(" ", "clear", null);
}
/**
* A sanity check for {@link StringUtils#EMPTY}.
*/