LANG-999: Add fuzzy String matching logic to StringUtils. This also closes #20 from github. Thanks to Ben Ripkens.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1593112 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
287aba5b2e
commit
b78295b4fa
|
@ -22,6 +22,7 @@
|
|||
<body>
|
||||
|
||||
<release version="3.4" date="tba" description="tba">
|
||||
<action issue="LANG-999" type="add" dev="britter" due-to="Ben Ripkens">Add fuzzy String matching logic to StringUtils</action>
|
||||
<action issue="LANG-1006" type="update" dev="britter" due-to="Thiago Andrade">Add wrap (with String or char) to StringUtils</action>
|
||||
<action issue="LANG-1005" type="update" dev="britter" due-to="Michael Osipov">Extend DurationFormatUtils#formatDurationISO default pattern to match #formatDurationHMS</action>
|
||||
<action issue="LANG-1007" type="update" dev="britter" due-to="Thiago Andrade">Fixing NumberUtils JAVADoc comments for max methods</action>
|
||||
|
|
|
@ -7072,6 +7072,85 @@ public class StringUtils {
|
|||
return dist;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Determine the fuzzy score which indicates the similarity between two Strings.</p>
|
||||
*
|
||||
* <p>This string matching algorithm is similar to the algorithms of editors such as Sublime Text,
|
||||
* TextMate, Atom and others. One point is given for every matched character. Subsequent
|
||||
* matches yield two bonus points. A higher score indicates a higher similarity.</p>
|
||||
*
|
||||
* <pre>
|
||||
* StringUtils.getFuzzyDistance(null, null, null) = IllegalArgumentException
|
||||
* StringUtils.getFuzzyDistance("", "", Locale.ENGLISH) = 0
|
||||
* StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH) = 0
|
||||
* StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH) = 1
|
||||
* StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH) = 1
|
||||
* StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH) = 2
|
||||
* StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH) = 4
|
||||
* StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
|
||||
* </pre>
|
||||
*
|
||||
* @param term a full term that should be matched against, must not be null
|
||||
* @param query the query that will be matched against a term, must not be null
|
||||
* @param locale This string matching logic is case insensitive. A locale is necessary to normalize
|
||||
* both Strings to lower case.
|
||||
* @return result score
|
||||
* @throws IllegalArgumentException if either String input {@code null} or Locale input {@code null}
|
||||
* @since 3.4
|
||||
*/
|
||||
public static int getFuzzyDistance(final CharSequence term, final CharSequence query, final Locale locale) {
|
||||
if (term == null || query == null) {
|
||||
throw new IllegalArgumentException("Strings must not be null");
|
||||
} else if (locale == null) {
|
||||
throw new IllegalArgumentException("Locale must not be null");
|
||||
}
|
||||
|
||||
// fuzzy logic is case insensitive. We normalize the Strings to lower
|
||||
// case right from the start. Turning characters to lower case
|
||||
// via Character.toLowerCase(char) is unfortunately insufficient
|
||||
// as it does not accept a locale.
|
||||
final String termLowerCase = term.toString().toLowerCase(locale);
|
||||
final String queryLowerCase = query.toString().toLowerCase(locale);
|
||||
|
||||
// the resulting score
|
||||
int score = 0;
|
||||
|
||||
// the position in the term which will be scanned next for potential
|
||||
// query character matches
|
||||
int termIndex = 0;
|
||||
|
||||
// index of the previously matched character in the term
|
||||
int previousMatchingCharacterIndex = Integer.MIN_VALUE;
|
||||
|
||||
for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
|
||||
char queryChar = queryLowerCase.charAt(queryIndex);
|
||||
|
||||
boolean termCharacterMatchFound = false;
|
||||
for (; termIndex < termLowerCase.length() && !termCharacterMatchFound; termIndex++) {
|
||||
char termChar = termLowerCase.charAt(termIndex);
|
||||
|
||||
if (queryChar == termChar) {
|
||||
// simple character matches result in one point
|
||||
score++;
|
||||
|
||||
// subsequent character matches further improve
|
||||
// the score.
|
||||
if (previousMatchingCharacterIndex + 1 == termIndex) {
|
||||
score += 2;
|
||||
}
|
||||
|
||||
previousMatchingCharacterIndex = termIndex;
|
||||
|
||||
// we can leave the nested loop. Every character in the
|
||||
// query can match at most one character in the term.
|
||||
termCharacterMatchFound = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a set of matching characters between two strings.
|
||||
*
|
||||
|
|
|
@ -2018,6 +2018,37 @@ public class StringUtilsTest {
|
|||
StringUtils.getJaroWinklerDistance(null, "clear");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetFuzzyDistance() throws Exception {
|
||||
assertEquals(0, StringUtils.getFuzzyDistance("", "", Locale.ENGLISH));
|
||||
assertEquals(0, StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH));
|
||||
assertEquals(1, StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH));
|
||||
assertEquals(1, StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH));
|
||||
assertEquals(2, StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH));
|
||||
assertEquals(4, StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH));
|
||||
assertEquals(3, StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH));
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testGetFuzzyDistance_NullNullNull() throws Exception {
|
||||
StringUtils.getFuzzyDistance(null, null, null);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testGetFuzzyDistance_StringNullLoclae() throws Exception {
|
||||
StringUtils.getFuzzyDistance(" ", null, Locale.ENGLISH);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testGetFuzzyDistance_NullStringLocale() throws Exception {
|
||||
StringUtils.getFuzzyDistance(null, "clear", Locale.ENGLISH);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testGetFuzzyDistance_StringStringNull() throws Exception {
|
||||
StringUtils.getFuzzyDistance(" ", "clear", null);
|
||||
}
|
||||
|
||||
/**
|
||||
* A sanity check for {@link StringUtils#EMPTY}.
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue