LANG-1020: Improve performance of normalize space. Thanks to Libor Ondrusek. This closes #27 from github.

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1620317 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Benedikt Ritter 2014-08-25 13:04:54 +00:00
parent 092d6da566
commit bc8e23808b
2 changed files with 28 additions and 13 deletions

View File

@ -22,6 +22,7 @@
<body> <body>
<release version="3.4" date="tba" description="tba"> <release version="3.4" date="tba" description="tba">
<action issue="LANG-1020" type="update" dev="britter" due-to="Libor Ondrusek">Improve performance of normalize space</action>
<action issue="LANG-1033" type="add" dev="ggregory">Add StringUtils.countMatches(CharSequence, char)</action> <action issue="LANG-1033" type="add" dev="ggregory">Add StringUtils.countMatches(CharSequence, char)</action>
<action issue="LANG-1027" type="update" dev="rmannibucau">org.apache.commons.lang3.SystemUtils#isJavaVersionAtLeast should return true by default</action> <action issue="LANG-1027" type="update" dev="rmannibucau">org.apache.commons.lang3.SystemUtils#isJavaVersionAtLeast should return true by default</action>
<action issue="LANG-1021" type="add" dev="britter" due-to="Alexander Müller">Provide methods to retrieve all fields/methods annotated with a specific type</action> <action issue="LANG-1021" type="add" dev="britter" due-to="Alexander Müller">Provide methods to retrieve all fields/methods annotated with a specific type</action>

View File

@ -172,16 +172,6 @@ public class StringUtils {
*/ */
private static final int PAD_LIMIT = 8192; private static final int PAD_LIMIT = 8192;
/**
* A regex pattern for recognizing blocks of whitespace characters.
* The apparent convolutedness of the pattern serves the purpose of
* ignoring "blocks" consisting of only a single space: the pattern
* is used only to normalize whitespace, condensing "blocks" down to a
* single space, thus matching the same would likely cause a great
* many noop replacements.
*/
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("(?: |\\u00A0|\\s|[\\s&&[^ ]])\\s*");
/** /**
* <p>{@code StringUtils} instances should NOT be constructed in * <p>{@code StringUtils} instances should NOT be constructed in
* standard programming. Instead, the class should be used as * standard programming. Instead, the class should be used as
@ -7477,10 +7467,34 @@ public class StringUtils {
* @since 3.0 * @since 3.0
*/ */
public static String normalizeSpace(final String str) { public static String normalizeSpace(final String str) {
if (str == null) { // LANG-1020: Improved performance significantly normalizing manually instead of using regex
return null; // See https://github.com/librucha/commons-lang-normalizespaces-benchmark for performance test
if (isEmpty(str)) {
return str;
} }
return WHITESPACE_PATTERN.matcher(trim(str)).replaceAll(SPACE); final int size = str.length();
final char[] newChars = new char[size];
int count = 0;
int whitespacesCount = 0;
boolean startWhitespaces = true;
for (int i = 0; i < size; i++) {
char actualChar = str.charAt(i);
boolean isWhitespace = Character.isWhitespace(actualChar);
if (!isWhitespace) {
startWhitespaces = false;
newChars[count++] = (actualChar == 160 ? 32 : actualChar);
whitespacesCount = 0;
} else {
if (whitespacesCount == 0 && !startWhitespaces) {
newChars[count++] = SPACE.charAt(0);
}
whitespacesCount++;
}
}
if (startWhitespaces) {
return EMPTY;
}
return new String(newChars, 0, count - (whitespacesCount > 0 ? 1 : 0));
} }
/** /**