LANG-1020: Improve performance of normalize space. Thanks to Libor Ondrusek. This closes #27 from github.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1620317 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
092d6da566
commit
bc8e23808b
|
@ -22,6 +22,7 @@
|
|||
<body>
|
||||
|
||||
<release version="3.4" date="tba" description="tba">
|
||||
<action issue="LANG-1020" type="update" dev="britter" due-to="Libor Ondrusek">Improve performance of normalize space</action>
|
||||
<action issue="LANG-1033" type="add" dev="ggregory">Add StringUtils.countMatches(CharSequence, char)</action>
|
||||
<action issue="LANG-1027" type="update" dev="rmannibucau">org.apache.commons.lang3.SystemUtils#isJavaVersionAtLeast should return true by default</action>
|
||||
<action issue="LANG-1021" type="add" dev="britter" due-to="Alexander Müller">Provide methods to retrieve all fields/methods annotated with a specific type</action>
|
||||
|
|
|
@ -172,16 +172,6 @@ public class StringUtils {
|
|||
*/
|
||||
private static final int PAD_LIMIT = 8192;
|
||||
|
||||
/**
|
||||
* A regex pattern for recognizing blocks of whitespace characters.
|
||||
* The apparent convolutedness of the pattern serves the purpose of
|
||||
* ignoring "blocks" consisting of only a single space: the pattern
|
||||
* is used only to normalize whitespace, condensing "blocks" down to a
|
||||
* single space, thus matching the same would likely cause a great
|
||||
* many noop replacements.
|
||||
*/
|
||||
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("(?: |\\u00A0|\\s|[\\s&&[^ ]])\\s*");
|
||||
|
||||
/**
|
||||
* <p>{@code StringUtils} instances should NOT be constructed in
|
||||
* standard programming. Instead, the class should be used as
|
||||
|
@ -7477,10 +7467,34 @@ public class StringUtils {
|
|||
* @since 3.0
|
||||
*/
|
||||
public static String normalizeSpace(final String str) {
|
||||
if (str == null) {
|
||||
return null;
|
||||
// LANG-1020: Performance is significantly improved by normalizing manually instead of using a regex
|
||||
// See https://github.com/librucha/commons-lang-normalizespaces-benchmark for performance test
|
||||
if (isEmpty(str)) {
|
||||
return str;
|
||||
}
|
||||
return WHITESPACE_PATTERN.matcher(trim(str)).replaceAll(SPACE);
|
||||
final int size = str.length();
|
||||
final char[] newChars = new char[size];
|
||||
int count = 0;
|
||||
int whitespacesCount = 0;
|
||||
boolean startWhitespaces = true;
|
||||
for (int i = 0; i < size; i++) {
|
||||
char actualChar = str.charAt(i);
|
||||
boolean isWhitespace = Character.isWhitespace(actualChar);
|
||||
if (!isWhitespace) {
|
||||
startWhitespaces = false;
|
||||
newChars[count++] = (actualChar == 160 ? 32 : actualChar);
|
||||
whitespacesCount = 0;
|
||||
} else {
|
||||
if (whitespacesCount == 0 && !startWhitespaces) {
|
||||
newChars[count++] = SPACE.charAt(0);
|
||||
}
|
||||
whitespacesCount++;
|
||||
}
|
||||
}
|
||||
if (startWhitespaces) {
|
||||
return EMPTY;
|
||||
}
|
||||
return new String(newChars, 0, count - (whitespacesCount > 0 ? 1 : 0));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue