LANG-1020: Improve performance of StringUtils.normalizeSpace. Thanks to Libor Ondrusek. This closes #27 from GitHub.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1620317 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
092d6da566
commit
bc8e23808b
|
@ -22,6 +22,7 @@
|
||||||
<body>
|
<body>
|
||||||
|
|
||||||
<release version="3.4" date="tba" description="tba">
|
<release version="3.4" date="tba" description="tba">
|
||||||
|
<action issue="LANG-1020" type="update" dev="britter" due-to="Libor Ondrusek">Improve performance of normalize space</action>
|
||||||
<action issue="LANG-1033" type="add" dev="ggregory">Add StringUtils.countMatches(CharSequence, char)</action>
|
<action issue="LANG-1033" type="add" dev="ggregory">Add StringUtils.countMatches(CharSequence, char)</action>
|
||||||
<action issue="LANG-1027" type="update" dev="rmannibucau">org.apache.commons.lang3.SystemUtils#isJavaVersionAtLeast should return true by default</action>
|
<action issue="LANG-1027" type="update" dev="rmannibucau">org.apache.commons.lang3.SystemUtils#isJavaVersionAtLeast should return true by default</action>
|
||||||
<action issue="LANG-1021" type="add" dev="britter" due-to="Alexander Müller">Provide methods to retrieve all fields/methods annotated with a specific type</action>
|
<action issue="LANG-1021" type="add" dev="britter" due-to="Alexander Müller">Provide methods to retrieve all fields/methods annotated with a specific type</action>
|
||||||
|
|
|
@ -172,16 +172,6 @@ public class StringUtils {
|
||||||
*/
|
*/
|
||||||
private static final int PAD_LIMIT = 8192;
|
private static final int PAD_LIMIT = 8192;
|
||||||
|
|
||||||
/**
|
|
||||||
* A regex pattern for recognizing blocks of whitespace characters.
|
|
||||||
* The apparent convolutedness of the pattern serves the purpose of
|
|
||||||
* ignoring "blocks" consisting of only a single space: the pattern
|
|
||||||
* is used only to normalize whitespace, condensing "blocks" down to a
|
|
||||||
* single space, thus matching the same would likely cause a great
|
|
||||||
* many noop replacements.
|
|
||||||
*/
|
|
||||||
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("(?: |\\u00A0|\\s|[\\s&&[^ ]])\\s*");
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>{@code StringUtils} instances should NOT be constructed in
|
* <p>{@code StringUtils} instances should NOT be constructed in
|
||||||
* standard programming. Instead, the class should be used as
|
* standard programming. Instead, the class should be used as
|
||||||
|
@ -7477,10 +7467,34 @@ public class StringUtils {
|
||||||
* @since 3.0
|
* @since 3.0
|
||||||
*/
|
*/
|
||||||
public static String normalizeSpace(final String str) {
|
public static String normalizeSpace(final String str) {
|
||||||
if (str == null) {
|
// LANG-1020: Performance improved significantly by normalizing manually instead of using a regex
|
||||||
return null;
|
// See https://github.com/librucha/commons-lang-normalizespaces-benchmark for the performance test
|
||||||
|
if (isEmpty(str)) {
|
||||||
|
return str;
|
||||||
}
|
}
|
||||||
return WHITESPACE_PATTERN.matcher(trim(str)).replaceAll(SPACE);
|
final int size = str.length();
|
||||||
|
final char[] newChars = new char[size];
|
||||||
|
int count = 0;
|
||||||
|
int whitespacesCount = 0;
|
||||||
|
boolean startWhitespaces = true;
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
char actualChar = str.charAt(i);
|
||||||
|
boolean isWhitespace = Character.isWhitespace(actualChar);
|
||||||
|
if (!isWhitespace) {
|
||||||
|
startWhitespaces = false;
|
||||||
|
newChars[count++] = (actualChar == 160 ? 32 : actualChar);
|
||||||
|
whitespacesCount = 0;
|
||||||
|
} else {
|
||||||
|
if (whitespacesCount == 0 && !startWhitespaces) {
|
||||||
|
newChars[count++] = SPACE.charAt(0);
|
||||||
|
}
|
||||||
|
whitespacesCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (startWhitespaces) {
|
||||||
|
return EMPTY;
|
||||||
|
}
|
||||||
|
return new String(newChars, 0, count - (whitespacesCount > 0 ? 1 : 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue