LANG-1020: Improve performance of normalize space. Thanks to Libor Ondrusek. This closes #27 from GitHub.

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@1620317 13f79535-47bb-0310-9956-ffa450edef68
2025-02-08 02:58:33 +00:00 · 2014-08-25 13:04:54 +00:00 · 2014-08-25 13:04:54 +00:00 · bc8e23808b
commit bc8e23808b
parent 092d6da566
2 changed files with 28 additions and 13 deletions
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,7 @@
  <body>

  <release version="3.4" date="tba" description="tba">
+    <action issue="LANG-1020" type="update" dev="britter" due-to="Libor Ondrusek">Improve performance of normalize space</action>
    <action issue="LANG-1033" type="add" dev="ggregory">Add StringUtils.countMatches(CharSequence, char)</action>
    <action issue="LANG-1027" type="update" dev="rmannibucau">org.apache.commons.lang3.SystemUtils#isJavaVersionAtLeast should return true by default</action>
    <action issue="LANG-1021" type="add" dev="britter" due-to="Alexander Müller">Provide methods to retrieve all fields/methods annotated with a specific type</action>
--- a/src/main/java/org/apache/commons/lang3/StringUtils.java
+++ b/src/main/java/org/apache/commons/lang3/StringUtils.java
@@ -172,16 +172,6 @@ public class StringUtils {
     */
    private static final int PAD_LIMIT = 8192;

-    /**
-     * A regex pattern for recognizing blocks of whitespace characters.
-     * The apparent convolutedness of the pattern serves the purpose of
-     * ignoring "blocks" consisting of only a single space:  the pattern
-     * is used only to normalize whitespace, condensing "blocks" down to a
-     * single space, thus matching the same would likely cause a great
-     * many noop replacements.
-     */
-    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("(?: |\\u00A0|\\s|[\\s&&[^ ]])\\s*");
-
    /**
     * <p>{@code StringUtils} instances should NOT be constructed in
     * standard programming. Instead, the class should be used as
@@ -7477,10 +7467,34 @@ private static boolean endsWith(final CharSequence str, final CharSequence suffi
     * @since 3.0
     */
    public static String normalizeSpace(final String str) {
-        if (str == null) {
-            return null;
+        // LANG-1020: Improved performance significantly normalizing manually instead of using regex
+        // See https://github.com/librucha/commons-lang-normalizespaces-benchmark for performance test
+        if (isEmpty(str)) {
+            return str;
        }
-        return WHITESPACE_PATTERN.matcher(trim(str)).replaceAll(SPACE);
+        final int size = str.length();
+        final char[] newChars = new char[size];
+        int count = 0;
+        int whitespacesCount = 0;
+        boolean startWhitespaces = true;
+        for (int i = 0; i < size; i++) {
+            char actualChar = str.charAt(i);
+            boolean isWhitespace = Character.isWhitespace(actualChar);
+            if (!isWhitespace) {
+                startWhitespaces = false;
+                newChars[count++] = (actualChar == 160 ? 32 : actualChar);
+                whitespacesCount = 0;
+            } else {
+                if (whitespacesCount == 0 && !startWhitespaces) {
+                    newChars[count++] = SPACE.charAt(0);
+                }
+                whitespacesCount++;
+            }
+        }
+        if (startWhitespaces) {
+            return EMPTY;
+        }
+        return new String(newChars, 0, count - (whitespacesCount > 0 ? 1 : 0));
    }

    /**