From 823f2603a4de316b1ae4d5d6d4ed0175f6edc985 Mon Sep 17 00:00:00 2001 From: "James W. Carman" Date: Fri, 6 Aug 2010 01:08:57 +0000 Subject: [PATCH] LANG-640: Add normalizeSpace to StringUtils git-svn-id: https://svn.apache.org/repos/asf/commons/proper/lang/trunk@982844 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/commons/lang3/StringUtils.java | 54 +++++++++++++++++++ .../apache/commons/lang3/StringUtilsTest.java | 13 +++++ 2 files changed, 67 insertions(+) diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index 269bb9f92..50b31d891 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -20,6 +20,7 @@ import java.util.Iterator; import java.util.List; import java.util.Locale; +import java.util.regex.Pattern; /** *

Operations on {@link java.lang.String} that are @@ -156,6 +157,11 @@ public class StringUtils { */ private static final int PAD_LIMIT = 8192; + /** + * A regex pattern for recognizing blocks of whitespace characters. + */ + private static final Pattern WHITESPACE_BLOCK = Pattern.compile("\\s+"); + /** *

StringUtils instances should NOT be constructed in * standard programming. Instead, the class should be used as @@ -6257,4 +6263,52 @@ private static boolean endsWith(String str, String suffix, boolean ignoreCase) { int strOffset = str.length() - suffix.length(); return str.regionMatches(ignoreCase, strOffset, suffix, 0, suffix.length()); } + + /** + *

+ * Similar to the XPath normalize-space function, see + * http://www.w3.org/TR/xpath/#function-normalize-space + *

+ *

+ * The function returns the argument string with whitespace normalized by using + * {@link #trim(String)} to remove leading and trailing whitespace + * and then replacing sequences of whitespace characters by a single space. + *

+ * In XML, whitespace characters are those allowed by the S production, which is S ::= (#x20 | #x9 | #xD | #xA)+ + *

+ * Java's regexp pattern \s defines whitespace as [ \t\n\x0B\f\r] + *

+ * For reference: + *

+ *

+ *

+ * The difference is that Java's whitespace includes vertical tab and form feed, which this function will also + * normalize. Additionally, {@link #trim(String)} removes control characters (char <= 32) from both + * ends of this String. + *

+ * + * @see Pattern + * @see #trim(String) + * @see http://www.w3.org/TR/xpath/#function-normalize-space + * @param str the source String to normalize whitespaces from, may be null + * @return the modified string with whitespace normalized, null if null String input + * + * @since 3.0 + */ + public static String normalizeSpace(String str) { + if(str == null) { + return null; + } + return WHITESPACE_BLOCK.matcher(trim(str)).replaceAll(" "); + } } diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index d95b06066..6f044d0a6 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -1855,4 +1855,17 @@ public void testStartsWithAny() { assertFalse(StringUtils.startsWithAny("abcxyz", new String[] {null, "xyz", "abcd"})); } + public void testNormalizeSpace() { + assertEquals(null, StringUtils.normalizeSpace(null)); + assertEquals("", StringUtils.normalizeSpace("")); + assertEquals("", StringUtils.normalizeSpace(" ")); + assertEquals("", StringUtils.normalizeSpace("\t")); + assertEquals("", StringUtils.normalizeSpace("\n")); + assertEquals("", StringUtils.normalizeSpace("\u000B")); + assertEquals("", StringUtils.normalizeSpace("\f")); + assertEquals("", StringUtils.normalizeSpace("\r")); + assertEquals("a", StringUtils.normalizeSpace(" a ")); + assertEquals("a b c", StringUtils.normalizeSpace(" a b c ")); + assertEquals("a b c", StringUtils.normalizeSpace("a\t\f\r b\u000B c\n")); + } }