Fix split to use whitespace, remove StringTokenizer

Performance tune some methods


git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137452 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stephen Colebourne 2003-07-19 18:09:33 +00:00
parent 4bf65d4732
commit 6c009fd782
1 changed files with 168 additions and 102 deletions

View File

@ -56,7 +56,6 @@
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.StringTokenizer;
import org.apache.commons.lang.math.NumberUtils; import org.apache.commons.lang.math.NumberUtils;
@ -99,10 +98,25 @@
* @author Arun Mammen Thomas * @author Arun Mammen Thomas
* @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a> * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
* @since 1.0 * @since 1.0
* @version $Id: StringUtils.java,v 1.64 2003/07/19 00:22:50 scolebourne Exp $ * @version $Id: StringUtils.java,v 1.65 2003/07/19 18:09:33 scolebourne Exp $
*/ */
public class StringUtils { public class StringUtils {
// Performance testing notes (JDK 1.4, Jul03, scolebourne)
// Whitespace:
// Character.isWhitespace() is faster than WHITESPACE.indexOf()
// where WHITESPACE is a string of all whitespace characters
//
// Character access:
// String.charAt(n) versus toCharArray(), then array[n]
// String.charAt(n) is about 15% worse for a 10K string
// They are about equal for a length 50 string
// String.charAt(n) is about 4 times better for a length 3 string
// String.charAt(n) is best bet overall
//
// Append:
// String.concat about twice as fast as StringBuffer.append
// (not sure who tested this)
/** /**
* <p>The maximum size to which the padding constant(s) can expand.</p> * <p>The maximum size to which the padding constant(s) can expand.</p>
*/ */
@ -121,7 +135,6 @@ public class StringUtils {
* <p>Used for efficient space padding. The length of each String expands as needed.</p> * <p>Used for efficient space padding. The length of each String expands as needed.</p>
*/ */
private final static String[] padding = new String[Character.MAX_VALUE]; private final static String[] padding = new String[Character.MAX_VALUE];
// String.concat about twice as fast as StringBuffer.append
/** /**
* <p><code>StringUtils</code> instances should NOT be constructed in * <p><code>StringUtils</code> instances should NOT be constructed in
@ -181,7 +194,7 @@ public static String clean(String str) {
* *
* @see java.lang.String#trim() * @see java.lang.String#trim()
* @param str the String to be trimmed, may be null * @param str the String to be trimmed, may be null
* @return the trimmed text, <code>null</code> if null String input * @return the trimmed string, <code>null</code> if null String input
*/ */
public static String trim(String str) { public static String trim(String str) {
return (str == null ? null : str.trim()); return (str == null ? null : str.trim());
@ -206,7 +219,7 @@ public static String trim(String str) {
* @see java.lang.String#trim() * @see java.lang.String#trim()
* @param str the String to be trimmed, may be null * @param str the String to be trimmed, may be null
* @return the trimmed String, * @return the trimmed String,
* <code>null</code> if a whitespace, empty or null String input * <code>null</code> if only chars &lt;= 32, empty or null String input
*/ */
public static String trimToNull(String str) { public static String trimToNull(String str) {
String ts = trim(str); String ts = trim(str);
@ -231,7 +244,7 @@ public static String trimToNull(String str) {
* *
* @see java.lang.String#trim() * @see java.lang.String#trim()
* @param str the String to be trimmed, may be null * @param str the String to be trimmed, may be null
* @return the trimmed String, or an empty String if null input * @return the trimmed String, or an empty String if <code>null</code> input
*/ */
public static String trimToEmpty(String str) { public static String trimToEmpty(String str) {
return (str == null ? "" : str.trim()); return (str == null ? "" : str.trim());
@ -287,8 +300,8 @@ public static String deleteWhitespace(String str) {
if (str == null) { if (str == null) {
return null; return null;
} }
StringBuffer buffer = new StringBuffer();
int sz = str.length(); int sz = str.length();
StringBuffer buffer = new StringBuffer(sz);
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
if (!Character.isWhitespace(str.charAt(i))) { if (!Character.isWhitespace(str.charAt(i))) {
buffer.append(str.charAt(i)); buffer.append(str.charAt(i));
@ -992,13 +1005,16 @@ public static String mid(String str, int pos, int len) {
* separator. * separator.
* Whitespace is defined by {@link Character#isWhitespace(char)}.</p> * Whitespace is defined by {@link Character#isWhitespace(char)}.</p>
* *
* <p>The separator is not included in the returned String array.</p> * <p>The separator is not included in the returned String array.
* Adjacent separators are treated as one separator.</p>
* *
* <p>A <code>null</code> input String returns <code>null</code>.</p> * <p>A <code>null</code> input String returns <code>null</code>.</p>
* *
* <pre> * <pre>
* StringUtils.split(null) = null * StringUtils.split(null) = null
* StringUtils.split("abc def") = ["abc", "def"] * StringUtils.split("") = []
* StringUtils.split("abc def") = ["abc", "def"]
* StringUtils.split("abc  def") = ["abc", "def"]
* </pre> * </pre>
* *
* @param str the String to parse, may be null * @param str the String to parse, may be null
@ -1009,17 +1025,19 @@ public static String[] split(String str) {
} }
/** /**
* <p>Splits the provided text into an array, using the specified separator.</p> * <p>Splits the provided text into an array, separator specified.
* This is an alternative to using StringTokenizer.</p>
* *
* <p>The separator is not included in the returned String array. * <p>The separator is not included in the returned String array.
* Adjacent separators will cause an empty String to be returned ("").</p> * Adjacent separators are treated as one separator.</p>
* *
* <p>A <code>null</code> input String returns <code>null</code>.</p> * <p>A <code>null</code> input String returns <code>null</code>.</p>
* *
* <pre> * <pre>
* StringUtils.split(null, '.') = null * StringUtils.split(null, '.') = null
* StringUtils.split("a.b.c", '.') = ["a", "b", "c"] * StringUtils.split("", '.') = []
* StringUtils.split("a..b.c", '.') = ["a", "", "b", "c"] * StringUtils.split("a.b.c", '.') = ["a", "b", "c"]
* StringUtils.split("a..b.c", '.') = ["a", "b", "c"]
* StringUtils.split("a:b:c", '.') = ["a:b:c"] * StringUtils.split("a:b:c", '.') = ["a:b:c"]
* </pre> * </pre>
* *
@ -1029,34 +1047,52 @@ public static String[] split(String str) {
* @return an array of parsed Strings, <code>null</code> if null String input * @return an array of parsed Strings, <code>null</code> if null String input
*/ */
public static String[] split(String str, char separatorChar) { public static String[] split(String str, char separatorChar) {
// Performance tuned for 2.0 (JDK1.4)
if (str == null) { if (str == null) {
return null; return null;
} }
char[] chars = str.toCharArray(); int len = str.length();
List list = new ArrayList(); if (len == 0) {
int start = 0; return ArrayUtils.EMPTY_STRING_ARRAY;
for (int i = 0; i < chars.length; i++) { }
if (chars[i] == separatorChar) { List list = new ArrayList();
list.add(str.substring(start, i)); int i =0, start = 0;
start = i + 1; boolean match = false;
} while (i < len) {
if (str.charAt(i) == separatorChar) {
if (match) {
list.add(str.substring(start, i));
match = false;
}
start = ++i;
continue;
}
match = true;
i++;
}
if (match) {
list.add(str.substring(start, i));
} }
list.add(str.substring(start));
return (String[]) list.toArray(new String[list.size()]); return (String[]) list.toArray(new String[list.size()]);
} }
/** /**
* <p>Splits the provided text into an array, using the specified separators.</p> * <p>Splits the provided text into an array, separators specified.
* This is an alternative to using StringTokenizer.</p>
* *
* <p>The separator is not included in the returned String array.</p> * <p>The separator is not included in the returned String array.
* Adjacent separators are treated as one separator.</p>
* *
* <p>A <code>null</code> input String returns <code>null</code>. * <p>A <code>null</code> input String returns <code>null</code>.
* A <code>null</code> separatorChars splits on whitespace.</p> * A <code>null</code> separatorChars splits on whitespace.</p>
* *
* <pre> * <pre>
* StringUtils.split(null, null) = null * StringUtils.split(null, null) = null
* StringUtils.split("", null) = []
* StringUtils.split("abc def", null) = ["abc", "def"] * StringUtils.split("abc def", null) = ["abc", "def"]
* StringUtils.split("abc def", " ") = ["abc", "def"] * StringUtils.split("abc def", " ") = ["abc", "def"]
* StringUtils.split("abc  def", " ") = ["abc", "def"]
* StringUtils.split("ab:cd:ef", ":") = ["ab", "cd", "ef"] * StringUtils.split("ab:cd:ef", ":") = ["ab", "cd", "ef"]
* </pre> * </pre>
* *
@ -1070,74 +1106,104 @@ public static String[] split(String str, String separatorChars) {
} }
/** /**
* <p>Splits the provided text into a array, based on a given separator.</p> * <p>Splits the provided text into an array, separators specified.
* This is an alternative to using StringTokenizer.</p>
* *
* <p>The separator is not included in the returned String array. The * <p>The separator is not included in the returned String array.
* maximum number of splits to perfom can be controlled. A <code>null</code> * Adjacent separators are treated as one separator.</p>
* separator will cause parsing to be on whitespace.</p>
*
* <p>This is useful for quickly splitting a String directly into
* an array of tokens, instead of an enumeration of tokens (as
* <code>StringTokenizer</code> does).</p>
* *
* <p>A <code>null</code> input String returns <code>null</code>. * <p>A <code>null</code> input String returns <code>null</code>.
* A <code>null</code> separatorChars splits on whitespace.</p> * A <code>null</code> separatorChars splits on whitespace.</p>
* *
* <pre> * <pre>
* StringUtils.split(null, null, 0) = null * StringUtils.split(null, null, 0) = null
* StringUtils.split("ab de fg", null, 0) = ["ab", "de", "fg"] * StringUtils.split("", null, 0) = []
* StringUtils.split("ab:cd:ef", ":", 0) = ["ab", "cd", "ef"] * StringUtils.split("ab de fg", null, 0) = ["ab", "de", "fg"]
* StringUtils.split("ab:cd:ef", ":", 2) = ["ab", "cd:ef"] * StringUtils.split("ab  de fg", null, 0) = ["ab", "de", "fg"]
* StringUtils.split("ab:cd:ef", ":", 0) = ["ab", "cd", "ef"]
* StringUtils.split("ab:cd:ef", ":", 2) = ["ab", "cd:ef"]
* </pre> * </pre>
* *
* @param str the String to parse, may be null * @param str the String to parse, may be null
* @param separatorChars the characters used as the delimiters, * @param separatorChars the characters used as the delimiters,
* <code>null</code> splits on whitespace * <code>null</code> splits on whitespace
* @param max the maximum number of elements to include in the * @param max the maximum number of elements to include in the
* array. A zero or negative value implies no limit. * array. A zero or negative value implies no limit
* @return an array of parsed Strings, <code>null</code> if null String input * @return an array of parsed Strings, <code>null</code> if null String input
*/ */
public static String[] split(String str, String separatorChars, int max) { public static String[] split(String str, String separatorChars, int max) {
// Performance tuned for 2.0 (JDK1.4)
// Direct code is quicker than StringTokenizer.
// Also, StringTokenizer uses isSpace() not isWhitespace()
if (str == null) { if (str == null) {
return null; return null;
} }
StringTokenizer tok = null; int len = str.length();
if (len == 0) {
return ArrayUtils.EMPTY_STRING_ARRAY;
}
List list = new ArrayList();
int sizePlus1 = 1;
int i =0, start = 0;
boolean match = false;
if (separatorChars == null) { if (separatorChars == null) {
// Null separator means we're using StringTokenizer's default // Null separator means use whitespace
// delimiter, which comprises all whitespace characters. while (i < len) {
if (Character.isWhitespace(str.charAt(i))) {
// TODO: StringTokenizer uses isSpace() not isWhitespace() if (match) {
tok = new StringTokenizer(str); if (sizePlus1++ == max) {
} else { i = len;
tok = new StringTokenizer(str, separatorChars); }
} list.add(str.substring(start, i));
match = false;
int listSize = tok.countTokens(); }
if (max > 0 && listSize > max) { start = ++i;
listSize = max; continue;
} }
match = true;
String[] list = new String[listSize]; i++;
int i = 0; }
int lastTokenBegin = 0; } else if (separatorChars.length() == 1) {
int lastTokenEnd = 0; // Optimise 1 character case
while (tok.hasMoreTokens()) { char sep = separatorChars.charAt(0);
if (max > 0 && i == listSize - 1) { while (i < len) {
// In the situation where we hit the max yet have if (str.charAt(i) == sep) {
// tokens left over in our input, the last list if (match) {
// element gets all remaining text. if (sizePlus1++ == max) {
String endToken = tok.nextToken(); i = len;
lastTokenBegin = str.indexOf(endToken, lastTokenEnd); }
list[i] = str.substring(lastTokenBegin); list.add(str.substring(start, i));
break; match = false;
} else { }
list[i] = tok.nextToken(); start = ++i;
lastTokenBegin = str.indexOf(list[i], lastTokenEnd); continue;
lastTokenEnd = lastTokenBegin + list[i].length(); }
match = true;
i++;
}
} else {
// standard case
while (i < len) {
if (separatorChars.indexOf(str.charAt(i)) >= 0) {
if (match) {
if (sizePlus1++ == max) {
i = len;
}
list.add(str.substring(start, i));
match = false;
}
start = ++i;
continue;
}
match = true;
i++;
} }
i++;
} }
return list; if (match) {
list.add(str.substring(start, i));
}
return (String[]) list.toArray(new String[list.size()]);
} }
// Joining // Joining
@ -1792,41 +1858,47 @@ public static String escape(String str) {
* <code>null</code> if null String input * <code>null</code> if null String input
*/ */
public static String repeat(String str, int repeat) { public static String repeat(String str, int repeat) {
// Performance tuned for 2.0 (JDK1.4)
if (str == null) { if (str == null) {
return null; return null;
} }
if (repeat <= 0) { if (repeat <= 0) {
return ""; return "";
} }
int inputLength = str.length(); int inputLength;
if (repeat == 1 || (inputLength = str.length()) == 0) {
return str;
}
if (inputLength == 1 && repeat <= PAD_LIMIT) { if (inputLength == 1 && repeat <= PAD_LIMIT) {
return padding(repeat, str.charAt(0)); return padding(repeat, str.charAt(0));
} }
char[] input = str.toCharArray(); int outputLength = inputLength * repeat;
char[] output = new char[repeat * inputLength];
switch (inputLength) { switch (inputLength) {
case 1: case 1:
char ch = input[0]; char ch = str.charAt(0);
char[] output1 = new char[outputLength];
for (int i = repeat - 1; i >= 0; i--) { for (int i = repeat - 1; i >= 0; i--) {
output[i] = ch; output1[i] = ch;
} }
break; return new String(output1);
case 2: case 2:
char ch0 = input[0]; char ch0 = str.charAt(0);
char ch1 = input[1]; char ch1 = str.charAt(1);
char[] output2 = new char[outputLength];
for (int i = repeat * 2 - 2; i >= 0; i--,i--) { for (int i = repeat * 2 - 2; i >= 0; i--,i--) {
output[i] = ch0; output2[i] = ch0;
output[i + 1] = ch1; output2[i + 1] = ch1;
} }
break; return new String(output2);
default: default:
for (int i = repeat - 1; i >= 0; i--) { StringBuffer buf = new StringBuffer(outputLength);
System.arraycopy(input, 0, output, i * inputLength, inputLength); for (int i = 0; i < repeat; i++) {
} buf.append(str);
break; }
return buf.toString();
} }
return new String(output);
} }
/** /**
@ -2210,8 +2282,7 @@ public static String center(String str, int size, String padStr) {
* This is similar to {@link String#trim()} but instead removes whitespace. * This is similar to {@link String#trim()} but instead removes whitespace.
* Whitespace is defined by {@link Character#isWhitespace(char)}.</p> * Whitespace is defined by {@link Character#isWhitespace(char)}.</p>
* *
* <p>If the input String is <code>null</code>, <code>null</code> * <p>A <code>null</code> input String returns <code>null</code>.</p>
* is returned.</p>
* *
* <pre> * <pre>
* StringUtils.strip(null) = null * StringUtils.strip(null) = null
@ -2234,8 +2305,7 @@ public static String strip(String str) {
* This is similar to {@link String#trim()} but allows the characters * This is similar to {@link String#trim()} but allows the characters
* to be stripped to be controlled.</p> * to be stripped to be controlled.</p>
* *
* <p>If the input String is <code>null</code>, <code>null</code> * <p>A <code>null</code> input String returns <code>null</code>.</p>
* is returned.</p>
* *
* <p>If the stripChars String is <code>null</code>, whitespace is * <p>If the stripChars String is <code>null</code>, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}. * stripped as defined by {@link Character#isWhitespace(char)}.
@ -2262,8 +2332,7 @@ public static String strip(String str, String stripChars) {
/** /**
* <p>Strips any of a set of characters from the start of a String.</p> * <p>Strips any of a set of characters from the start of a String.</p>
* *
* <p>If the input String is <code>null</code>, <code>null</code> * <p>A <code>null</code> input String returns <code>null</code>.</p>
* is returned.</p>
* *
* <p>If the stripChars String is <code>null</code>, whitespace is * <p>If the stripChars String is <code>null</code>, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}.</p> * stripped as defined by {@link Character#isWhitespace(char)}.</p>
@ -2302,8 +2371,7 @@ public static String stripStart(String str, String stripChars) {
/** /**
* <p>Strips any of a set of characters from the end of a String.</p> * <p>Strips any of a set of characters from the end of a String.</p>
* *
* <p>If the input String is <code>null</code>, <code>null</code> * <p>A <code>null</code> input String returns <code>null</code>.</p>
* is returned.</p>
* *
* <p>If the stripChars String is <code>null</code>, whitespace is * <p>If the stripChars String is <code>null</code>, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}.</p> * stripped as defined by {@link Character#isWhitespace(char)}.</p>
@ -2570,9 +2638,8 @@ public static String capitaliseAllWords(String str) {
} }
StringBuffer buffer = new StringBuffer(strLen); StringBuffer buffer = new StringBuffer(strLen);
boolean whitespace = true; boolean whitespace = true;
char[] strChars = str.toCharArray();
for (int i = 0; i < strLen; i++) { for (int i = 0; i < strLen; i++) {
char ch = strChars[i]; char ch = str.charAt(i);
if (Character.isWhitespace(ch)) { if (Character.isWhitespace(ch)) {
buffer.append(ch); buffer.append(ch);
whitespace = true; whitespace = true;
@ -2609,9 +2676,8 @@ public static String uncapitaliseAllWords(String str) {
} }
StringBuffer buffer = new StringBuffer(strLen); StringBuffer buffer = new StringBuffer(strLen);
boolean whitespace = true; boolean whitespace = true;
char[] strChars = str.toCharArray();
for (int i = 0; i < strLen; i++) { for (int i = 0; i < strLen; i++) {
char ch = strChars[i]; char ch = str.charAt(i);
if (Character.isWhitespace(ch)) { if (Character.isWhitespace(ch)) {
buffer.append(ch); buffer.append(ch);
whitespace = true; whitespace = true;