Fix split to use whitespace, remove StringTokenizer

Performance tune some methods


git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137452 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stephen Colebourne 2003-07-19 18:09:33 +00:00
parent 4bf65d4732
commit 6c009fd782
1 changed files with 168 additions and 102 deletions

View File

@ -56,7 +56,6 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.commons.lang.math.NumberUtils;
@ -99,10 +98,25 @@
* @author Arun Mammen Thomas
* @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
* @since 1.0
* @version $Id: StringUtils.java,v 1.64 2003/07/19 00:22:50 scolebourne Exp $
* @version $Id: StringUtils.java,v 1.65 2003/07/19 18:09:33 scolebourne Exp $
*/
public class StringUtils {
// Performance testing notes (JDK 1.4, Jul03, scolebourne)
// Whitespace:
// Character.isWhitespace() is faster than WHITESPACE.indexOf()
// where WHITESPACE is a string of all whitespace characters
//
// Character access:
// String.charAt(n) versus toCharArray(), then array[n]
// String.charAt(n) is about 15% worse for a 10K string
// They are about equal for a length 50 string
// String.charAt(n) is about 4 times better for a length 3 string
// String.charAt(n) is best bet overall
//
// Append:
// String.concat about twice as fast as StringBuffer.append
// (not sure who tested this)
/**
* <p>The maximum size to which the padding constant(s) can expand.</p>
*/
@ -121,7 +135,6 @@ public class StringUtils {
* <p>Used for efficient space padding. The length of each String expands as needed.</p>
*/
private final static String[] padding = new String[Character.MAX_VALUE];
// String.concat about twice as fast as StringBuffer.append
/**
* <p><code>StringUtils</code> instances should NOT be constructed in
@ -181,7 +194,7 @@ public static String clean(String str) {
*
* @see java.lang.String#trim()
* @param str the String to be trimmed, may be null
* @return the trimmed text, <code>null</code> if null String input
* @return the trimmed string, <code>null</code> if null String input
*/
public static String trim(String str) {
return (str == null ? null : str.trim());
@ -206,7 +219,7 @@ public static String trim(String str) {
* @see java.lang.String#trim()
* @param str the String to be trimmed, may be null
* @return the trimmed String,
* <code>null</code> if a whitespace, empty or null String input
* <code>null</code> if only chars &lt;= 32, empty or null String input
*/
public static String trimToNull(String str) {
String ts = trim(str);
@ -231,7 +244,7 @@ public static String trimToNull(String str) {
*
* @see java.lang.String#trim()
* @param str the String to be trimmed, may be null
* @return the trimmed String, or an empty String if null input
* @return the trimmed String, or an empty String if <code>null</code> input
*/
public static String trimToEmpty(String str) {
return (str == null ? "" : str.trim());
@ -287,8 +300,8 @@ public static String deleteWhitespace(String str) {
if (str == null) {
return null;
}
StringBuffer buffer = new StringBuffer();
int sz = str.length();
StringBuffer buffer = new StringBuffer(sz);
for (int i = 0; i < sz; i++) {
if (!Character.isWhitespace(str.charAt(i))) {
buffer.append(str.charAt(i));
@ -992,13 +1005,16 @@ public static String mid(String str, int pos, int len) {
* separator.
* Whitespace is defined by {@link Character#isWhitespace(char)}.</p>
*
* <p>The separator is not included in the returned String array.</p>
* <p>The separator is not included in the returned String array.
* Adjacent separators are treated as one separator.</p>
*
* <p>A <code>null</code> input String returns <code>null</code>.</p>
*
* <pre>
* StringUtils.split(null) = null
* StringUtils.split("abc def") = ["abc", "def"]
* StringUtils.split(null) = null
* StringUtils.split("") = []
* StringUtils.split("abc def") = ["abc", "def"]
* StringUtils.split("abc  def") = ["abc", "def"]
* </pre>
*
* @param str the String to parse, may be null
@ -1009,17 +1025,19 @@ public static String[] split(String str) {
}
/**
* <p>Splits the provided text into an array, using the specified separator.</p>
* <p>Splits the provided text into an array, separator specified.
* This is an alternative to using StringTokenizer.</p>
*
* <p>The separator is not included in the returned String array.
* Adjacent separators will cause an empty String to be returned ("").</p>
* Adjacent separators are treated as one separator.</p>
*
* <p>A <code>null</code> input String returns <code>null</code>.</p>
*
* <pre>
* StringUtils.split(null, '.') = null
* StringUtils.split("a.b.c", '.') = ["a", "b", "c"]
* StringUtils.split("a..b.c", '.') = ["a", "", "b", "c"]
* StringUtils.split(null, '.') = null
* StringUtils.split("", '.') = []
* StringUtils.split("a.b.c", '.') = ["a", "b", "c"]
* StringUtils.split("a..b.c", '.') = ["a", "b", "c"]
* StringUtils.split("a:b:c", '.') = ["a:b:c"]
* </pre>
*
@ -1029,34 +1047,52 @@ public static String[] split(String str) {
* @return an array of parsed Strings, <code>null</code> if null String input
*/
public static String[] split(String str, char separatorChar) {
// Performance tuned for 2.0 (JDK1.4)
if (str == null) {
return null;
}
char[] chars = str.toCharArray();
List list = new ArrayList();
int start = 0;
for (int i = 0; i < chars.length; i++) {
if (chars[i] == separatorChar) {
list.add(str.substring(start, i));
start = i + 1;
}
int len = str.length();
if (len == 0) {
return ArrayUtils.EMPTY_STRING_ARRAY;
}
List list = new ArrayList();
int i =0, start = 0;
boolean match = false;
while (i < len) {
if (str.charAt(i) == separatorChar) {
if (match) {
list.add(str.substring(start, i));
match = false;
}
start = ++i;
continue;
}
match = true;
i++;
}
if (match) {
list.add(str.substring(start, i));
}
list.add(str.substring(start));
return (String[]) list.toArray(new String[list.size()]);
}
/**
* <p>Splits the provided text into an array, using the specified separators.</p>
* <p>Splits the provided text into an array, separators specified.
* This is an alternative to using StringTokenizer.</p>
*
* <p>The separator is not included in the returned String array.</p>
* <p>The separator is not included in the returned String array.
* Adjacent separators are treated as one separator.</p>
*
* <p>A <code>null</code> input String returns <code>null</code>.
* A <code>null</code> separatorChars splits on whitespace.</p>
*
* <pre>
* StringUtils.split(null, null) = null
* StringUtils.split("", null) = []
* StringUtils.split("abc def", null) = ["abc", "def"]
* StringUtils.split("abc def", " ") = ["abc", "def"]
* StringUtils.split("abc  def", " ") = ["abc", "def"]
* StringUtils.split("ab:cd:ef", ":") = ["ab", "cd", "ef"]
* </pre>
*
@ -1070,74 +1106,104 @@ public static String[] split(String str, String separatorChars) {
}
/**
* <p>Splits the provided text into a array, based on a given separator.</p>
* <p>Splits the provided text into an array, separators specified.
* This is an alternative to using StringTokenizer.</p>
*
* <p>The separator is not included in the returned String array. The
* maximum number of splits to perform can be controlled. A <code>null</code>
* separator will cause parsing to be on whitespace.</p>
*
* <p>This is useful for quickly splitting a String directly into
* an array of tokens, instead of an enumeration of tokens (as
* <code>StringTokenizer</code> does).</p>
* <p>The separator is not included in the returned String array.
* Adjacent separators are treated as one separator.</p>
*
* <p>A <code>null</code> input String returns <code>null</code>.
* A <code>null</code> separatorChars splits on whitespace.</p>
*
* <pre>
* StringUtils.split(null, null, 0) = null
* StringUtils.split("ab de fg", null, 0) = ["ab", "de", "fg"]
* StringUtils.split("ab:cd:ef", ":", 0) = ["ab", "cd", "ef"]
* StringUtils.split("ab:cd:ef", ":", 2) = ["ab", "cd:ef"]
* StringUtils.split(null, null, 0) = null
* StringUtils.split("", null, 0) = []
* StringUtils.split("ab de fg", null, 0) = ["ab", "de", "fg"]
* StringUtils.split("ab   de fg", null, 0) = ["ab", "de", "fg"]
* StringUtils.split("ab:cd:ef", ":", 0) = ["ab", "cd", "ef"]
* StringUtils.split("ab:cd:ef", ":", 2) = ["ab", "cd:ef"]
* </pre>
*
* @param str the String to parse, may be null
* @param separatorChars the characters used as the delimiters,
* <code>null</code> splits on whitespace
* @param max the maximum number of elements to include in the
* array. A zero or negative value implies no limit.
* array. A zero or negative value implies no limit
* @return an array of parsed Strings, <code>null</code> if null String input
*/
public static String[] split(String str, String separatorChars, int max) {
// Performance tuned for 2.0 (JDK1.4)
// Direct code is quicker than StringTokenizer.
// Also, StringTokenizer uses isSpace() not isWhitespace()
if (str == null) {
return null;
}
StringTokenizer tok = null;
int len = str.length();
if (len == 0) {
return ArrayUtils.EMPTY_STRING_ARRAY;
}
List list = new ArrayList();
int sizePlus1 = 1;
int i =0, start = 0;
boolean match = false;
if (separatorChars == null) {
// Null separator means we're using StringTokenizer's default
// delimiter, which comprises all whitespace characters.
// TODO: StringTokenizer uses isSpace() not isWhitespace()
tok = new StringTokenizer(str);
} else {
tok = new StringTokenizer(str, separatorChars);
}
int listSize = tok.countTokens();
if (max > 0 && listSize > max) {
listSize = max;
}
String[] list = new String[listSize];
int i = 0;
int lastTokenBegin = 0;
int lastTokenEnd = 0;
while (tok.hasMoreTokens()) {
if (max > 0 && i == listSize - 1) {
// In the situation where we hit the max yet have
// tokens left over in our input, the last list
// element gets all remaining text.
String endToken = tok.nextToken();
lastTokenBegin = str.indexOf(endToken, lastTokenEnd);
list[i] = str.substring(lastTokenBegin);
break;
} else {
list[i] = tok.nextToken();
lastTokenBegin = str.indexOf(list[i], lastTokenEnd);
lastTokenEnd = lastTokenBegin + list[i].length();
// Null separator means use whitespace
while (i < len) {
if (Character.isWhitespace(str.charAt(i))) {
if (match) {
if (sizePlus1++ == max) {
i = len;
}
list.add(str.substring(start, i));
match = false;
}
start = ++i;
continue;
}
match = true;
i++;
}
} else if (separatorChars.length() == 1) {
// Optimise 1 character case
char sep = separatorChars.charAt(0);
while (i < len) {
if (str.charAt(i) == sep) {
if (match) {
if (sizePlus1++ == max) {
i = len;
}
list.add(str.substring(start, i));
match = false;
}
start = ++i;
continue;
}
match = true;
i++;
}
} else {
// standard case
while (i < len) {
if (separatorChars.indexOf(str.charAt(i)) >= 0) {
if (match) {
if (sizePlus1++ == max) {
i = len;
}
list.add(str.substring(start, i));
match = false;
}
start = ++i;
continue;
}
match = true;
i++;
}
i++;
}
return list;
if (match) {
list.add(str.substring(start, i));
}
return (String[]) list.toArray(new String[list.size()]);
}
// Joining
@ -1792,41 +1858,47 @@ public static String escape(String str) {
* <code>null</code> if null String input
*/
public static String repeat(String str, int repeat) {
// Performance tuned for 2.0 (JDK1.4)
if (str == null) {
return null;
}
if (repeat <= 0) {
return "";
}
int inputLength = str.length();
int inputLength;
if (repeat == 1 || (inputLength = str.length()) == 0) {
return str;
}
if (inputLength == 1 && repeat <= PAD_LIMIT) {
return padding(repeat, str.charAt(0));
}
char[] input = str.toCharArray();
char[] output = new char[repeat * inputLength];
int outputLength = inputLength * repeat;
switch (inputLength) {
case 1:
char ch = input[0];
char ch = str.charAt(0);
char[] output1 = new char[outputLength];
for (int i = repeat - 1; i >= 0; i--) {
output[i] = ch;
output1[i] = ch;
}
break;
return new String(output1);
case 2:
char ch0 = input[0];
char ch1 = input[1];
char ch0 = str.charAt(0);
char ch1 = str.charAt(1);
char[] output2 = new char[outputLength];
for (int i = repeat * 2 - 2; i >= 0; i--,i--) {
output[i] = ch0;
output[i + 1] = ch1;
output2[i] = ch0;
output2[i + 1] = ch1;
}
break;
return new String(output2);
default:
for (int i = repeat - 1; i >= 0; i--) {
System.arraycopy(input, 0, output, i * inputLength, inputLength);
}
break;
StringBuffer buf = new StringBuffer(outputLength);
for (int i = 0; i < repeat; i++) {
buf.append(str);
}
return buf.toString();
}
return new String(output);
}
/**
@ -2210,8 +2282,7 @@ public static String center(String str, int size, String padStr) {
* This is similar to {@link String#trim()} but instead removes whitespace.
* Whitespace is defined by {@link Character#isWhitespace(char)}.</p>
*
* <p>If the input String is <code>null</code>, <code>null</code>
* is returned.</p>
* <p>A <code>null</code> input String returns <code>null</code>.</p>
*
* <pre>
* StringUtils.strip(null) = null
@ -2234,8 +2305,7 @@ public static String strip(String str) {
* This is similar to {@link String#trim()} but allows the characters
* to be stripped to be controlled.</p>
*
* <p>If the input String is <code>null</code>, <code>null</code>
* is returned.</p>
* <p>A <code>null</code> input String returns <code>null</code>.</p>
*
* <p>If the stripChars String is <code>null</code>, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}.
@ -2262,8 +2332,7 @@ public static String strip(String str, String stripChars) {
/**
* <p>Strips any of a set of characters from the start of a String.</p>
*
* <p>If the input String is <code>null</code>, <code>null</code>
* is returned.</p>
* <p>A <code>null</code> input String returns <code>null</code>.</p>
*
* <p>If the stripChars String is <code>null</code>, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}.</p>
@ -2302,8 +2371,7 @@ public static String stripStart(String str, String stripChars) {
/**
* <p>Strips any of a set of characters from the end of a String.</p>
*
* <p>If the input String is <code>null</code>, <code>null</code>
* is returned.</p>
* <p>A <code>null</code> input String returns <code>null</code>.</p>
*
* <p>If the stripChars String is <code>null</code>, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}.</p>
@ -2570,9 +2638,8 @@ public static String capitaliseAllWords(String str) {
}
StringBuffer buffer = new StringBuffer(strLen);
boolean whitespace = true;
char[] strChars = str.toCharArray();
for (int i = 0; i < strLen; i++) {
char ch = strChars[i];
char ch = str.charAt(i);
if (Character.isWhitespace(ch)) {
buffer.append(ch);
whitespace = true;
@ -2609,9 +2676,8 @@ public static String uncapitaliseAllWords(String str) {
}
StringBuffer buffer = new StringBuffer(strLen);
boolean whitespace = true;
char[] strChars = str.toCharArray();
for (int i = 0; i < strLen; i++) {
char ch = strChars[i];
char ch = str.charAt(i);
if (Character.isWhitespace(ch)) {
buffer.append(ch);
whitespace = true;