Added a trimmer matcher which is now used to determine

which characters to trim off the left and right of tokens.
The ignore matcher is now truly an ignore matcher, which
will leave out any matching characters from all tokens.
I also fixed a few minor bugs.


git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@138002 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Matthew P. Inger 2004-12-23 18:55:48 +00:00
parent d669920cb2
commit 57b94c34ea
1 changed files with 157 additions and 96 deletions

View File

@ -32,14 +32,17 @@ import java.util.ListIterator;
* Each token is separated from the next String by a <i>delimiter</i>. * Each token is separated from the next String by a <i>delimiter</i>.
* One or more delimiter characters must be specified. * One or more delimiter characters must be specified.
* <p> * <p>
* The processing then strips all the <i>ignored</i> characters from each side of the token. * The processing then strips all the <i>ignored</i> characters from the entire string (this
* is useful for removing things like carriage returns, and so forth).
* <p>
* The processing then strips all the <i>trimmer</i> characters from the ends of the string.
* <p>
* The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized. * The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized.
* Empty tokens may be removed or returned as null. * Empty tokens may be removed or returned as null.
* This example is based on the CSV tokenizer.
* <pre> * <pre>
* "a,b,c" - Three tokens "a","b","c" (comma delimiter) * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
* "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped) * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
* "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched) * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
* </pre> * </pre>
* <p> * <p>
* *
@ -70,7 +73,7 @@ import java.util.ListIterator;
* @author Stephen Colebourne * @author Stephen Colebourne
* @author Gary D. Gregory * @author Gary D. Gregory
* @since 2.1 * @since 2.1
* @version $Id: StrTokenizer.java,v 1.1 2004/10/06 22:29:24 scolebourne Exp $ * @version $Id: StrTokenizer.java,v 1.2 2004/12/23 18:55:48 mattinger Exp $
*/ */
public class StrTokenizer implements ListIterator, Cloneable { public class StrTokenizer implements ListIterator, Cloneable {
@ -107,12 +110,12 @@ public class StrTokenizer implements ListIterator, Cloneable {
public static final Matcher DOUBLE_QUOTE_MATCHER = new CharMatcher('"'); public static final Matcher DOUBLE_QUOTE_MATCHER = new CharMatcher('"');
/** /**
* A Matcher which matches the String trim() whitespace characters. * A Matcher which matches the String trim() whitespace characters.
* Best used for <code>ignored</code>. * Best used for <code>trimmer</code>.
*/ */
public static final Matcher TRIM_MATCHER = new TrimMatcher(); public static final Matcher TRIM_MATCHER = new TrimMatcher();
/** /**
* A Matcher that matches no characters. Don't use this for delimiters! * A Matcher that matches no characters. Don't use this for delimiters!
* Best used for <code>ignored</code>. * Best used for <code>trimmer</code>.
*/ */
public static final Matcher NONE_MATCHER = new NoMatcher(); public static final Matcher NONE_MATCHER = new NoMatcher();
@ -122,14 +125,16 @@ public class StrTokenizer implements ListIterator, Cloneable {
CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER); CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER); CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER); CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(NONE_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(TRIM_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER); TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER); TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER); TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(NONE_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(TRIM_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
} }
@ -149,6 +154,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
private Matcher quote = NONE_MATCHER; private Matcher quote = NONE_MATCHER;
/** The ignored matcher */ /** The ignored matcher */
private Matcher ignored = NONE_MATCHER; private Matcher ignored = NONE_MATCHER;
private Matcher trimmer = TRIM_MATCHER;
/** Whether to return empty tokens as null */ /** Whether to return empty tokens as null */
private boolean emptyAsNull = false; private boolean emptyAsNull = false;
/** Whether to ignore empty tokens */ /** Whether to ignore empty tokens */
@ -210,8 +218,13 @@ public class StrTokenizer implements ListIterator, Cloneable {
} }
//----------------------------------------------------------------------- //-----------------------------------------------------------------------
/** /**
* Gets a new tokenizer instance which parses Comma Seperated Value strings. * Gets a new tokenizer instance which parses Comma Seperated Value strings
* initializing it with the given input. The default for CSV processing
* will be trim whitespace from both ends (which can be overriden with
* the setTrimmer method).
* <p>
* You must call a "reset" method to set the string which you want to parse. * You must call a "reset" method to set the string which you want to parse.
*/ */
public static StrTokenizer getCSVInstance() { public static StrTokenizer getCSVInstance() {
@ -220,9 +233,11 @@ public class StrTokenizer implements ListIterator, Cloneable {
/** /**
* Gets a new tokenizer instance which parses Comma Seperated Value strings * Gets a new tokenizer instance which parses Comma Seperated Value strings
* initializing it with the given input. * initializing it with the given input. The default for CSV processing
* will be trim whitespace from both ends (which can be overriden with
* the setTrimmer method).
* *
* @param input the string to parse * @param input the text to parse
*/ */
public static StrTokenizer getCSVInstance(String input) { public static StrTokenizer getCSVInstance(String input) {
StrTokenizer tok = (StrTokenizer)(CSV_TOKENIZER_PROTOTYPE.clone()); StrTokenizer tok = (StrTokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
@ -232,7 +247,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
/** /**
* Gets a new tokenizer instance which parses Comma Seperated Value strings * Gets a new tokenizer instance which parses Comma Seperated Value strings
* initializing it with the given input. * initializing it with the given input. The default for CSV processing
* will be trim whitespace from both ends (which can be overriden with
* the setTrimmer method).
* *
* @param input the text to parse * @param input the text to parse
*/ */
@ -244,6 +261,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
/** /**
* Gets a new tokenizer instance which parses Tab Seperated Value strings. * Gets a new tokenizer instance which parses Tab Seperated Value strings.
* The default for TSV processing will be to trim whitespace from both ends
* (which can be overridden with the setTrimmerMatcher method).
* <p>
* You must call a "reset" method to set the string which you want to parse. * You must call a "reset" method to set the string which you want to parse.
*/ */
public static StrTokenizer getTSVInstance() { public static StrTokenizer getTSVInstance() {
@ -251,9 +271,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
} }
/** /**
* Gets a new tokenizer instance which parses Tab Seperated Value strings * Gets a new tokenizer instance which parses Tab Seperated Value strings.
* initializing it with the given input. * The default for CSV processing will be trim whitespace from both ends
* * (which can be overriden with the setTrimmer method).
* @param input the string to parse * @param input the string to parse
*/ */
public static StrTokenizer getTSVInstance(String input) { public static StrTokenizer getTSVInstance(String input) {
@ -263,10 +283,10 @@ public class StrTokenizer implements ListIterator, Cloneable {
} }
/** /**
* Gets a new tokenizer instance which parses Tab Seperated Value strings * Gets a new tokenizer instance which parses Tab Seperated Value strings.
* initializing it with the given input. * The default for CSV processing will be trim whitespace from both ends
* * (which can be overriden with the setTrimmer method).
* @param input the text to parse * @param input the string to parse
*/ */
public static StrTokenizer getTSVInstance(char[] input) { public static StrTokenizer getTSVInstance(char[] input) {
StrTokenizer tok = (StrTokenizer)(TSV_TOKENIZER_PROTOTYPE.clone()); StrTokenizer tok = (StrTokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
@ -685,7 +705,6 @@ public class StrTokenizer implements ListIterator, Cloneable {
int delimLen = 0; int delimLen = 0;
int quoteLen = 0; int quoteLen = 0;
while (start < len && while (start < len &&
(ignoreLen = ignored.isMatch(chars, len, start)) >= 1 &&
(delimLen = delim.isMatch(chars, len, start)) < 1 && (delimLen = delim.isMatch(chars, len, start)) < 1 &&
(quoteLen = quote.isMatch(chars, len, start)) < 1) { (quoteLen = quote.isMatch(chars, len, start)) < 1) {
start += ignoreLen; start += ignoreLen;
@ -817,7 +836,23 @@ public class StrTokenizer implements ListIterator, Cloneable {
pos++; pos++;
} }
token.append(chars, start, Math.min(pos, len) - start); /* Trim string based on the trimmer matcher */
while (trimmer.isMatch(chars, 1, start) > 0) {
start++;
}
int length = Math.min(pos, len) - start;
while (trimmer.isMatch(chars, 1, start + length - 1) > 0) {
length--;
}
for (int i=0;i<length;i++) {
if (ignored.isMatch(chars, 1, start + i) == 0) {
token.append(chars[start + i]);
}
}
return pos + delimLen; return pos + delimLen;
} }
@ -942,6 +977,32 @@ public class StrTokenizer implements ListIterator, Cloneable {
} }
} }
// Trimmer
//-----------------------------------------------------------------------
/**
 * Returns the matcher that selects the characters to trim.
 * <p>
 * Characters accepted by this matcher are trimmed off the beginning and
 * ending of an unquoted string. Unless changed, the matcher is
 * <code>TRIM_MATCHER</code>, which matches the String trim() whitespace
 * characters.
 *
 * @return the trimmer matcher in use
 */
public Matcher getTrimmerMatcher() {
    return this.trimmer;
}
/**
 * Sets the matcher for the characters that are trimmed off the beginning
 * and end of an unquoted string.
 * <p>
 * Passing <code>null</code> leaves the current trimmer matcher unchanged.
 *
 * @param trimmer  the trimmer matcher to use, null ignored
 */
public void setTrimmerMatcher(Matcher trimmer) {
    // A null argument is ignored so the tokenizer always has a usable matcher.
    if (trimmer == null) {
        return;
    }
    this.trimmer = trimmer;
}
/** /**
* Set the character to ignore. * Set the character to ignore.
* <p> * <p>
@ -1060,7 +1121,7 @@ public class StrTokenizer implements ListIterator, Cloneable {
/** /**
* Class used to define a set of characters for matching purposes. * Class used to define a set of characters for matching purposes.
*/ */
static final class CharSetMatcher implements Matcher { public static final class CharSetMatcher implements Matcher {
private char[] chars; private char[] chars;
/** /**
@ -1068,7 +1129,7 @@ public class StrTokenizer implements ListIterator, Cloneable {
* *
* @param chars the characters to match, must not be null * @param chars the characters to match, must not be null
*/ */
CharSetMatcher(char chars[]) { public CharSetMatcher(char chars[]) {
super(); super();
this.chars = (char[]) chars.clone(); this.chars = (char[]) chars.clone();
Arrays.sort(this.chars); Arrays.sort(this.chars);