Added a trimmer matcher which is now used to determine
which characters to trim off the left and right of tokens. The ignore matcher is now truly an ignore matcher, which will leave out any matching characters from all tokens. I also fixed a few minor bugs. git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@138002 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d669920cb2
commit
57b94c34ea
|
@ -32,14 +32,17 @@ import java.util.ListIterator;
|
|||
* Each token is separated from the next String by a <i>delimiter</i>.
|
||||
* One or more delimiter characters must be specified.
|
||||
* <p>
|
||||
* The processing then strips all the <i>ignored</i> characters from each side of the token.
|
||||
* The processing then strips all the <i>ignored</i> characters from the entire string (this
|
||||
* is useful for removing things like carriage returns, and so forth)
|
||||
* <p>
|
||||
* The processing then strips all the <i>trimmer</i> characters from the ends of the string.
|
||||
* <p>
|
||||
* The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized.
|
||||
* Empty tokens may be removed or returned as null.
|
||||
* This example is based on the CSV tokenizer.
|
||||
* <pre>
|
||||
* "a,b,c" - Three tokens "a","b","c" (comma delimiter)
|
||||
* "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped)
|
||||
* "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched)
|
||||
* " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
|
||||
* "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
|
||||
* </pre>
|
||||
* <p>
|
||||
*
|
||||
|
@ -70,7 +73,7 @@ import java.util.ListIterator;
|
|||
* @author Stephen Colebourne
|
||||
* @author Gary D. Gregory
|
||||
* @since 2.1
|
||||
* @version $Id: StrTokenizer.java,v 1.1 2004/10/06 22:29:24 scolebourne Exp $
|
||||
* @version $Id: StrTokenizer.java,v 1.2 2004/12/23 18:55:48 mattinger Exp $
|
||||
*/
|
||||
public class StrTokenizer implements ListIterator, Cloneable {
|
||||
|
||||
|
@ -107,12 +110,12 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
public static final Matcher DOUBLE_QUOTE_MATCHER = new CharMatcher('"');
|
||||
/**
|
||||
* A Matcher which matches the String trim() whitespace characters.
|
||||
* Best used for <code>ignored</code>.
|
||||
* Best used for <code>trimmer</code>.
|
||||
*/
|
||||
public static final Matcher TRIM_MATCHER = new TrimMatcher();
|
||||
/**
|
||||
* A Matcher that matches no characters. Don't use this for delimiters!
|
||||
* Best used for <code>ignored</code>.
|
||||
* Best used for <code>trimmer</code>.
|
||||
*/
|
||||
public static final Matcher NONE_MATCHER = new NoMatcher();
|
||||
|
||||
|
@ -122,14 +125,16 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
|
||||
CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER);
|
||||
CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
|
||||
CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
|
||||
CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(NONE_MATCHER);
|
||||
CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(TRIM_MATCHER);
|
||||
CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
|
||||
CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
|
||||
|
||||
TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
|
||||
TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER);
|
||||
TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
|
||||
TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
|
||||
TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(NONE_MATCHER);
|
||||
CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(TRIM_MATCHER);
|
||||
TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
|
||||
TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
|
||||
}
|
||||
|
@ -149,6 +154,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
private Matcher quote = NONE_MATCHER;
|
||||
/** The ignored matcher */
|
||||
private Matcher ignored = NONE_MATCHER;
|
||||
|
||||
private Matcher trimmer = TRIM_MATCHER;
|
||||
|
||||
/** Whether to return empty tokens as null */
|
||||
private boolean emptyAsNull = false;
|
||||
/** Whether to ignore empty tokens */
|
||||
|
@ -210,8 +218,13 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets a new tokenizer instance which parses Comma Separated Value strings.
|
||||
* Gets a new tokenizer instance which parses Comma Separated Value strings.
|
||||
* No input is set on the returned instance. The default for CSV processing
|
||||
* is to trim whitespace from both ends (which can be overridden with
|
||||
* the setTrimmerMatcher method).
|
||||
* <p>
|
||||
* You must call a "reset" method to set the string which you want to parse.
|
||||
*/
|
||||
public static StrTokenizer getCSVInstance() {
|
||||
|
@ -220,9 +233,11 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
|
||||
/**
|
||||
* Gets a new tokenizer instance which parses Comma Separated Value strings
|
||||
* initializing it with the given input.
|
||||
* initializing it with the given input. The default for CSV processing
|
||||
* is to trim whitespace from both ends (which can be overridden with
|
||||
* the setTrimmerMatcher method).
|
||||
*
|
||||
* @param input the string to parse
|
||||
* @param input the text to parse
|
||||
*/
|
||||
public static StrTokenizer getCSVInstance(String input) {
|
||||
StrTokenizer tok = (StrTokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
|
||||
|
@ -232,7 +247,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
|
||||
/**
|
||||
* Gets a new tokenizer instance which parses Comma Separated Value strings
|
||||
* initializing it with the given input.
|
||||
* initializing it with the given input. The default for CSV processing
|
||||
* is to trim whitespace from both ends (which can be overridden with
|
||||
* the setTrimmerMatcher method).
|
||||
*
|
||||
* @param input the text to parse
|
||||
*/
|
||||
|
@ -244,6 +261,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
|
||||
/**
|
||||
* Gets a new tokenizer instance which parses Tab Separated Value strings.
|
||||
* The default for TSV processing is to trim whitespace from both ends
|
||||
* (which can be overridden with the setTrimmerMatcher method).
|
||||
* <p>
|
||||
* You must call a "reset" method to set the string which you want to parse.
|
||||
*/
|
||||
public static StrTokenizer getTSVInstance() {
|
||||
|
@ -251,9 +271,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
}
|
||||
|
||||
/**
|
||||
* Gets a new tokenizer instance which parses Tab Separated Value strings
|
||||
* initializing it with the given input.
|
||||
*
|
||||
* Gets a new tokenizer instance which parses Tab Separated Value strings.
|
||||
* The default for TSV processing is to trim whitespace from both ends
|
||||
* (which can be overridden with the setTrimmerMatcher method).
|
||||
* @param input the string to parse
|
||||
*/
|
||||
public static StrTokenizer getTSVInstance(String input) {
|
||||
|
@ -263,10 +283,10 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
}
|
||||
|
||||
/**
|
||||
* Gets a new tokenizer instance which parses Tab Separated Value strings
|
||||
* initializing it with the given input.
|
||||
*
|
||||
* @param input the text to parse
|
||||
* Gets a new tokenizer instance which parses Tab Separated Value strings.
|
||||
* The default for TSV processing is to trim whitespace from both ends
|
||||
* (which can be overridden with the setTrimmerMatcher method).
|
||||
* @param input the text to parse
|
||||
*/
|
||||
public static StrTokenizer getTSVInstance(char[] input) {
|
||||
StrTokenizer tok = (StrTokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
|
||||
|
@ -685,7 +705,6 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
int delimLen = 0;
|
||||
int quoteLen = 0;
|
||||
while (start < len &&
|
||||
(ignoreLen = ignored.isMatch(chars, len, start)) >= 1 &&
|
||||
(delimLen = delim.isMatch(chars, len, start)) < 1 &&
|
||||
(quoteLen = quote.isMatch(chars, len, start)) < 1) {
|
||||
start += ignoreLen;
|
||||
|
@ -817,7 +836,23 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
pos++;
|
||||
}
|
||||
|
||||
token.append(chars, start, Math.min(pos, len) - start);
|
||||
/* Trim string based on the trimmer matcher */
|
||||
while (trimmer.isMatch(chars, 1, start) > 0) {
|
||||
start++;
|
||||
}
|
||||
|
||||
int length = Math.min(pos, len) - start;
|
||||
|
||||
while (trimmer.isMatch(chars, 1, start + length - 1) > 0) {
|
||||
length--;
|
||||
}
|
||||
|
||||
for (int i=0;i<length;i++) {
|
||||
if (ignored.isMatch(chars, 1, start + i) == 0) {
|
||||
token.append(chars[start + i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return pos + delimLen;
|
||||
}
|
||||
|
@ -942,6 +977,32 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
}
|
||||
}
|
||||
|
||||
// Trimmer
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Gets the trimmer character matcher.
|
||||
* <p>
|
||||
* These characters are trimmed off the beginning and ending of an unquoted string.
|
||||
* The default value is space (' ') and all char control characters (32 and less).
|
||||
*
|
||||
* @return the trimmer matcher in use
|
||||
*/
|
||||
public Matcher getTrimmerMatcher() {
|
||||
return trimmer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the matcher for characters to trim off the beginning and end of an
|
||||
* unquoted string.
|
||||
*
|
||||
* @param trimmer the trimmer matcher to use, null ignored
|
||||
*/
|
||||
public void setTrimmerMatcher(Matcher trimmer) {
|
||||
if (trimmer != null) {
|
||||
this.trimmer = trimmer;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the character to ignore.
|
||||
* <p>
|
||||
|
@ -1060,7 +1121,7 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
/**
|
||||
* Class used to define a set of characters for matching purposes.
|
||||
*/
|
||||
static final class CharSetMatcher implements Matcher {
|
||||
public static final class CharSetMatcher implements Matcher {
|
||||
private char[] chars;
|
||||
|
||||
/**
|
||||
|
@ -1068,7 +1129,7 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
|||
*
|
||||
* @param chars the characters to match, must not be null
|
||||
*/
|
||||
CharSetMatcher(char chars[]) {
|
||||
public CharSetMatcher(char chars[]) {
|
||||
super();
|
||||
this.chars = (char[]) chars.clone();
|
||||
Arrays.sort(this.chars);
|
||||
|
|
Loading…
Reference in New Issue