Initial checkin of the Tokenizer class
git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137709 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f6c1ac465f
commit
71de3f7ed9
|
@ -0,0 +1,878 @@
|
|||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2002-2003 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution, if
|
||||
* any, must include the following acknowledgement:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgement may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgements normally appear.
|
||||
*
|
||||
* 4. The names "The Jakarta Project", "Commons", and "Apache Software
|
||||
* Foundation" must not be used to endorse or promote products derived
|
||||
* from this software without prior written permission. For written
|
||||
* permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache"
|
||||
* nor may "Apache" appear in their names without prior written
|
||||
* permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
package org.apache.commons.lang;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/**
|
||||
* Tokenizes a string based based on delimiters (separators)
|
||||
* and supporting quoting and ignored character concepts.
|
||||
* <p>
|
||||
* This class can split a String into many smaller strings. It aims to do a
|
||||
* similar job to java util StringTokenizer, however it offers much more
|
||||
* control and flexibility.
|
||||
* <p>
|
||||
* The input String is split into a number of <i>tokens</i>.
|
||||
* Each token is separated from the next String by a <i>delimiter</i>.
|
||||
* One or more delimiter characters must be specified.
|
||||
* <p>
|
||||
* The processing then strips all the <i>ignored</i> characters from each side of the token.
|
||||
* The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized.
|
||||
* Empty tokens may be removed or returned as null.
|
||||
* <pre>
|
||||
* "a,b,c" - Three tokens "a","b","c" (comma delimiter)
|
||||
* "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped)
|
||||
* "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched)
|
||||
* </pre>
|
||||
* <p>
|
||||
* By default, this tokenizer has the following properties:
|
||||
* <pre>
|
||||
* Property Default
|
||||
* --------- -------
|
||||
* delimiter , (comma)
|
||||
* quote " (double quote)
|
||||
* ignored char <= 32 (as per trim)
|
||||
* emptyTokenAsNull false
|
||||
* ignoreEmptyTokens false
|
||||
* </pre>
|
||||
*
|
||||
* @author Matthew Inger
|
||||
* @author Stephen Colebourne
|
||||
*/
|
||||
public class Tokenizer implements ListIterator {
|
||||
// TODO: Constructors
|
||||
// TODO: Tests
|
||||
// TODO: Static factories CSV/StringTokenizer
|
||||
|
||||
/**
|
||||
* A Matcher which matches the comma character.
|
||||
* Best used for <code>delimiter</code>.
|
||||
*/
|
||||
public static final Matcher COMMA_MATCHER = new CharMatcher(',');
|
||||
/**
|
||||
* A Matcher which matches the double quote character.
|
||||
* Best used for <code>quote</code>.
|
||||
*/
|
||||
public static final Matcher DOUBLE_QUOTE_MATCHER = new CharMatcher('"');
|
||||
/**
|
||||
* A Matcher which matches the String trim() whitespace characters.
|
||||
* Best used for <code>ignored</code>.
|
||||
*/
|
||||
public static final Matcher SPACES_MATCHER = new TrimMatcher();
|
||||
/**
|
||||
* A Matcher that matches no characters. Don't use this for delimiters!
|
||||
* Best used for <code>ignored</code>.
|
||||
*/
|
||||
public static final Matcher NONE_MATCHER = new NoMatcher();
|
||||
|
||||
/** The text to work on */
|
||||
private char chars[];
|
||||
/** The parsed tokens */
|
||||
private String tokens[];
|
||||
/** The current iteration position */
|
||||
private int tokenPos;
|
||||
|
||||
/** The delimiter matcher */
|
||||
private Matcher delim = COMMA_MATCHER;
|
||||
/** The quote matcher */
|
||||
private Matcher quote = DOUBLE_QUOTE_MATCHER;
|
||||
/** The ignored matcher */
|
||||
private Matcher ignored = SPACES_MATCHER;
|
||||
/** Whether to return empty tokens as null */
|
||||
private boolean emptyAsNull = false;
|
||||
/** Whether to ignore empty tokens */
|
||||
private boolean ignoreEmptyTokens = false;
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
*/
|
||||
public Tokenizer(String input) {
|
||||
this(input.toCharArray());
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
*/
|
||||
public Tokenizer(String input, char delim) {
|
||||
this(input.toCharArray(), delim);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
*/
|
||||
public Tokenizer(String input, CharSetMatcher delim) {
|
||||
this(input.toCharArray(), delim);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
* @param quote the field quoted string character
|
||||
*/
|
||||
public Tokenizer(String input, char delim, char quote) {
|
||||
this(input.toCharArray(), delim, quote);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
* @param quote the field quoted string character
|
||||
*/
|
||||
public Tokenizer(String input, CharSetMatcher delim, CharSetMatcher quote) {
|
||||
this(input.toCharArray(), delim, quote);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
*/
|
||||
public Tokenizer(char[] input) {
|
||||
super();
|
||||
this.chars = (char[]) input.clone();
|
||||
this.tokenPos = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
*/
|
||||
public Tokenizer(char[] input, char delim) {
|
||||
this(input);
|
||||
setDelimiterChar(delim);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
*/
|
||||
public Tokenizer(char[] input, CharSetMatcher delim) {
|
||||
this(input);
|
||||
setDelimiterMatcher(delim);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
* @param quote the field quoted string character
|
||||
*/
|
||||
public Tokenizer(char[] input, char delim, char quote) {
|
||||
this(input, delim);
|
||||
setQuoteChar(quote);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param input the string which is to be parsed
|
||||
* @param delim the field delimiter character
|
||||
* @param quote the field quoted string character
|
||||
*/
|
||||
public Tokenizer(char[] input, CharSetMatcher delim, CharSetMatcher quote) {
|
||||
this(input, delim);
|
||||
setQuoteMatcher(quote);
|
||||
}
|
||||
|
||||
// API
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Gets the number of tokens found in the String.
|
||||
*
|
||||
* @return the number of matched tokens
|
||||
*/
|
||||
public int size() {
|
||||
tokenize();
|
||||
return tokens.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next token from the String.
|
||||
*
|
||||
* @return the next sequential token, or null when no more tokens are found
|
||||
*/
|
||||
public String nextToken() {
|
||||
if (hasNext()) {
|
||||
return tokens[tokenPos++];
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the previous token from the String.
|
||||
*
|
||||
* @return the previous sequential token, or null when no more tokens are found
|
||||
*/
|
||||
public String previousToken() {
|
||||
if (hasPrevious()) {
|
||||
return tokens[--tokenPos];
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a copy of the full token list.
|
||||
*
|
||||
* @return the tokens as a String array
|
||||
*/
|
||||
public String[] getAllTokens() {
|
||||
tokenize();
|
||||
return (String[]) tokens.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets this tokenizer, forgetting all parsing and iteration already completed.
|
||||
* <p>
|
||||
* This method allows the same tokenizer to be reused for the same String.
|
||||
*/
|
||||
public void reset() {
|
||||
tokenPos = 0;
|
||||
tokens = null;
|
||||
}
|
||||
|
||||
// ListIterator
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Checks whether there are any more tokens.
|
||||
*
|
||||
* @return true if there are more tokens
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
tokenize();
|
||||
return (tokenPos < tokens.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next token. This method is equivalent to {@link #nextToken()}.
|
||||
*
|
||||
* @return the next String token
|
||||
*/
|
||||
public Object next() {
|
||||
return nextToken();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the index of the next token to return.
|
||||
*
|
||||
* @return the next token index
|
||||
*/
|
||||
public int nextIndex() {
|
||||
return tokenPos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether there are any previous tokens that can be iterated to.
|
||||
*
|
||||
* @return true if there are previous tokens
|
||||
*/
|
||||
public boolean hasPrevious() {
|
||||
tokenize();
|
||||
return (tokenPos > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the token previous to the last returned token.
|
||||
*
|
||||
* @return the previous token
|
||||
*/
|
||||
public Object previous() {
|
||||
return previousToken();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the index of the previous token.
|
||||
*
|
||||
* @return the previous token index
|
||||
*/
|
||||
public int previousIndex() {
|
||||
return (tokenPos - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsupported ListIterator operation.
|
||||
*
|
||||
* @throws UnsupportedOperationException always
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("remove() is unsupported");
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsupported ListIterator operation.
|
||||
*
|
||||
* @throws UnsupportedOperationException always
|
||||
*/
|
||||
public void set(Object obj) {
|
||||
throw new UnsupportedOperationException("set() is unsupported");
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsupported ListIterator operation.
|
||||
*
|
||||
* @throws UnsupportedOperationException always
|
||||
*/
|
||||
public void add(Object obj) {
|
||||
throw new UnsupportedOperationException("add() is unsupported");
|
||||
}
|
||||
|
||||
// Implementation
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Performs the tokenization if it hasn't already been done.
|
||||
*/
|
||||
private void tokenize() {
|
||||
if (tokens == null) {
|
||||
this.tokens = readTokens();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read all the tokens.
|
||||
*/
|
||||
private String[] readTokens() {
|
||||
int len = chars.length;
|
||||
char cbuf[] = new char[len];
|
||||
StringBuffer token = new StringBuffer();
|
||||
int start = 0;
|
||||
List tokens = new ArrayList();
|
||||
String tok = null;
|
||||
|
||||
// Keep going until we run out of characters
|
||||
while (start < len) {
|
||||
// read the next token
|
||||
start = readNextToken(start, cbuf, token);
|
||||
tok = token.toString();
|
||||
|
||||
// Add the token, following the rules
|
||||
// in this object
|
||||
addToken(tokens, tok);
|
||||
|
||||
// Reset the string buffer to zero length
|
||||
token.setLength(0);
|
||||
|
||||
// Handle the special case where the very last
|
||||
// character is a delimiter, in which case, we
|
||||
// need another empty string
|
||||
if (start == len && delim.isMatch(chars[start - 1])) {
|
||||
// Add the token, following the rules
|
||||
// in this object
|
||||
addToken(tokens, new String());
|
||||
}
|
||||
}
|
||||
|
||||
return (String[]) tokens.toArray(new String[tokens.size()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a token to a list, paying attention to the parameters we've set.
|
||||
*
|
||||
* @param list the list to add to
|
||||
* @param tok the token to add
|
||||
*/
|
||||
private void addToken(List list, String tok) {
|
||||
if (tok == null || tok.length() == 0) {
|
||||
if (ignoreEmptyTokens) {
|
||||
return;
|
||||
}
|
||||
if (emptyAsNull) {
|
||||
tok = null;
|
||||
}
|
||||
}
|
||||
list.add(tok);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads character by character through the String to get the next token.
|
||||
*
|
||||
* @param start the first character of field
|
||||
* @param cbuf a character buffer for temporary computations (so we
|
||||
* don't have to keep recreating one)
|
||||
* @param token a StringBuffer where the output token will go
|
||||
* @return the starting position of the next field (the character
|
||||
* immediately after the delimiter, or if end of string found,
|
||||
* then the length of string
|
||||
*/
|
||||
private int readNextToken(int start, char cbuf[], StringBuffer token) {
|
||||
token.setLength(0);
|
||||
int len = chars.length;
|
||||
|
||||
// skip all leading whitespace, unless it is the
|
||||
// field delimiter or the quote character
|
||||
while (start < len &&
|
||||
ignored.isMatch(chars[start]) &&
|
||||
!delim.isMatch(chars[start]) &&
|
||||
!quote.isMatch(chars[start])) {
|
||||
start++;
|
||||
}
|
||||
|
||||
// Read the token depending on what the first
|
||||
// character is like
|
||||
if (delim.isMatch(chars[start])) {
|
||||
start = readEmpty(start, token);
|
||||
} else if (quote.isMatch(chars[start])) {
|
||||
start = readQuoted(start, cbuf, token);
|
||||
} else {
|
||||
start = readUnquoted(start, token);
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a quoted string token.
|
||||
*
|
||||
* @param start The first character of field (this will be the quote
|
||||
* character)
|
||||
* @param cbuf A character buffer for temporary computations (so we
|
||||
* don't have to keep recreating one)
|
||||
* @param token A StringBuffer where the output token will go.
|
||||
* @return The starting position of the next field (the character
|
||||
* immediately after the delimiter, or if end of string found,
|
||||
* then the length of string.
|
||||
*/
|
||||
private int readQuoted(int start, char cbuf[], StringBuffer token) {
|
||||
// Loop until we've found the end of the quoted
|
||||
// string or the end of the input
|
||||
int cbufcnt = 0;
|
||||
int nd = start + 1;
|
||||
boolean done = false;
|
||||
boolean quoting = true;
|
||||
int len = chars.length;
|
||||
|
||||
while (nd < len && !done) {
|
||||
// Quoting mode can occur several times throughout
|
||||
// a given string, so must switch between quoting
|
||||
// and non-quoting until we encounter a non-quoted
|
||||
// delimiter, or end of string, which inidicates end
|
||||
// of token.
|
||||
if (quoting) {
|
||||
// If we've found a quote character, see if it's
|
||||
// followed by a second quote. If so, then we need
|
||||
// to actually put the quote character into the token
|
||||
// rather than end the token.
|
||||
if (quote.isMatch(chars[nd]) &&
|
||||
nd + 1 < len &&
|
||||
chars[nd + 1] == chars[nd]) {
|
||||
cbuf[cbufcnt++] = chars[nd];
|
||||
nd++;
|
||||
}
|
||||
// End the quoting if we get to this condition
|
||||
else if (quote.isMatch(chars[nd])) {
|
||||
quoting = false;
|
||||
}
|
||||
// Otherwise, just put the character into the token
|
||||
else {
|
||||
cbuf[cbufcnt++] = chars[nd];
|
||||
}
|
||||
nd++;
|
||||
}
|
||||
// If we're not in quoting mode, if we encounter
|
||||
// a delimiter, the token is ended. If we encounter
|
||||
// a quote, we start quoting mode, otherwise, just append
|
||||
// the character
|
||||
else {
|
||||
// If we're
|
||||
if (delim.isMatch(chars[nd])) {
|
||||
done = true;
|
||||
} else {
|
||||
if (quote.isMatch(chars[nd])) {
|
||||
quoting = true;
|
||||
} else {
|
||||
cbuf[cbufcnt++] = chars[nd];
|
||||
}
|
||||
nd++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
token.append(cbuf, 0, cbufcnt);
|
||||
|
||||
return nd + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read an unquoted string until a delimiter is found.
|
||||
*
|
||||
* @param start the first character of field
|
||||
* @param token a StringBuffer where the output token will go.
|
||||
* @return the starting position of the next field (the character
|
||||
* immediately after the delimiter, or if end of string found,
|
||||
* then the length of string.
|
||||
*/
|
||||
private int readUnquoted(int start, StringBuffer token) {
|
||||
int len = chars.length;
|
||||
// Skip ahead until we get to a delimiter character, or
|
||||
// the end of the input
|
||||
int nd = start + 1;
|
||||
while (nd < len && !delim.isMatch(chars[nd])) {
|
||||
nd++;
|
||||
}
|
||||
|
||||
token.append(chars, start, Math.min(nd, len) - start);
|
||||
|
||||
return nd + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read an empty string (basically, if a delimiter is found right
|
||||
* after another delimiter).
|
||||
*
|
||||
* @param start the first character of field (this will be the delimiter
|
||||
* character)
|
||||
* @param token a StringBuffer where the output token will go.
|
||||
* @return The starting position of the next field (the character
|
||||
* immediately after the delimiter, or if end of string found,
|
||||
* then the length of string.
|
||||
*/
|
||||
private int readEmpty(int start, StringBuffer token) {
|
||||
token.setLength(0);
|
||||
return start + 1;
|
||||
}
|
||||
|
||||
// Delimiter
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Gets the field delimiter matcher.
|
||||
*
|
||||
* @return the delimiter matcher in use
|
||||
*/
|
||||
public Matcher getDelimiterMatcher() {
|
||||
return delim;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the field delimiter matcher.
|
||||
* <p>
|
||||
* The delimitier is used to separate one token from another.
|
||||
*
|
||||
* @param delim the delimiter matcher to use, null ignored
|
||||
*/
|
||||
public void setDelimiterMatcher(Matcher delim) {
|
||||
if (delim != null) {
|
||||
this.delim = delim;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the field delimiter character
|
||||
*
|
||||
* @param delim the delimiter character to use
|
||||
*/
|
||||
public void setDelimiterChar(char delim) {
|
||||
setDelimiterMatcher(new CharMatcher(delim));
|
||||
}
|
||||
|
||||
// Quote
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Gets the quote matcher currently in use.
|
||||
* <p>
|
||||
* The quote character is used to wrap data between the tokens.
|
||||
* This enables delimiters to be entered as data.
|
||||
* The default value is '"' (double quote).
|
||||
*
|
||||
* @return the quote matcher in use
|
||||
*/
|
||||
public Matcher getQuoteMatcher() {
|
||||
return quote;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the quote matcher to use.
|
||||
* <p>
|
||||
* The quote character is used to wrap data between the tokens.
|
||||
* This enables delimiters to be entered as data.
|
||||
*
|
||||
* @param quote the quote matcher to use, null ignored
|
||||
*/
|
||||
public void setQuoteMatcher(Matcher quote) {
|
||||
if (quote != null) {
|
||||
this.quote = quote;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the quote character to use.
|
||||
* <p>
|
||||
* The quote character is used to wrap data between the tokens.
|
||||
* This enables delimiters to be entered as data.
|
||||
*
|
||||
* @param quote the quote character to use
|
||||
*/
|
||||
public void setQuoteChar(char quote) {
|
||||
setQuoteMatcher(new CharMatcher(quote));
|
||||
}
|
||||
|
||||
// Ignored
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Gets the ignored character matcher.
|
||||
* <p>
|
||||
* These characters are ignored when parsing the String, unless they are
|
||||
* within a quoted region.
|
||||
* The default value is space (' ') and all char control characters (32 and less).
|
||||
*
|
||||
* @return the ignored matcher in use
|
||||
*/
|
||||
public Matcher getIgnoredMatcher() {
|
||||
return ignored;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the matcher for characters to ignore.
|
||||
* <p>
|
||||
* These characters are ignored when parsing the String, unless they are
|
||||
* within a quoted region.
|
||||
*
|
||||
* @param ignored the ignored matcher to use, null ignored
|
||||
*/
|
||||
public void setIgnoredMatcher(Matcher ignored) {
|
||||
if (ignored != null) {
|
||||
this.ignored = ignored;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the character to ignore.
|
||||
* <p>
|
||||
* This character is ignored when parsing the String, unless it is
|
||||
* within a quoted region.
|
||||
*
|
||||
* @param quote the ignored character to use
|
||||
*/
|
||||
public void setIgnoredChar(char ignored) {
|
||||
setIgnoredMatcher(new CharMatcher(ignored));
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Gets whether the tokenizer currently returns empty tokens as null.
|
||||
* The default for this property is false.
|
||||
*
|
||||
* @return true if empty tokens are returned as null
|
||||
*/
|
||||
public boolean isEmptyTokenAsNull() {
|
||||
return emptyAsNull;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets whether the tokenizer should return empty tokens as null.
|
||||
* The default for this property is false.
|
||||
*
|
||||
* @return emptyAsNull whether empty tokens are returned as null
|
||||
*/
|
||||
public void setEmptyTokenAsNull(boolean emptyAsNull) {
|
||||
this.emptyAsNull = emptyAsNull;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Gets whether the tokenizer currently ignores empty tokens.
|
||||
* The default for this property is false.
|
||||
*
|
||||
* @return true if empty tokens are not returned
|
||||
*/
|
||||
public boolean isIgnoreEmptyTokens() {
|
||||
return ignoreEmptyTokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets whether the tokenizer should ignore and not return empty tokens.
|
||||
* The default for this property is false.
|
||||
*
|
||||
* @return ignoreEmptyTokens whether empty tokens are not returned
|
||||
*/
|
||||
public void setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
|
||||
this.ignoreEmptyTokens = ignoreEmptyTokens;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Class used to define a set of characters for matching purposes.
|
||||
*/
|
||||
public static interface Matcher {
|
||||
/**
|
||||
* Returns true if the specified character matches.
|
||||
*
|
||||
* @param ch the character to check for
|
||||
* @return true if matches
|
||||
*/
|
||||
boolean isMatch(char ch);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Class used to define a set of characters for matching purposes.
|
||||
*/
|
||||
public static final class CharSetMatcher implements Matcher {
|
||||
private char chars[];
|
||||
|
||||
/**
|
||||
* Constructor that creates a matcher from a character array.
|
||||
*
|
||||
* @param chars the characters to match, must not be null
|
||||
*/
|
||||
public CharSetMatcher(char chars[]) {
|
||||
super();
|
||||
this.chars = (char[]) chars.clone();
|
||||
Arrays.sort(this.chars);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor that creates a matcher from a String.
|
||||
*
|
||||
* @param chars the characters to match, must not be null
|
||||
*/
|
||||
public CharSetMatcher(String chars) {
|
||||
this(chars.toCharArray());
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the characters being matched.
|
||||
*
|
||||
* @return the characters being matched
|
||||
*/
|
||||
public char[] getChars() {
|
||||
return (char[]) chars.clone();
|
||||
}
|
||||
|
||||
public boolean isMatch(char c) {
|
||||
return (Arrays.binarySearch(chars, c) >= 0);
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Class used to define a character for matching purposes.
|
||||
*/
|
||||
public static final class CharMatcher implements Matcher {
|
||||
private char ch;
|
||||
|
||||
/**
|
||||
* Constructor that creates a matcher that matches a single character.
|
||||
*
|
||||
* @param chars the character to match
|
||||
*/
|
||||
public CharMatcher(char ch) {
|
||||
super();
|
||||
this.ch = ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the character being matched.
|
||||
*
|
||||
* @return the character being matched
|
||||
*/
|
||||
public char getChar() {
|
||||
return this.ch;
|
||||
}
|
||||
|
||||
public boolean isMatch(char ch) {
|
||||
return (this.ch == ch);
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Class used to match no characters.
|
||||
*/
|
||||
static final class NoMatcher implements Matcher {
|
||||
|
||||
NoMatcher() {
|
||||
super();
|
||||
}
|
||||
|
||||
public boolean isMatch(char ch) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* Class used to match whitespace as per trim().
|
||||
*/
|
||||
static final class TrimMatcher implements Matcher {
|
||||
|
||||
TrimMatcher() {
|
||||
super();
|
||||
}
|
||||
|
||||
public boolean isMatch(char ch) {
|
||||
return (ch <= 32);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,344 @@
|
|||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2002-2003 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution, if
|
||||
* any, must include the following acknowledgement:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgement may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgements normally appear.
|
||||
*
|
||||
* 4. The names "The Jakarta Project", "Commons", and "Apache Software
|
||||
* Foundation" must not be used to endorse or promote products derived
|
||||
* from this software without prior written permission. For written
|
||||
* permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache"
|
||||
* nor may "Apache" appear in their names without prior written
|
||||
* permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
package org.apache.commons.lang;
|
||||
|
||||
import junit.framework.Test;
|
||||
import junit.framework.TestCase;
|
||||
import junit.framework.TestSuite;
|
||||
import junit.textui.TestRunner;
|
||||
|
||||
/**
|
||||
* Unit test for Tokenizer.
|
||||
*
|
||||
* @author Matthew Inger
|
||||
*/
|
||||
public class TokenizerTest extends TestCase {
|
||||
|
||||
/**
|
||||
* JUnit constructor.
|
||||
* @param name
|
||||
*/
|
||||
public TokenizerTest(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
public static Test suite() {
|
||||
TestSuite suite = new TestSuite(TokenizerTest.class);
|
||||
suite.setName("TokenizerTest Tests");
|
||||
return suite;
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
TestRunner.run(suite());
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
public void test1() {
|
||||
|
||||
String input = "a;b;c;\"d;\"\"e\";f; ; ;";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterChar(';');
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d;\"e",
|
||||
"f",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
};
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
assertTrue("token[" + i + "] was '" + tokens[i]
|
||||
+ "' but was expected to be '" + expected[i]
|
||||
+ "'",
|
||||
ObjectUtils.equals(expected[i], tokens[i]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void test2() {
|
||||
|
||||
String input = "a;b;c ;\"d;\"\"e\";f; ; ;";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterChar(';');
|
||||
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"b",
|
||||
"c ",
|
||||
"d;\"e",
|
||||
"f",
|
||||
" ",
|
||||
" ",
|
||||
"",
|
||||
};
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
assertTrue("token[" + i + "] was '" + tokens[i]
|
||||
+ "' but was expected to be '" + expected[i]
|
||||
+ "'",
|
||||
ObjectUtils.equals(expected[i], tokens[i]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void test3() {
|
||||
|
||||
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterChar(';');
|
||||
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"b",
|
||||
" c",
|
||||
"d;\"e",
|
||||
"f",
|
||||
" ",
|
||||
" ",
|
||||
"",
|
||||
};
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
assertTrue("token[" + i + "] was '" + tokens[i]
|
||||
+ "' but was expected to be '" + expected[i]
|
||||
+ "'",
|
||||
ObjectUtils.equals(expected[i], tokens[i]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void test4() {
|
||||
|
||||
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterChar(';');
|
||||
tok.setIgnoreEmptyTokens(true);
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d;\"e",
|
||||
"f",
|
||||
};
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
assertTrue("token[" + i + "] was '" + tokens[i]
|
||||
+ "' but was expected to be '" + expected[i]
|
||||
+ "'",
|
||||
ObjectUtils.equals(expected[i], tokens[i]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void test5() {
|
||||
|
||||
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterChar(';');
|
||||
tok.setEmptyTokenAsNull(true);
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d;\"e",
|
||||
"f",
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
};
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
assertTrue("token[" + i + "] was '" + tokens[i]
|
||||
+ "' but was expected to be '" + expected[i]
|
||||
+ "'",
|
||||
ObjectUtils.equals(expected[i], tokens[i]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void test6() {
|
||||
|
||||
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterChar(';');
|
||||
// tok.setTreatingEmptyAsNull(true);
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"b",
|
||||
" c",
|
||||
"d;\"e",
|
||||
"f",
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
};
|
||||
|
||||
int nextCount = 0;
|
||||
while (tok.hasNext()) {
|
||||
tok.next();
|
||||
nextCount++;
|
||||
}
|
||||
|
||||
int prevCount = 0;
|
||||
while (tok.hasPrevious()) {
|
||||
tok.previous();
|
||||
prevCount++;
|
||||
}
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
|
||||
assertTrue("could not cycle through entire token list"
|
||||
+ " using the 'hasNext' and 'next' methods",
|
||||
nextCount == expected.length);
|
||||
|
||||
assertTrue("could not cycle through entire token list"
|
||||
+ " using the 'hasPrevious' and 'previous' methods",
|
||||
prevCount == expected.length);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void test7() {
|
||||
|
||||
String input = "a b c \"d e\" f ";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
|
||||
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
|
||||
tok.setIgnoreEmptyTokens(false);
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"",
|
||||
"",
|
||||
"b",
|
||||
"c",
|
||||
"d e",
|
||||
"f",
|
||||
"",
|
||||
};
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
assertTrue("token[" + i + "] was '" + tokens[i]
|
||||
+ "' but was expected to be '" + expected[i]
|
||||
+ "'",
|
||||
ObjectUtils.equals(expected[i], tokens[i]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void test8() {
|
||||
|
||||
String input = "a b c \"d e\" f ";
|
||||
Tokenizer tok = new Tokenizer(input);
|
||||
tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
|
||||
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
|
||||
tok.setIgnoreEmptyTokens(true);
|
||||
String tokens [] = tok.getAllTokens();
|
||||
|
||||
String expected[] = new String[]
|
||||
{
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d e",
|
||||
"f",
|
||||
};
|
||||
|
||||
assertTrue(tokens.length == expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
assertTrue("token[" + i + "] was '" + tokens[i]
|
||||
+ "' but was expected to be '" + expected[i]
|
||||
+ "'",
|
||||
ObjectUtils.equals(expected[i], tokens[i]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue