Initial checkin of the Tokenizer class

git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137709 13f79535-47bb-0310-9956-ffa450edef68
2003-11-17 23:02:18 +00:00 · 2003-11-17 23:02:18 +00:00 · 71de3f7ed9
parent f6c1ac465f
commit 71de3f7ed9
2 changed files with 1222 additions and 0 deletions
--- a/src/java/org/apache/commons/lang/Tokenizer.java
+++ b/src/java/org/apache/commons/lang/Tokenizer.java
@ -0,0 +1,878 @@
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2002-2003 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if
+ *    any, must include the following acknowledgement:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgement may appear in the software itself,
+ *    if and wherever such third-party acknowledgements normally appear.
+ *
+ * 4. The names "The Jakarta Project", "Commons", and "Apache Software
+ *    Foundation" must not be used to endorse or promote products derived
+ *    from this software without prior written permission. For written
+ *    permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache"
+ *    nor may "Apache" appear in their names without prior written
+ *    permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.lang;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.ListIterator;
+
+/**
+ * Tokenizes a string based on delimiters (separators)
+ * and supporting quoting and ignored character concepts.
+ * <p>
+ * This class can split a String into many smaller strings. It aims to do a
+ * similar job to java util StringTokenizer, however it offers much more
+ * control and flexibility.
+ * <p>
+ * The input String is split into a number of <i>tokens</i>.
+ * Each token is separated from the next String by a <i>delimiter</i>.
+ * One or more delimiter characters must be specified.
+ * <p>
+ * The processing then strips all the <i>ignored</i> characters from each side of the token.
+ * The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized.
+ * Empty tokens may be removed or returned as null.
+ * <pre>
+ * "a,b,c"       - Three tokens "a","b","c" (comma delimiter)
+ * "a, b , c"    - Three tokens "a","b","c" (ignored space characters stripped)
+ * "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched)
+ * </pre>
+ * <p>
+ * By default, this tokenizer has the following properties:
+ * <pre>
+ * Property                     Default
+ * ---------                    -------
+ * delimiter                    ,  (comma)
+ * quote                        "  (double quote)
+ * ignored                      char &lt;= 32 (as per trim)
+ * emptyTokenAsNull             false
+ * ignoreEmptyTokens            false
+ * </pre>
+ *
+ * @author Matthew Inger
+ * @author Stephen Colebourne
+ */
+public class Tokenizer implements ListIterator {
+    // TODO: Constructors
+    // TODO: Tests
+    // TODO: Static factories CSV/StringTokenizer
+    
+    /**
+     * A Matcher which matches the comma character.
+     * Best used for <code>delimiter</code>.
+     */
+    public static final Matcher COMMA_MATCHER = new CharMatcher(',');
+    /**
+     * A Matcher which matches the double quote character.
+     * Best used for <code>quote</code>.
+     */
+    public static final Matcher DOUBLE_QUOTE_MATCHER = new CharMatcher('"');
+    /**
+     * A Matcher which matches the String trim() whitespace characters.
+     * Best used for <code>ignored</code>.
+     */
+    public static final Matcher SPACES_MATCHER = new TrimMatcher();
+    /**
+     * A Matcher that matches no characters. Don't use this for delimiters!
+     * Best used for <code>ignored</code>.
+     */
+    public static final Matcher NONE_MATCHER = new NoMatcher();
+
+    /** The text to work on */
+    private char chars[];
+    /** The parsed tokens */
+    private String tokens[];
+    /** The current iteration position */
+    private int tokenPos;
+
+    /** The delimiter matcher */
+    private Matcher delim = COMMA_MATCHER;
+    /** The quote matcher */
+    private Matcher quote = DOUBLE_QUOTE_MATCHER;
+    /** The ignored matcher */
+    private Matcher ignored = SPACES_MATCHER;
+    /** Whether to return empty tokens as null */
+    private boolean emptyAsNull = false;
+    /** Whether to ignore empty tokens */
+    private boolean ignoreEmptyTokens = false;
+
+    //-----------------------------------------------------------------------
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     */
+    public Tokenizer(String input) {
+        this(input.toCharArray());
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim  the field delimiter character
+     */
+    public Tokenizer(String input, char delim) {
+        this(input.toCharArray(), delim);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim  the field delimiter character
+     */
+    public Tokenizer(String input, CharSetMatcher delim) {
+        this(input.toCharArray(), delim);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim  the field delimiter character
+     * @param quote  the field quoted string character
+     */
+    public Tokenizer(String input, char delim, char quote) {
+        this(input.toCharArray(), delim, quote);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim  the field delimiter character
+     * @param quote  the field quoted string character
+     */
+    public Tokenizer(String input, CharSetMatcher delim, CharSetMatcher quote) {
+        this(input.toCharArray(), delim, quote);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     */
+    public Tokenizer(char[] input) {
+        super();
+        this.chars = (char[]) input.clone();
+        this.tokenPos = 0;
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim the field delimiter character
+     */
+    public Tokenizer(char[] input, char delim) {
+        this(input);
+        setDelimiterChar(delim);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim  the field delimiter character
+     */
+    public Tokenizer(char[] input, CharSetMatcher delim) {
+        this(input);
+        setDelimiterMatcher(delim);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim  the field delimiter character
+     * @param quote  the field quoted string character
+     */
+    public Tokenizer(char[] input, char delim, char quote) {
+        this(input, delim);
+        setQuoteChar(quote);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param input  the string which is to be parsed
+     * @param delim  the field delimiter character
+     * @param quote  the field quoted string character
+     */
+    public Tokenizer(char[] input, CharSetMatcher delim, CharSetMatcher quote) {
+        this(input, delim);
+        setQuoteMatcher(quote);
+    }
+
+    // API
+    //-----------------------------------------------------------------------
+    /**
+     * Gets the number of tokens found in the String.
+     * 
+     * @return the number of matched tokens
+     */
+    public int size() {
+        tokenize();
+        return tokens.length;
+    }
+
+    /**
+     * Gets the next token from the String.
+     * 
+     * @return the next sequential token, or null when no more tokens are found
+     */
+    public String nextToken() {
+        if (hasNext()) {
+            return tokens[tokenPos++];
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Gets the previous token from the String.
+     * 
+     * @return the previous sequential token, or null when no more tokens are found
+     */
+    public String previousToken() {
+        if (hasPrevious()) {
+            return tokens[--tokenPos];
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Gets a copy of the full token list.
+     * 
+     * @return the tokens as a String array
+     */
+    public String[] getAllTokens() {
+        tokenize();
+        return (String[]) tokens.clone();
+    }
+
+    /**
+     * Resets this tokenizer, forgetting all parsing and iteration already completed.
+     * <p>
+     * This method allows the same tokenizer to be reused for the same String.
+     */
+    public void reset() {
+        tokenPos = 0;
+        tokens = null;
+    }
+
+    // ListIterator
+    //-----------------------------------------------------------------------
+    /**
+     * Checks whether there are any more tokens.
+     * 
+     * @return true if there are more tokens
+     */
+    public boolean hasNext() {
+        tokenize();
+        return (tokenPos < tokens.length);
+    }
+
+    /**
+     * Gets the next token. This method is equivalent to {@link #nextToken()}.
+     * 
+     * @return the next String token
+     */
+    public Object next() {
+        return nextToken();
+    }
+
+    /**
+     * Gets the index of the next token to return.
+     * 
+     * @return the next token index
+     */
+    public int nextIndex() {
+        return tokenPos;
+    }
+
+    /**
+     * Checks whether there are any previous tokens that can be iterated to.
+     * 
+     * @return true if there are previous tokens
+     */
+    public boolean hasPrevious() {
+        tokenize();
+        return (tokenPos > 0);
+    }
+
+    /**
+     * Gets the token previous to the last returned token.
+     * 
+     * @return the previous token
+     */
+    public Object previous() {
+        return previousToken();
+    }
+
+    /**
+     * Gets the index of the previous token.
+     * 
+     * @return the previous token index
+     */
+    public int previousIndex() {
+        return (tokenPos - 1);
+    }
+
+    /**
+     * Unsupported ListIterator operation.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    public void remove() {
+        throw new UnsupportedOperationException("remove() is unsupported");
+    }
+
+    /**
+     * Unsupported ListIterator operation.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    public void set(Object obj) {
+        throw new UnsupportedOperationException("set() is unsupported");
+    }
+
+    /**
+     * Unsupported ListIterator operation.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    public void add(Object obj) {
+        throw new UnsupportedOperationException("add() is unsupported");
+    }
+
+    // Implementation
+    //-----------------------------------------------------------------------
+    /**
+     * Performs the tokenization if it hasn't already been done.
+     */
+    private void tokenize() {
+        if (tokens == null) {
+            this.tokens = readTokens();
+        }
+    }
+
+    /**
+     * Parses the whole character array into tokens.
+     * <p>
+     * Tokens are read one after another with <code>readNextToken</code> and
+     * passed through <code>addToken</code>, which applies the empty-token
+     * settings. If the final character of the input is a delimiter, one extra
+     * empty token is offered to represent the field after it.
+     *
+     * @return the tokens that were accepted, never null
+     */
+    private String[] readTokens() {
+        final int length = chars.length;
+        // scratch space shared by every quoted-token read
+        final char scratch[] = new char[length];
+        final StringBuffer buffer = new StringBuffer();
+        final List found = new ArrayList();
+
+        // the reader hands back the start of the next field, so the loop
+        // variable is advanced inside the body rather than in the header
+        for (int pos = 0; pos < length;) {
+            pos = readNextToken(pos, scratch, buffer);
+            addToken(found, buffer.toString());
+            buffer.setLength(0);
+
+            // a delimiter as the very last character implies a final empty field
+            if (pos == length && delim.isMatch(chars[pos - 1])) {
+                addToken(found, new String());
+            }
+        }
+
+        String result[] = new String[found.size()];
+        found.toArray(result);
+        return result;
+    }
+
+    /**
+     * Appends a token to the list, applying the empty-token settings.
+     * <p>
+     * A null or zero-length token counts as empty. Empty tokens are dropped
+     * when <code>ignoreEmptyTokens</code> is set; otherwise they are stored as
+     * null when <code>emptyAsNull</code> is set, or unchanged when it is not.
+     * 
+     * @param list  the list to add to
+     * @param tok  the token to add
+     */
+    private void addToken(List list, String tok) {
+        boolean empty = (tok == null || tok.length() == 0);
+        if (empty && ignoreEmptyTokens) {
+            return;
+        }
+        list.add((empty && emptyAsNull) ? null : tok);
+    }
+
+    /**
+     * Reads character by character through the String to get the next token.
+     * <p>
+     * Leading ignored characters are skipped first, unless they also match the
+     * delimiter or the quote matcher. The first remaining character selects
+     * how the token is read: a delimiter gives an empty token, a quote starts
+     * a quoted token and anything else starts an unquoted token. If nothing but
+     * ignored characters remained, the token is empty.
+     * 
+     * @param start  the first character of field
+     * @param cbuf  a character buffer for temporary computations (so we
+     *  don't have to keep recreating one)
+     * @param token  a StringBuffer where the output token will go
+     * @return the starting position of the next field (the character
+     *  immediately after the delimiter), or a value not less than the
+     *  length of the string if the end of the string was reached
+     */
+    private int readNextToken(int start, char cbuf[], StringBuffer token) {
+        token.setLength(0);
+        int len = chars.length;
+
+        // skip all leading whitespace, unless it is the
+        // field delimiter or the quote character
+        while (start < len &&
+                ignored.isMatch(chars[start]) &&
+                !delim.isMatch(chars[start]) &&
+                !quote.isMatch(chars[start])) {
+            start++;
+        }
+
+        // Only ignored characters were left (e.g. "a, "), so there is no
+        // character at chars[start] to inspect. Report an empty token rather
+        // than indexing past the end of the array.
+        if (start >= len) {
+            return start;
+        }
+
+        // Read the token depending on what the first
+        // character is like
+        if (delim.isMatch(chars[start])) {
+            start = readEmpty(start, token);
+        } else if (quote.isMatch(chars[start])) {
+            start = readQuoted(start, cbuf, token);
+        } else {
+            start = readUnquoted(start, token);
+        }
+
+        return start;
+    }
+
+    /**
+     * Reads a quoted string token.
+     * <p>
+     * Scanning starts in quoting mode just after the opening quote. While
+     * quoting, a quote character immediately followed by the same character is
+     * stored as one literal quote character; a quote on its own leaves quoting
+     * mode; every other character is stored as is. Outside quoting mode a
+     * delimiter ends the token, a quote re-enters quoting mode and any other
+     * character is stored as is.
+     * 
+     * @param start The first character of field (this will be the quote
+     *              character)
+     * @param cbuf A character buffer for temporary computations (so we
+     *             don't have to keep recreating one)
+     * @param token A StringBuffer where the output token will go.
+     * @return The index just past the position where scanning stopped: the
+     *         character immediately after the delimiter, or one more than the
+     *         length of the string if the end of the input was reached.
+     */
+    private int readQuoted(int start, char cbuf[], StringBuffer token) {
+        // Loop until we've found the end of the quoted
+        // string or the end of the input
+        int cbufcnt = 0;
+        int nd = start + 1;
+        boolean done = false;
+        boolean quoting = true;
+        int len = chars.length;
+
+        while (nd < len && !done) {
+            // Quoting mode can occur several times throughout
+            // a given string, so must switch between quoting
+            // and non-quoting until we encounter a non-quoted
+            // delimiter, or end of string, which indicates end
+            // of token.
+            if (quoting) {
+                // If we've found a quote character, see if it's
+                // followed by a second quote.  If so, then we need
+                // to actually put the quote character into the token
+                // rather than end the token.
+                if (quote.isMatch(chars[nd]) &&
+                        nd + 1 < len &&
+                        chars[nd + 1] == chars[nd]) {
+                    cbuf[cbufcnt++] = chars[nd];
+                    // step over the first quote of the pair; the shared
+                    // increment below steps over the second
+                    nd++;
+                }
+                // End the quoting if we get to this condition
+                else if (quote.isMatch(chars[nd])) {
+                    quoting = false;
+                }
+                // Otherwise, just put the character into the token
+                else {
+                    cbuf[cbufcnt++] = chars[nd];
+                }
+                nd++;
+            }
+            // If we're not in quoting mode, if we encounter
+            // a delimiter, the token is ended.  If we encounter
+            // a quote, we start quoting mode, otherwise, just append
+            // the character
+            else {
+                // If we're at a delimiter the token is complete; nd is left
+                // on the delimiter so the return value points just past it
+                if (delim.isMatch(chars[nd])) {
+                    done = true;
+                } else {
+                    if (quote.isMatch(chars[nd])) {
+                        quoting = true;
+                    } else {
+                        cbuf[cbufcnt++] = chars[nd];
+                    }
+                    nd++;
+                }
+            }
+        }
+
+        token.append(cbuf, 0, cbufcnt);
+
+        return nd + 1;
+    }
+
+    /**
+     * Read an unquoted string until a delimiter is found.
+     * <p>
+     * Ignored characters at the end of the token are stripped, so that
+     * <code>"a, b , c"</code> yields <code>"b"</code> rather than
+     * <code>"b "</code>. As with leading characters, an ignored character
+     * that also matches the quote matcher is kept.
+     * 
+     * @param start  the first character of field
+     * @param token  a StringBuffer where the output token will go.
+     * @return  the starting position of the next field (the character
+     *  immediately after the delimiter), or one more than the length of
+     *  the string if the end of the string was reached
+     */
+    private int readUnquoted(int start, StringBuffer token) {
+        int len = chars.length;
+        // Skip ahead until we get to a delimiter character, or
+        // the end of the input
+        int nd = start + 1;
+        while (nd < len && !delim.isMatch(chars[nd])) {
+            nd++;
+        }
+
+        // Walk back over trailing ignored characters so the token is
+        // stripped on both sides, as the class documentation promises
+        int end = Math.min(nd, len);
+        while (end > start &&
+                ignored.isMatch(chars[end - 1]) &&
+                !quote.isMatch(chars[end - 1])) {
+            end--;
+        }
+
+        token.append(chars, start, end - start);
+
+        return nd + 1;
+    }
+
+    /**
+     * Read an empty string (basically, if a delimiter is found right
+     * after another delimiter).
+     * 
+     * @param start  the first character of field (this will be the delimiter
+     *  character)
+     * @param token  a StringBuffer where the output token will go.
+     * @return The starting position of the next field (the character
+     *  immediately after the delimiter, or if end of string found,
+     *  then the length of string.
+     */
+    private int readEmpty(int start, StringBuffer token) {
+        token.setLength(0);
+        return start + 1;
+    }
+
+    // Delimiter
+    //-----------------------------------------------------------------------
+    /**
+     * Gets the field delimiter matcher.
+     * 
+     * @return the delimiter matcher in use
+     */
+    public Matcher getDelimiterMatcher() {
+        return delim;
+    }
+
+    /**
+     * Sets the field delimiter matcher.
+     * <p>
+     * The delimiter is used to separate one token from another.
+     * A null matcher is ignored and the current matcher is kept.
+     * Tokens are parsed once and cached, so a change made after parsing
+     * only takes effect once <code>reset()</code> has been called.
+     * 
+     * @param delim  the delimiter matcher to use, null ignored
+     */
+    public void setDelimiterMatcher(Matcher delim) {
+        if (delim != null) {
+            this.delim = delim;
+        }
+    }
+
+    /**
+     * Sets the field delimiter character
+     * 
+     * @param delim  the delimiter character to use
+     */
+    public void setDelimiterChar(char delim) {
+        setDelimiterMatcher(new CharMatcher(delim));
+    }
+
+    // Quote
+    //-----------------------------------------------------------------------
+    /**
+     * Gets the quote matcher currently in use.
+     * <p>
+     * The quote character is used to wrap data between the tokens.
+     * This enables delimiters to be entered as data.
+     * The default value is '"' (double quote).
+     * 
+     * @return the quote matcher in use
+     */
+    public Matcher getQuoteMatcher() {
+        return quote;
+    }
+
+    /**
+     * Set the quote matcher to use.
+     * <p>
+     * The quote character is used to wrap data between the tokens.
+     * This enables delimiters to be entered as data.
+     * 
+     * @param quote  the quote matcher to use, null ignored
+     */
+    public void setQuoteMatcher(Matcher quote) {
+        if (quote != null) {
+            this.quote = quote;
+        }
+    }
+
+    /**
+     * Sets the quote character to use.
+     * <p>
+     * The quote character is used to wrap data between the tokens.
+     * This enables delimiters to be entered as data.
+     * 
+     * @param quote  the quote character to use
+     */
+    public void setQuoteChar(char quote) {
+        setQuoteMatcher(new CharMatcher(quote));
+    }
+
+    // Ignored
+    //-----------------------------------------------------------------------
+    /**
+     * Gets the ignored character matcher.
+     * <p>
+     * These characters are ignored when parsing the String, unless they are
+     * within a quoted region.
+     * The default value is space (' ') and all char control characters (32 and less).
+     * 
+     * @return the ignored matcher in use
+     */
+    public Matcher getIgnoredMatcher() {
+        return ignored;
+    }
+
+    /**
+     * Set the matcher for characters to ignore.
+     * <p>
+     * These characters are ignored when parsing the String, unless they are
+     * within a quoted region.
+     * 
+     * @param ignored  the ignored matcher to use, null ignored
+     */
+    public void setIgnoredMatcher(Matcher ignored) {
+        if (ignored != null) {
+            this.ignored = ignored;
+        }
+    }
+
+    /**
+     * Set the character to ignore.
+     * <p>
+     * This character is ignored when parsing the String, unless it is
+     * within a quoted region. The character is wrapped in a
+     * <code>CharMatcher</code> and installed as the ignored matcher.
+     * 
+     * @param ignored  the ignored character to use
+     */
+    public void setIgnoredChar(char ignored) {
+        setIgnoredMatcher(new CharMatcher(ignored));
+    }
+
+    //-----------------------------------------------------------------------
+    /**
+     * Gets whether the tokenizer currently returns empty tokens as null.
+     * The default for this property is false.
+     * 
+     * @return true if empty tokens are returned as null
+     */
+    public boolean isEmptyTokenAsNull() {
+        return emptyAsNull;
+    }
+
+    /**
+     * Sets whether the tokenizer should return empty tokens as null.
+     * The default for this property is false.
+     * 
+     * @param emptyAsNull  whether empty tokens are returned as null
+     */
+    public void setEmptyTokenAsNull(boolean emptyAsNull) {
+        this.emptyAsNull = emptyAsNull;
+    }
+
+    //-----------------------------------------------------------------------
+    /**
+     * Gets whether the tokenizer currently ignores empty tokens.
+     * The default for this property is false.
+     * 
+     * @return true if empty tokens are not returned
+     */
+    public boolean isIgnoreEmptyTokens() {
+        return ignoreEmptyTokens;
+    }
+
+    /**
+     * Sets whether the tokenizer should ignore and not return empty tokens.
+     * The default for this property is false.
+     * 
+     * @param ignoreEmptyTokens  whether empty tokens are not returned
+     */
+    public void setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
+        this.ignoreEmptyTokens = ignoreEmptyTokens;
+    }
+
+    //-----------------------------------------------------------------------    
+    /**
+     * Class used to define a set of characters for matching purposes.
+     */
+    public static interface Matcher {
+        /**
+         * Returns true if the specified character matches.
+         * 
+         * @param ch  the character to check for
+         * @return true if matches
+         */
+        boolean isMatch(char ch);
+    }
+    
+    //-----------------------------------------------------------------------    
+    /**
+     * Class used to define a set of characters for matching purposes.
+     */
+    public static final class CharSetMatcher implements Matcher {
+        private char chars[];
+
+        /**
+         * Constructor that creates a matcher from a character array.
+         * 
+         * @param chars  the characters to match, must not be null
+         */
+        public CharSetMatcher(char chars[]) {
+            super();
+            this.chars = (char[]) chars.clone();
+            Arrays.sort(this.chars);
+        }
+
+        /**
+         * Constructor that creates a matcher from a String.
+         * 
+         * @param chars  the characters to match, must not be null
+         */
+        public CharSetMatcher(String chars) {
+            this(chars.toCharArray());
+        }
+
+        /**
+         * Gets the characters being matched.
+         * 
+         * @return the characters being matched
+         */
+        public char[] getChars() {
+            return (char[]) chars.clone();
+        }
+
+        public boolean isMatch(char c) {
+            return (Arrays.binarySearch(chars, c) >= 0);
+        }
+    }
+    
+    //-----------------------------------------------------------------------    
+    /**
+     * Class used to define a character for matching purposes.
+     * A character matches only if it is equal to the single character
+     * this matcher was constructed with.
+     */
+    public static final class CharMatcher implements Matcher {
+        /** The single character this matcher accepts */
+        private char ch;
+
+        /**
+         * Constructor that creates a matcher that matches a single character.
+         * 
+         * @param ch  the character to match
+         */
+        public CharMatcher(char ch) {
+            super();
+            this.ch = ch;
+        }
+
+        /**
+         * Gets the character being matched.
+         * 
+         * @return the character being matched
+         */
+        public char getChar() {
+            return this.ch;
+        }
+
+        /**
+         * Checks whether the specified character is the one being matched.
+         * 
+         * @param ch  the character to check
+         * @return true if the character equals the matcher's character
+         */
+        public boolean isMatch(char ch) {
+            return (this.ch == ch);
+        }
+    }
+    
+    //-----------------------------------------------------------------------    
+    /**
+     * Class used to match no characters.
+     */
+    static final class NoMatcher implements Matcher {
+
+        NoMatcher() {
+            super();
+        }
+
+        public boolean isMatch(char ch) {
+            return false;
+        }
+    }
+    
+    //-----------------------------------------------------------------------    
+    /**
+     * Class used to match whitespace as per trim().
+     */
+    static final class TrimMatcher implements Matcher {
+
+        TrimMatcher() {
+            super();
+        }
+
+        public boolean isMatch(char ch) {
+            return (ch <= 32);
+        }
+    }
+}
--- a/src/test/org/apache/commons/lang/TokenizerTest.java
+++ b/src/test/org/apache/commons/lang/TokenizerTest.java
@ -0,0 +1,344 @@
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2002-2003 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if
+ *    any, must include the following acknowledgement:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgement may appear in the software itself,
+ *    if and wherever such third-party acknowledgements normally appear.
+ *
+ * 4. The names "The Jakarta Project", "Commons", and "Apache Software
+ *    Foundation" must not be used to endorse or promote products derived
+ *    from this software without prior written permission. For written
+ *    permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache"
+ *    nor may "Apache" appear in their names without prior written
+ *    permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+package org.apache.commons.lang;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+/**
+ * Unit test for Tokenizer.
+ *
+ * @author Matthew Inger
+ */
+public class TokenizerTest extends TestCase {
+    
+    /**
+     * JUnit constructor.
+     * @param name
+     */
+    public TokenizerTest(String name) {
+        super(name);
+    }
+
+    public static Test suite() {
+        TestSuite suite = new TestSuite(TokenizerTest.class);
+        suite.setName("TokenizerTest Tests");
+        return suite;
+    }
+
+
+    public static void main(String[] args) {
+        TestRunner.run(suite());
+    }
+
+    //-----------------------------------------------------------------------
+    public void test1() {
+
+        String input = "a;b;c;\"d;\"\"e\";f; ; ;";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterChar(';');
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "b",
+            "c",
+            "d;\"e",
+            "f",
+            "",
+            "",
+            "",
+        };
+
+        assertTrue(tokens.length == expected.length);
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("token[" + i + "] was '" + tokens[i]
+                    + "' but was expected to be '" + expected[i]
+                    + "'",
+                    ObjectUtils.equals(expected[i], tokens[i]));
+        }
+
+    }
+
+
+    public void test2() {
+
+        String input = "a;b;c ;\"d;\"\"e\";f; ; ;";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterChar(';');
+        tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "b",
+            "c ",
+            "d;\"e",
+            "f",
+            " ",
+            " ",
+            "",
+        };
+
+        assertTrue(tokens.length == expected.length);
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("token[" + i + "] was '" + tokens[i]
+                    + "' but was expected to be '" + expected[i]
+                    + "'",
+                    ObjectUtils.equals(expected[i], tokens[i]));
+        }
+
+    }
+
+
+    public void test3() {
+
+        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterChar(';');
+        tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "b",
+            " c",
+            "d;\"e",
+            "f",
+            " ",
+            " ",
+            "",
+        };
+
+        assertTrue(tokens.length == expected.length);
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("token[" + i + "] was '" + tokens[i]
+                    + "' but was expected to be '" + expected[i]
+                    + "'",
+                    ObjectUtils.equals(expected[i], tokens[i]));
+        }
+
+    }
+
+
+    public void test4() {
+
+        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterChar(';');
+        tok.setIgnoreEmptyTokens(true);
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "b",
+            "c",
+            "d;\"e",
+            "f",
+        };
+
+        assertTrue(tokens.length == expected.length);
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("token[" + i + "] was '" + tokens[i]
+                    + "' but was expected to be '" + expected[i]
+                    + "'",
+                    ObjectUtils.equals(expected[i], tokens[i]));
+        }
+
+    }
+
+
+    public void test5() {
+
+        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterChar(';');
+        tok.setEmptyTokenAsNull(true);
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "b",
+            "c",
+            "d;\"e",
+            "f",
+            null,
+            null,
+            null,
+        };
+
+        assertTrue(tokens.length == expected.length);
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("token[" + i + "] was '" + tokens[i]
+                    + "' but was expected to be '" + expected[i]
+                    + "'",
+                    ObjectUtils.equals(expected[i], tokens[i]));
+        }
+
+    }
+
+
+    public void test6() {
+
+        String input = "a;b; c;\"d;\"\"e\";f; ; ;";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterChar(';');
+//        tok.setTreatingEmptyAsNull(true);
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "b",
+            " c",
+            "d;\"e",
+            "f",
+            null,
+            null,
+            null,
+        };
+
+        int nextCount = 0;
+        while (tok.hasNext()) {
+            tok.next();
+            nextCount++;
+        }
+
+        int prevCount = 0;
+        while (tok.hasPrevious()) {
+            tok.previous();
+            prevCount++;
+        }
+
+        assertTrue(tokens.length == expected.length);
+
+        assertTrue("could not cycle through entire token list"
+                + " using the 'hasNext' and 'next' methods",
+                nextCount == expected.length);
+
+        assertTrue("could not cycle through entire token list"
+                + " using the 'hasPrevious' and 'previous' methods",
+                prevCount == expected.length);
+
+    }
+
+
+    public void test7() {
+
+        String input = "a   b c \"d e\" f ";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
+        tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
+        tok.setIgnoreEmptyTokens(false);
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "",
+            "",
+            "b",
+            "c",
+            "d e",
+            "f",
+            "",
+        };
+
+        assertTrue(tokens.length == expected.length);
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("token[" + i + "] was '" + tokens[i]
+                    + "' but was expected to be '" + expected[i]
+                    + "'",
+                    ObjectUtils.equals(expected[i], tokens[i]));
+        }
+
+    }
+
+
+    public void test8() {
+
+        String input = "a   b c \"d e\" f ";
+        Tokenizer tok = new Tokenizer(input);
+        tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
+        tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
+        tok.setIgnoreEmptyTokens(true);
+        String tokens [] = tok.getAllTokens();
+
+        String expected[] = new String[]
+        {
+            "a",
+            "b",
+            "c",
+            "d e",
+            "f",
+        };
+
+        assertTrue(tokens.length == expected.length);
+        for (int i = 0; i < expected.length; i++) {
+            assertTrue("token[" + i + "] was '" + tokens[i]
+                    + "' but was expected to be '" + expected[i]
+                    + "'",
+                    ObjectUtils.equals(expected[i], tokens[i]));
+        }
+
+    }
+
+}