Added a trimmer matcher which is now used to determine

which characters to trim off the left and right of tokens. The ignore matcher is now truly an ignore matcher, which will leave out any matching characters from all tokens. I also fixed a few minor bugs. git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@138002 13f79535-47bb-0310-9956-ffa450edef68
2004-12-23 18:55:48 +00:00 · 2004-12-23 18:55:48 +00:00 · 57b94c34ea
parent d669920cb2
commit 57b94c34ea
1 changed files with 157 additions and 96 deletions
--- a/src/java/org/apache/commons/lang/text/StrTokenizer.java
+++ b/src/java/org/apache/commons/lang/text/StrTokenizer.java
@ -32,14 +32,17 @@ import java.util.ListIterator;
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
- * The processing then strips all the <i>ignored</i> characters from each side of the token.
+ * The processing then strips all the <i>ignored</i> characters from the entire string (this
+ * is useful for removing things like carriage returns, and so forth).
+ * <p>
+ * The processing then strips all the <i>trimmer</i> characters from the ends of the string.
+ * <p>
 * The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized.
 * Empty tokens may be removed or returned as null.
- * This example is based on the CSV tokenizer.
 * <pre>
 * "a,b,c"       - Three tokens "a","b","c"   (comma delimiter)
- * "a, b , c"    - Three tokens "a","b","c"   (ignored space characters stripped)
- * "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched)
+ * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
+ * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
@ -70,7 +73,7 @@ import java.util.ListIterator;
 * @author Stephen Colebourne
 * @author Gary D. Gregory
 * @since 2.1
- * @version $Id: StrTokenizer.java,v 1.1 2004/10/06 22:29:24 scolebourne Exp $
+ * @version $Id: StrTokenizer.java,v 1.2 2004/12/23 18:55:48 mattinger Exp $
 */
 public class StrTokenizer implements ListIterator, Cloneable {

@ -107,12 +110,12 @@ public class StrTokenizer implements ListIterator, Cloneable {
    public static final Matcher DOUBLE_QUOTE_MATCHER = new CharMatcher('"');
    /**
     * A Matcher which matches the String trim() whitespace characters.
-     * Best used for <code>ignored</code>.
+     * Best used for <code>trimmer</code>.
     */
    public static final Matcher TRIM_MATCHER = new TrimMatcher();
    /**
     * A Matcher that matches no characters. Don't use this for delimiters!
-     * Best used for <code>ignored</code>.
+     * Best used for <code>trimmer</code>.
     */
    public static final Matcher NONE_MATCHER = new NoMatcher();

@ -122,14 +125,16 @@ public class StrTokenizer implements ListIterator, Cloneable {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER);
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
-        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
+        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(NONE_MATCHER);
+        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(TRIM_MATCHER);
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER);
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
-        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
+        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(NONE_MATCHER);
+        // Configure the TSV prototype itself (not the CSV one) to trim token ends.
+        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(TRIM_MATCHER);
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }
@ -149,6 +154,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
    private Matcher quote = NONE_MATCHER;
    /** The ignored matcher */
    private Matcher ignored = NONE_MATCHER;
+
+    /** The trimmer matcher, initially TRIM_MATCHER */
+    private Matcher trimmer = TRIM_MATCHER;
+
    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
@ -210,8 +218,13 @@ public class StrTokenizer implements ListIterator, Cloneable {
    }

    //-----------------------------------------------------------------------
+
    /**
-     * Gets a new tokenizer instance which parses Comma Seperated Value strings.
+     * Gets a new tokenizer instance which parses Comma Separated Value strings.
+     * The default for CSV processing is to trim whitespace from both ends
+     * (which can be overridden with the setTrimmerMatcher method).
+     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     */
    public static StrTokenizer getCSVInstance() {
@ -220,9 +233,11 @@ public class StrTokenizer implements ListIterator, Cloneable {

    /**
     * Gets a new tokenizer instance which parses Comma Seperated Value strings
-     * initializing it with the given input.
+     * initializing it with the given input.  The default for CSV processing
+     * is to trim whitespace from both ends (which can be overridden with
+     * the setTrimmerMatcher method).
     *
-     * @param input  the string to parse
+     * @param input  the text to parse
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = (StrTokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
@ -232,7 +247,9 @@ public class StrTokenizer implements ListIterator, Cloneable {

    /**
     * Gets a new tokenizer instance which parses Comma Seperated Value strings
-     * initializing it with the given input.
+     * initializing it with the given input.  The default for CSV processing
+     * is to trim whitespace from both ends (which can be overridden with
+     * the setTrimmerMatcher method).
     *
     * @param input  the text to parse
     */
@ -244,6 +261,9 @@ public class StrTokenizer implements ListIterator, Cloneable {

    /**
     * Gets a new tokenizer instance which parses Tab Seperated Value strings.
+     * The default for TSV processing is to trim whitespace from both ends
+     * (which can be overridden with the setTrimmerMatcher method).
+     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     */
    public static StrTokenizer getTSVInstance() {
@ -251,9 +271,9 @@ public class StrTokenizer implements ListIterator, Cloneable {
    }

    /**
-     * Gets a new tokenizer instance which parses Tab Seperated Value strings
-     * initializing it with the given input.
-     * 
+     * Gets a new tokenizer instance which parses Tab Separated Value strings.
+     * The default for TSV processing is to trim whitespace from both ends
+     * (which can be overridden with the setTrimmerMatcher method).
     * @param input  the string to parse
     */
    public static StrTokenizer getTSVInstance(String input) {
@ -263,10 +283,10 @@ public class StrTokenizer implements ListIterator, Cloneable {
    }

    /**
-     * Gets a new tokenizer instance which parses Tab Seperated Value strings
-     * initializing it with the given input.
-     * 
-     * @param input  the text to parse
+     * Gets a new tokenizer instance which parses Tab Separated Value strings.
+     * The default for TSV processing is to trim whitespace from both ends
+     * (which can be overridden with the setTrimmerMatcher method).
+     * @param input  the string to parse
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = (StrTokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
@ -685,7 +705,6 @@ public class StrTokenizer implements ListIterator, Cloneable {
        int delimLen = 0;
        int quoteLen = 0;
        while (start < len &&
-                (ignoreLen = ignored.isMatch(chars, len, start)) >= 1 &&
                (delimLen = delim.isMatch(chars, len, start)) < 1 &&
                (quoteLen = quote.isMatch(chars, len, start)) < 1) {
            start += ignoreLen;
@ -817,7 +836,23 @@ public class StrTokenizer implements ListIterator, Cloneable {
            pos++;
        }

-        token.append(chars, start, Math.min(pos, len) - start);
+        /* Trim string based on the trimmer matcher */
+        while (trimmer.isMatch(chars, 1, start) > 0) {
+			start++;
+		}
+
+		int length = Math.min(pos, len) - start;
+
+        while (trimmer.isMatch(chars, 1, start + length - 1) > 0) {
+			length--;
+		}
+
+        for (int i=0;i<length;i++) {
+			if (ignored.isMatch(chars, 1, start + i) == 0) {
+				token.append(chars[start + i]);
+			}
+		}
+

        return pos + delimLen;
    }
@ -942,6 +977,32 @@ public class StrTokenizer implements ListIterator, Cloneable {
        }
    }

+    // Trimmer
+    //-----------------------------------------------------------------------
+    /**
+     * Returns the matcher that selects the characters to be trimmed.
+     * <p>
+     * Characters accepted by this matcher are removed from the start and the
+     * end of an unquoted string. The field is initialised to
+     * {@link #TRIM_MATCHER}.
+     *
+     * @return the trimmer matcher currently in use
+     */
+    public Matcher getTrimmerMatcher() {
+        return this.trimmer;
+    }
+
+    /**
+     * Replaces the matcher that selects the characters trimmed from the
+     * beginning and end of an unquoted string.
+     * <p>
+     * Passing <code>null</code> leaves the current trimmer matcher unchanged.
+     *
+     * @param trimmer  the trimmer matcher to use, null ignored
+     */
+    public void setTrimmerMatcher(Matcher trimmer) {
+        if (trimmer == null) {
+            return;
+        }
+        this.trimmer = trimmer;
+    }
+
    /**
     * Set the character to ignore.
     * <p>
@ -1060,7 +1121,7 @@ public class StrTokenizer implements ListIterator, Cloneable {
    /**
     * Class used to define a set of characters for matching purposes.
     */
-    static final class CharSetMatcher implements Matcher {
+    public static final class CharSetMatcher implements Matcher {
        private char[] chars;

        /**
@ -1068,7 +1129,7 @@ public class StrTokenizer implements ListIterator, Cloneable {
         *
         * @param chars  the characters to match, must not be null
         */
-        CharSetMatcher(char chars[]) {
+        public CharSetMatcher(char chars[]) {
            super();
            this.chars = (char[]) chars.clone();
            Arrays.sort(this.chars);