Improve Tokenizer with CSV and TSV plus change default to StringTokenizer like

includes code from Matthew Inger


git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137787 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stephen Colebourne 2004-02-14 00:31:55 +00:00
parent 83406a3ffa
commit ca635e0108
2 changed files with 383 additions and 58 deletions

View File

@ -1,7 +1,7 @@
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002-2003 The Apache Software Foundation. All rights
* Copyright (c) 2003-2004 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -62,9 +62,9 @@ import java.util.ListIterator;
* Tokenizes a string based based on delimiters (separators)
* and supporting quoting and ignored character concepts.
* <p>
* This class can split a String into many smaller strings. It aims to do a
* similar job to java util StringTokenizer, however it offers much more
* control and flexibility.
* This class can split a String into many smaller strings.
* It aims to do a similar job to java util StringTokenizer, however it offers
* much more control and flexibility. By default, it is setup like StringTokenizer.
* <p>
* The input String is split into a number of <i>tokens</i>.
* Each token is separated from the next String by a <i>delimiter</i>.
@ -73,39 +73,66 @@ import java.util.ListIterator;
* The processing then strips all the <i>ignored</i> characters from each side of the token.
* The token may also have <i>quotes</i> to mark an area not to be stripped or tokenized.
* Empty tokens may be removed or returned as null.
* This example is based on the CSV tokenizer.
* <pre>
* "a,b,c" - Three tokens "a","b","c" (comma delimiter)
* "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped)
* "a,b,c" - Three tokens "a","b","c" (comma delimiter)
* "a, b , c" - Three tokens "a","b","c" (ignored space characters stripped)
* "a, " b ", c" - Three tokens "a"," b ","c" (quoted text untouched)
* </pre>
* <p>
* By default, this tokenizer has the following properties:
* <pre>
* Property Default
* --------- -------
* delimiter , (comma)
* quote " (double quote)
* ignored char &lt;= 32 (as per trim)
* emptyTokenAsNull false
* ignoreEmptyTokens false
* </pre>
*
* This tokenizer has the following properties and options:
*
* <table>
* <tr>
* <th>Property</th><th>Type</th><th>Default</th>
* </tr>
* <tr>
* <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
* </tr>
* <tr>
* <td>quote</td><td>NoneMatcher</td><td>{}</td>
* </tr>
* <tr>
* <td>ignore</td><td>NoneMatcher</td><td>{}</td>
* </tr>
* <tr>
* <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
* </tr>
* <tr>
* <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
* </tr>
* </table>
*
* @author Matthew Inger
* @author Stephen Colebourne
* @author Gary D. Gregory
* @since 2.1
* @version $Id: Tokenizer.java,v 1.3 2004/02/13 01:58:50 ggregory Exp $
* @version $Id: Tokenizer.java,v 1.4 2004/02/14 00:31:55 scolebourne Exp $
*/
public class Tokenizer implements ListIterator {
// TODO: Constructors
// TODO: Tests
// TODO: Static factories CSV/StringTokenizer
public class Tokenizer implements ListIterator, Cloneable {
/**
* A Matcher which matches the comma character.
* Best used for <code>delimiter</code>.
*/
public static final Matcher COMMA_MATCHER = new CharMatcher(',');
/**
* A Matcher which matches the tab character.
* Best used for <code>delimiter</code>.
*/
public static final Matcher TAB_MATCHER = new CharMatcher('\t');
/**
* A Matcher which matches the space character.
* Best used for <code>delimiter</code>.
*/
public static final Matcher SPACE_MATCHER = new CharMatcher(' ');
/**
* A Matcher which matches the same characters as StringTokenizer,
* namely space, tab, newline, formfeed.
* Best used for <code>delimiter</code>.
*/
public static final Matcher SPLIT_MATCHER = new CharSetMatcher(" \t\n\r\f");
/**
* A Matcher which matches the double quote character.
* Best used for <code>quote</code>.
@ -115,98 +142,199 @@ public class Tokenizer implements ListIterator {
* A Matcher which matches the String trim() whitespace characters.
* Best used for <code>ignored</code>.
*/
public static final Matcher SPACES_MATCHER = new TrimMatcher();
public static final Matcher TRIM_MATCHER = new TrimMatcher();
/**
* A Matcher that matches no characters. Don't use this for delimiters!
* Best used for <code>ignored</code>.
*/
public static final Matcher NONE_MATCHER = new NoMatcher();
private static final Tokenizer CSV_TOKENIZER_PROTOTYPE;
private static final Tokenizer TSV_TOKENIZER_PROTOTYPE;
static {
CSV_TOKENIZER_PROTOTYPE = new Tokenizer(StringUtils.EMPTY);
CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(COMMA_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
TSV_TOKENIZER_PROTOTYPE = new Tokenizer(StringUtils.EMPTY);
TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(TAB_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(DOUBLE_QUOTE_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(TRIM_MATCHER);
TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
}
/** The text to work on */
private char chars[];
/** The input text, null if char[] input */
private String text;
/** The parsed tokens */
private String tokens[];
/** The current iteration position */
private int tokenPos;
/** The delimiter matcher */
private Matcher delim = COMMA_MATCHER;
private Matcher delim = SPLIT_MATCHER;
/** The quote matcher */
private Matcher quote = DOUBLE_QUOTE_MATCHER;
private Matcher quote = NONE_MATCHER;
/** The ignored matcher */
private Matcher ignored = SPACES_MATCHER;
private Matcher ignored = NONE_MATCHER;
/** Whether to return empty tokens as null */
private boolean emptyAsNull = false;
/** Whether to ignore empty tokens */
private boolean ignoreEmptyTokens = false;
private boolean ignoreEmptyTokens = true;
//-----------------------------------------------------------------------
/**
* Constructor.
* Get a tokenizer instance which parses Comma Seperated Value
* strings. You must call a "reset" method to set the string which
* you want to parse.
*/
public static final Tokenizer getCSVInstance() {
return (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
}
/**
* Get a tokenizer instance which parses Comma Seperated Value
* strings, initializing it with the given input.
*
* @param input the string to parse
*/
public static final Tokenizer getCSVInstance(String input) {
Tokenizer tok = (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
tok.reset(input);
return tok;
}
/**
* Get a tokenizer instance which parses Comma Seperated Value
* strings, initializing it with the given input.
*
* @param input the text to parse
*/
public static final Tokenizer getCSVInstance(char[] input) {
Tokenizer tok = (Tokenizer)(CSV_TOKENIZER_PROTOTYPE.clone());
tok.reset(input);
return tok;
}
/**
* Get a tokenizer instance which parses Tab Seperated Value
* strings. You must call a "reset" method to set the string which
* you want to parse.
*/
public static final Tokenizer getTSVInstance() {
return (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
}
/**
* Get a tokenizer instance which parses Tab Seperated Value
* strings, initializing it with the given input.
*
* @param input the string to parse
*/
public static final Tokenizer getTSVInstance(String input) {
Tokenizer tok = (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
tok.reset(input);
return tok;
}
/**
* Get a tokenizer instance which parses Tab Seperated Value
* strings, initializing it with the given input.
*
* @param input the text to parse
*/
public static final Tokenizer getTSVInstance(char[] input) {
Tokenizer tok = (Tokenizer)(TSV_TOKENIZER_PROTOTYPE.clone());
tok.reset(input);
return tok;
}
//-----------------------------------------------------------------------
/**
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
*/
public Tokenizer(String input) {
this(input.toCharArray());
super();
this.text = input;
this.chars = input.toCharArray(); // no clone as toCharArray() clones
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param delim the field delimiter character
*/
public Tokenizer(String input, char delim) {
this(input.toCharArray(), delim);
this(input);
setDelimiterChar(delim);
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param delim the field delimiter character
*/
public Tokenizer(String input, CharSetMatcher delim) {
this(input.toCharArray(), delim);
this(input);
setDelimiterMatcher(delim);
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param delim the field delimiter character
* @param quote the field quoted string character
*/
public Tokenizer(String input, char delim, char quote) {
this(input.toCharArray(), delim, quote);
this(input, delim);
setQuoteChar(quote);
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param delim the field delimiter character
* @param quote the field quoted string character
*/
public Tokenizer(String input, CharSetMatcher delim, CharSetMatcher quote) {
this(input.toCharArray(), delim, quote);
this(input, delim);
setQuoteMatcher(quote);
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param input the string which is to be parsed, cloned
*/
public Tokenizer(char[] input) {
super();
this.text = null;
this.chars = (char[]) input.clone();
this.tokenPos = 0;
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param input the string which is to be parsed, cloned
* @param delim the field delimiter character
*/
public Tokenizer(char[] input, char delim) {
@ -215,9 +343,10 @@ public class Tokenizer implements ListIterator {
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param input the string which is to be parsed, cloned
* @param delim the field delimiter character
*/
public Tokenizer(char[] input, CharSetMatcher delim) {
@ -226,9 +355,10 @@ public class Tokenizer implements ListIterator {
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param input the string which is to be parsed, cloned
* @param delim the field delimiter character
* @param quote the field quoted string character
*/
@ -238,9 +368,10 @@ public class Tokenizer implements ListIterator {
}
/**
* Constructor.
* Constructs a tokenizer splitting on space, tab, newline and formfeed
* as per StringTokenizer.
*
* @param input the string which is to be parsed
* @param input the string which is to be parsed, cloned
* @param delim the field delimiter character
* @param quote the field quoted string character
*/
@ -307,6 +438,32 @@ public class Tokenizer implements ListIterator {
tokens = null;
}
/**
* Reset this tokenizer, giving it a new input string to parse.
* In this manner you can re-use a tokenizer with the same settings
* on multiple input lines.
*
* @param input the new string to tokenize
*/
public void reset(String input) {
reset();
this.text = input;
chars = input.toCharArray(); // no clone as toCharArray() clones
}
/**
* Reset this tokenizer, giving it a new input string to parse.
* In this manner you can re-use a tokenizer with the same settings
* on multiple input lines.
*
* @param input the new character array to tokenize, cloned
*/
public void reset(char [] input) {
reset();
this.text = null;
chars = (char[]) input.clone();
}
// ListIterator
//-----------------------------------------------------------------------
/**
@ -473,15 +630,18 @@ public class Tokenizer implements ListIterator {
token.setLength(0);
int len = chars.length;
// skip all leading whitespace, unless it is the
// Skip all leading whitespace, unless it is the
// field delimiter or the quote character
while (start < len &&
ignored.isMatch(chars[start]) &&
!delim.isMatch(chars[start]) &&
!quote.isMatch(chars[start])) {
start++;
int current = start;
while (current < len &&
ignored.isMatch(chars[current]) &&
!delim.isMatch(chars[current]) &&
!quote.isMatch(chars[current])) {
current++;
}
start = current;
// Read the token depending on what the first
// character is like
if (delim.isMatch(chars[start])) {
@ -763,6 +923,36 @@ public class Tokenizer implements ListIterator {
this.ignoreEmptyTokens = ignoreEmptyTokens;
}
//-----------------------------------------------------------------------
/**
* Gets the String content that the tokenizer is parsing.
*
* @return the string content being parsed
*/
public String getContent() {
if (text == null) {
text = new String(chars);
}
return text;
}
//-----------------------------------------------------------------------
/**
* Create a new instance of this Tokenizer.
* The new instance is reset so that it will be at the start of the token list.
*/
public Object clone() {
try {
Tokenizer cloned = (Tokenizer) super.clone();
// chars[] does not need additional clone as it is treated as immutable
cloned.reset();
return cloned;
} catch (CloneNotSupportedException ex) {
return null;
}
}
//-----------------------------------------------------------------------
/**
* Class used to define a set of characters for matching purposes.
@ -801,7 +991,9 @@ public class Tokenizer implements ListIterator {
* @param chars the characters to match, must not be null
*/
public CharSetMatcher(String chars) {
this(chars.toCharArray());
super();
this.chars = chars.toCharArray();
Arrays.sort(this.chars);
}
/**

View File

@ -1,7 +1,7 @@
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002-2003 The Apache Software Foundation. All rights
* Copyright (c) 2003-2004 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -90,6 +90,9 @@ public class TokenizerTest extends TestCase {
String input = "a;b;c;\"d;\"\"e\";f; ; ;";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterChar(';');
tok.setQuoteChar('"');
tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
tok.setIgnoreEmptyTokens(false);
String tokens [] = tok.getAllTokens();
String expected[] = new String[]
@ -120,7 +123,9 @@ public class TokenizerTest extends TestCase {
String input = "a;b;c ;\"d;\"\"e\";f; ; ;";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterChar(';');
tok.setQuoteChar('"');
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
tok.setIgnoreEmptyTokens(false);
String tokens [] = tok.getAllTokens();
String expected[] = new String[]
@ -151,7 +156,9 @@ public class TokenizerTest extends TestCase {
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterChar(';');
tok.setQuoteChar('"');
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
tok.setIgnoreEmptyTokens(false);
String tokens [] = tok.getAllTokens();
String expected[] = new String[]
@ -182,6 +189,8 @@ public class TokenizerTest extends TestCase {
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterChar(';');
tok.setQuoteChar('"');
tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
tok.setIgnoreEmptyTokens(true);
String tokens [] = tok.getAllTokens();
@ -210,6 +219,9 @@ public class TokenizerTest extends TestCase {
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterChar(';');
tok.setQuoteChar('"');
tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
tok.setIgnoreEmptyTokens(false);
tok.setEmptyTokenAsNull(true);
String tokens [] = tok.getAllTokens();
@ -241,6 +253,9 @@ public class TokenizerTest extends TestCase {
String input = "a;b; c;\"d;\"\"e\";f; ; ;";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterChar(';');
tok.setQuoteChar('"');
tok.setIgnoredMatcher(Tokenizer.TRIM_MATCHER);
tok.setIgnoreEmptyTokens(false);
// tok.setTreatingEmptyAsNull(true);
String tokens [] = tok.getAllTokens();
@ -285,7 +300,8 @@ public class TokenizerTest extends TestCase {
String input = "a b c \"d e\" f ";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
tok.setDelimiterMatcher(Tokenizer.SPACE_MATCHER);
tok.setQuoteMatcher(Tokenizer.DOUBLE_QUOTE_MATCHER);
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
tok.setIgnoreEmptyTokens(false);
String tokens [] = tok.getAllTokens();
@ -317,7 +333,8 @@ public class TokenizerTest extends TestCase {
String input = "a b c \"d e\" f ";
Tokenizer tok = new Tokenizer(input);
tok.setDelimiterMatcher(Tokenizer.SPACES_MATCHER);
tok.setDelimiterMatcher(Tokenizer.SPACE_MATCHER);
tok.setQuoteMatcher(Tokenizer.DOUBLE_QUOTE_MATCHER);
tok.setIgnoredMatcher(Tokenizer.NONE_MATCHER);
tok.setIgnoreEmptyTokens(true);
String tokens [] = tok.getAllTokens();
@ -341,4 +358,120 @@ public class TokenizerTest extends TestCase {
}
public void testBasic1() {
String input = "a b c";
Tokenizer tok = new Tokenizer(input);
assertEquals("a", tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
}
public void testBasic2() {
String input = "a \nb\fc";
Tokenizer tok = new Tokenizer(input);
assertEquals("a", tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
}
public void testBasic3() {
String input = "a \nb\u0001\fc";
Tokenizer tok = new Tokenizer(input);
assertEquals("a", tok.next());
assertEquals("b\u0001", tok.next());
assertEquals("c", tok.next());
}
public void testBasic4() {
String input = "a \"b\" c";
Tokenizer tok = new Tokenizer(input);
assertEquals("a", tok.next());
assertEquals("\"b\"", tok.next());
assertEquals("c", tok.next());
}
public void testBasicQuoted1() {
String input = "a \"b\" c";
Tokenizer tok = new Tokenizer(input, ' ', '"');
assertEquals("a", tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
}
public void testBasicDelim1() {
String input = "a:b:c";
Tokenizer tok = new Tokenizer(input, ':');
assertEquals("a", tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
}
public void testBasicDelim2() {
String input = "a:b:c";
Tokenizer tok = new Tokenizer(input, ',');
assertEquals("a:b:c", tok.next());
}
public void testBasicEmpty1() {
String input = "a b c";
Tokenizer tok = new Tokenizer(input);
tok.setIgnoreEmptyTokens(false);
assertEquals("a", tok.next());
assertEquals("", tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
}
public void testBasicEmpty2() {
String input = "a b c";
Tokenizer tok = new Tokenizer(input);
tok.setIgnoreEmptyTokens(false);
tok.setEmptyTokenAsNull(true);
assertEquals("a", tok.next());
assertEquals(null, tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
}
public void testGetContent() {
String input = "a b c \"d e\" f ";
Tokenizer tok = new Tokenizer(input);
assertSame(input, tok.getContent());
tok = new Tokenizer(input.toCharArray());
assertEquals(input, tok.getContent());
}
public void testReset() {
String input = "a b c";
Tokenizer tok = new Tokenizer(input);
assertEquals("a", tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
tok.reset();
assertEquals("a", tok.next());
assertEquals("b", tok.next());
assertEquals("c", tok.next());
tok.reset("d e");
assertEquals("d", tok.next());
assertEquals("e", tok.next());
tok.reset("f g".toCharArray());
assertEquals("f", tok.next());
assertEquals("g", tok.next());
}
public void testMatcher() {
assertEquals(true, Tokenizer.SPACE_MATCHER.isMatch(' '));
assertEquals(false, Tokenizer.SPACE_MATCHER.isMatch('\n'));
assertEquals(false, Tokenizer.SPACE_MATCHER.isMatch('\u0001'));
assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch(' '));
assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch('\n'));
assertEquals(true, Tokenizer.TRIM_MATCHER.isMatch('\u0001'));
assertEquals(true, Tokenizer.SPLIT_MATCHER.isMatch(' '));
assertEquals(true, Tokenizer.SPLIT_MATCHER.isMatch('\n'));
assertEquals(false, Tokenizer.SPLIT_MATCHER.isMatch('\u0001'));
}
}