Allow tokenizer state to be adjusted before and after tokenizing

git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@424608 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stephen Colebourne 2006-07-22 17:25:38 +00:00
parent b7b7d7c935
commit 7917cc095b
2 changed files with 74 additions and 41 deletions

View File

@ -16,12 +16,11 @@
package org.apache.commons.lang.text; package org.apache.commons.lang.text;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.ListIterator; import java.util.ListIterator;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import org.apache.commons.lang.ArrayUtils;
/** /**
* Tokenizes a string based on delimiters (separators) * Tokenizes a string based on delimiters (separators)
* and supporting quoting and ignored character concepts. * and supporting quoting and ignored character concepts.
@ -107,10 +106,8 @@ public class StrTokenizer implements ListIterator, Cloneable {
TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
} }
/** The text to work on */ /** The text to work on. */
private char chars[]; private char chars[];
/** The input text, null if char[] input */
private String text;
/** The parsed tokens */ /** The parsed tokens */
private String tokens[]; private String tokens[];
/** The current iteration position */ /** The current iteration position */
@ -241,8 +238,7 @@ public static StrTokenizer getTSVInstance(char[] input) {
*/ */
public StrTokenizer() { public StrTokenizer() {
super(); super();
this.text = ""; this.chars = null;
this.chars = new char[0];
} }
/** /**
@ -253,7 +249,6 @@ public StrTokenizer() {
*/ */
public StrTokenizer(String input) { public StrTokenizer(String input) {
super(); super();
text = input;
if (input != null) { if (input != null) {
chars = input.toCharArray(); chars = input.toCharArray();
} else { } else {
@ -331,7 +326,6 @@ public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
*/ */
public StrTokenizer(char[] input) { public StrTokenizer(char[] input) {
super(); super();
this.text = null;
this.chars = input; this.chars = input;
} }
@ -417,7 +411,7 @@ public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
* @return the number of matched tokens * @return the number of matched tokens
*/ */
public int size() { public int size() {
tokenize(); checkTokenized();
return tokens.length; return tokens.length;
} }
@ -451,7 +445,7 @@ public String previousToken() {
* @return the tokens as a String array * @return the tokens as a String array
*/ */
public String[] getTokenArray() { public String[] getTokenArray() {
tokenize(); checkTokenized();
return (String[]) tokens.clone(); return (String[]) tokens.clone();
} }
@ -461,7 +455,7 @@ public String[] getTokenArray() {
* @return the tokens as a String array * @return the tokens as a String array
*/ */
public List getTokenList() { public List getTokenList() {
tokenize(); checkTokenized();
List list = new ArrayList(tokens.length); List list = new ArrayList(tokens.length);
for (int i = 0; i < tokens.length; i++) { for (int i = 0; i < tokens.length; i++) {
list.add(tokens[i]); list.add(tokens[i]);
@ -492,11 +486,10 @@ public StrTokenizer reset() {
*/ */
public StrTokenizer reset(String input) { public StrTokenizer reset(String input) {
reset(); reset();
text = input;
if (input != null) { if (input != null) {
chars = input.toCharArray(); this.chars = input.toCharArray();
} else { } else {
chars = null; this.chars = null;
} }
return this; return this;
} }
@ -514,8 +507,7 @@ public StrTokenizer reset(String input) {
*/ */
public StrTokenizer reset(char[] input) { public StrTokenizer reset(char[] input) {
reset(); reset();
text = null; this.chars = input;
chars = input;
return this; return this;
} }
@ -527,7 +519,7 @@ public StrTokenizer reset(char[] input) {
* @return true if there are more tokens * @return true if there are more tokens
*/ */
public boolean hasNext() { public boolean hasNext() {
tokenize(); checkTokenized();
return tokenPos < tokens.length; return tokenPos < tokens.length;
} }
@ -558,7 +550,7 @@ public int nextIndex() {
* @return true if there are previous tokens * @return true if there are previous tokens
*/ */
public boolean hasPrevious() { public boolean hasPrevious() {
tokenize(); checkTokenized();
return tokenPos > 0; return tokenPos > 0;
} }
@ -613,42 +605,60 @@ public void add(Object obj) {
// Implementation // Implementation
//----------------------------------------------------------------------- //-----------------------------------------------------------------------
/** /**
* Performs the tokenization if it hasn't already been done. * Checks if tokenization has been done, and if not then does it.
*/ */
private void tokenize() { private void checkTokenized() {
if (tokens == null) { if (tokens == null) {
tokens = readTokens(); if (chars == null) {
// still call tokenize as subclass may do some work
List split = tokenize(null, 0, 0);
tokens = (String[]) split.toArray(new String[split.size()]);
} else {
List split = tokenize(chars, 0, chars.length);
tokens = (String[]) split.toArray(new String[split.size()]);
}
} }
} }
/** /**
* Read all the tokens. * Internal method to perform the tokenization.
* <p>
* Most users of this class do not need to call this method. This method
* will be called automatically by other (public) methods when required.
* <p>
* This method exists to allow subclasses to add code before or after the
* tokenization. For example, a subclass could alter the character array,
* offset or count to be parsed, or call the tokenizer multiple times on
* multiple strings. It is also possible to filter the results.
* <p>
* <code>StrTokenizer</code> will always pass a zero offset and a count
* equal to the length of the array to this method; however, a subclass
* may pass other values, or even an entirely different array.
* *
* @return array containing the tokens. * @param chars the character array being tokenized, may be null
* @param offset the start position within the character array, must be valid
* @param count the number of characters to tokenize, must be valid
* @return the modifiable list of String tokens, unmodifiable if null array or zero count
*/ */
private String[] readTokens() { protected List tokenize(char[] chars, int offset, int count) {
if (chars == null) { if (chars == null || count == 0) {
return ArrayUtils.EMPTY_STRING_ARRAY; return Collections.EMPTY_LIST;
}
int len = chars.length;
if (len == 0) {
return ArrayUtils.EMPTY_STRING_ARRAY;
} }
StrBuilder buf = new StrBuilder(); StrBuilder buf = new StrBuilder();
List tokens = new ArrayList(); List tokens = new ArrayList();
int start = 0; int pos = offset;
// loop around the entire buffer // loop around the entire buffer
while (start >= 0 && start < len) { while (pos >= 0 && pos < count) {
// find next token // find next token
start = readNextToken(chars, start, len, buf, tokens); pos = readNextToken(chars, pos, count, buf, tokens);
// handle case where end of string is a delimiter // handle case where end of string is a delimiter
if (start >= len) { if (pos >= count) {
addToken(tokens, ""); addToken(tokens, "");
} }
} }
return (String[]) tokens.toArray(new String[tokens.size()]); return tokens;
} }
/** /**
@ -1058,10 +1068,7 @@ public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
* @return the string content being parsed * @return the string content being parsed
*/ */
public String getContent() { public String getContent() {
if (text == null) { return new String(chars);
text = new String(chars);
}
return text;
} }
//----------------------------------------------------------------------- //-----------------------------------------------------------------------

View File

@ -17,6 +17,7 @@
package org.apache.commons.lang.text; package org.apache.commons.lang.text;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
@ -531,7 +532,7 @@ void testEmpty(StrTokenizer tokenizer) {
public void testGetContent() { public void testGetContent() {
String input = "a b c \"d e\" f "; String input = "a b c \"d e\" f ";
StrTokenizer tok = new StrTokenizer(input); StrTokenizer tok = new StrTokenizer(input);
assertSame(input, tok.getContent()); assertEquals(input, tok.getContent());
tok = new StrTokenizer(input.toCharArray()); tok = new StrTokenizer(input.toCharArray());
assertEquals(input, tok.getContent()); assertEquals(input, tok.getContent());
@ -804,4 +805,29 @@ public void testIteration() {
assertEquals(false, tkn.hasNext()); assertEquals(false, tkn.hasNext());
} }
//-----------------------------------------------------------------------
public void testTokenizeSubclassInputChange() {
StrTokenizer tkn = new StrTokenizer("a b c d e") {
protected List tokenize(char[] chars, int offset, int count) {
return super.tokenize("w x y z".toCharArray(), 2, 5);
}
};
assertEquals("x", tkn.next());
assertEquals("y", tkn.next());
}
//-----------------------------------------------------------------------
public void testTokenizeSubclassOutputChange() {
StrTokenizer tkn = new StrTokenizer("a b c") {
protected List tokenize(char[] chars, int offset, int count) {
List list = super.tokenize(chars, offset, count);
Collections.reverse(list);
return list;
}
};
assertEquals("c", tkn.next());
assertEquals("b", tkn.next());
assertEquals("a", tkn.next());
}
} }