Allow tokenizer state to be adjusted before and after tokenizing
git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@424608 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b7b7d7c935
commit
7917cc095b
|
@ -16,12 +16,11 @@
|
||||||
package org.apache.commons.lang.text;
|
package org.apache.commons.lang.text;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ListIterator;
|
import java.util.ListIterator;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
import org.apache.commons.lang.ArrayUtils;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenizes a string based on delimiters (separators)
|
* Tokenizes a string based on delimiters (separators)
|
||||||
* and supporting quoting and ignored character concepts.
|
* and supporting quoting and ignored character concepts.
|
||||||
|
@ -107,10 +106,8 @@ public class StrTokenizer implements ListIterator, Cloneable {
|
||||||
TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
|
TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** The text to work on */
|
/** The text to work on. */
|
||||||
private char chars[];
|
private char chars[];
|
||||||
/** The input text, null if char[] input */
|
|
||||||
private String text;
|
|
||||||
/** The parsed tokens */
|
/** The parsed tokens */
|
||||||
private String tokens[];
|
private String tokens[];
|
||||||
/** The current iteration position */
|
/** The current iteration position */
|
||||||
|
@ -241,8 +238,7 @@ public static StrTokenizer getTSVInstance(char[] input) {
|
||||||
*/
|
*/
|
||||||
public StrTokenizer() {
|
public StrTokenizer() {
|
||||||
super();
|
super();
|
||||||
this.text = "";
|
this.chars = null;
|
||||||
this.chars = new char[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -253,7 +249,6 @@ public StrTokenizer() {
|
||||||
*/
|
*/
|
||||||
public StrTokenizer(String input) {
|
public StrTokenizer(String input) {
|
||||||
super();
|
super();
|
||||||
text = input;
|
|
||||||
if (input != null) {
|
if (input != null) {
|
||||||
chars = input.toCharArray();
|
chars = input.toCharArray();
|
||||||
} else {
|
} else {
|
||||||
|
@ -331,7 +326,6 @@ public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
|
||||||
*/
|
*/
|
||||||
public StrTokenizer(char[] input) {
|
public StrTokenizer(char[] input) {
|
||||||
super();
|
super();
|
||||||
this.text = null;
|
|
||||||
this.chars = input;
|
this.chars = input;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -417,7 +411,7 @@ public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
|
||||||
* @return the number of matched tokens
|
* @return the number of matched tokens
|
||||||
*/
|
*/
|
||||||
public int size() {
|
public int size() {
|
||||||
tokenize();
|
checkTokenized();
|
||||||
return tokens.length;
|
return tokens.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -451,7 +445,7 @@ public String previousToken() {
|
||||||
* @return the tokens as a String array
|
* @return the tokens as a String array
|
||||||
*/
|
*/
|
||||||
public String[] getTokenArray() {
|
public String[] getTokenArray() {
|
||||||
tokenize();
|
checkTokenized();
|
||||||
return (String[]) tokens.clone();
|
return (String[]) tokens.clone();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -461,7 +455,7 @@ public String[] getTokenArray() {
|
||||||
* @return the tokens as a List
|
* @return the tokens as a List
|
||||||
*/
|
*/
|
||||||
public List getTokenList() {
|
public List getTokenList() {
|
||||||
tokenize();
|
checkTokenized();
|
||||||
List list = new ArrayList(tokens.length);
|
List list = new ArrayList(tokens.length);
|
||||||
for (int i = 0; i < tokens.length; i++) {
|
for (int i = 0; i < tokens.length; i++) {
|
||||||
list.add(tokens[i]);
|
list.add(tokens[i]);
|
||||||
|
@ -492,11 +486,10 @@ public StrTokenizer reset() {
|
||||||
*/
|
*/
|
||||||
public StrTokenizer reset(String input) {
|
public StrTokenizer reset(String input) {
|
||||||
reset();
|
reset();
|
||||||
text = input;
|
|
||||||
if (input != null) {
|
if (input != null) {
|
||||||
chars = input.toCharArray();
|
this.chars = input.toCharArray();
|
||||||
} else {
|
} else {
|
||||||
chars = null;
|
this.chars = null;
|
||||||
}
|
}
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
@ -514,8 +507,7 @@ public StrTokenizer reset(String input) {
|
||||||
*/
|
*/
|
||||||
public StrTokenizer reset(char[] input) {
|
public StrTokenizer reset(char[] input) {
|
||||||
reset();
|
reset();
|
||||||
text = null;
|
this.chars = input;
|
||||||
chars = input;
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -527,7 +519,7 @@ public StrTokenizer reset(char[] input) {
|
||||||
* @return true if there are more tokens
|
* @return true if there are more tokens
|
||||||
*/
|
*/
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
tokenize();
|
checkTokenized();
|
||||||
return tokenPos < tokens.length;
|
return tokenPos < tokens.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -558,7 +550,7 @@ public int nextIndex() {
|
||||||
* @return true if there are previous tokens
|
* @return true if there are previous tokens
|
||||||
*/
|
*/
|
||||||
public boolean hasPrevious() {
|
public boolean hasPrevious() {
|
||||||
tokenize();
|
checkTokenized();
|
||||||
return tokenPos > 0;
|
return tokenPos > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -613,42 +605,60 @@ public void add(Object obj) {
|
||||||
// Implementation
|
// Implementation
|
||||||
//-----------------------------------------------------------------------
|
//-----------------------------------------------------------------------
|
||||||
/**
|
/**
|
||||||
* Performs the tokenization if it hasn't already been done.
|
* Checks if tokenization has been done, and if not, performs it.
|
||||||
*/
|
*/
|
||||||
private void tokenize() {
|
private void checkTokenized() {
|
||||||
if (tokens == null) {
|
if (tokens == null) {
|
||||||
tokens = readTokens();
|
if (chars == null) {
|
||||||
|
// still call tokenize as subclass may do some work
|
||||||
|
List split = tokenize(null, 0, 0);
|
||||||
|
tokens = (String[]) split.toArray(new String[split.size()]);
|
||||||
|
} else {
|
||||||
|
List split = tokenize(chars, 0, chars.length);
|
||||||
|
tokens = (String[]) split.toArray(new String[split.size()]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read all the tokens.
|
* Internal method that performs the tokenization.
|
||||||
|
* <p>
|
||||||
|
* Most users of this class do not need to call this method. This method
|
||||||
|
* will be called automatically by other (public) methods when required.
|
||||||
|
* <p>
|
||||||
|
* This method exists to allow subclasses to add code before or after the
|
||||||
|
* tokenization. For example, a subclass could alter the character array,
|
||||||
|
* offset or count to be parsed, or call the tokenizer multiple times on
|
||||||
|
* multiple strings. It is also possible to filter the results.
|
||||||
|
* <p>
|
||||||
|
* <code>StrTokenizer</code> will always pass a zero offset and a count
|
||||||
|
* equal to the length of the array to this method, however a subclass
|
||||||
|
* may pass other values, or even an entirely different array.
|
||||||
*
|
*
|
||||||
* @return array containing the tokens.
|
* @param chars the character array being tokenized, may be null
|
||||||
|
* @param offset the start position within the character array, must be valid
|
||||||
|
* @param count the number of characters to tokenize, must be valid
|
||||||
|
* @return the modifiable list of String tokens, unmodifiable if null array or zero count
|
||||||
*/
|
*/
|
||||||
private String[] readTokens() {
|
protected List tokenize(char[] chars, int offset, int count) {
|
||||||
if (chars == null) {
|
if (chars == null || count == 0) {
|
||||||
return ArrayUtils.EMPTY_STRING_ARRAY;
|
return Collections.EMPTY_LIST;
|
||||||
}
|
|
||||||
int len = chars.length;
|
|
||||||
if (len == 0) {
|
|
||||||
return ArrayUtils.EMPTY_STRING_ARRAY;
|
|
||||||
}
|
}
|
||||||
StrBuilder buf = new StrBuilder();
|
StrBuilder buf = new StrBuilder();
|
||||||
List tokens = new ArrayList();
|
List tokens = new ArrayList();
|
||||||
int start = 0;
|
int pos = offset;
|
||||||
|
|
||||||
// loop around the entire buffer
|
// loop around the entire buffer
|
||||||
while (start >= 0 && start < len) {
|
while (pos >= 0 && pos < count) {
|
||||||
// find next token
|
// find next token
|
||||||
start = readNextToken(chars, start, len, buf, tokens);
|
pos = readNextToken(chars, pos, count, buf, tokens);
|
||||||
|
|
||||||
// handle case where end of string is a delimiter
|
// handle case where end of string is a delimiter
|
||||||
if (start >= len) {
|
if (pos >= count) {
|
||||||
addToken(tokens, "");
|
addToken(tokens, "");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return (String[]) tokens.toArray(new String[tokens.size()]);
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1058,10 +1068,7 @@ public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
|
||||||
* @return the string content being parsed
|
* @return the string content being parsed
|
||||||
*/
|
*/
|
||||||
public String getContent() {
|
public String getContent() {
|
||||||
if (text == null) {
|
return new String(chars);
|
||||||
text = new String(chars);
|
|
||||||
}
|
|
||||||
return text;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//-----------------------------------------------------------------------
|
//-----------------------------------------------------------------------
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.commons.lang.text;
|
package org.apache.commons.lang.text;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
|
@ -531,7 +532,7 @@ void testEmpty(StrTokenizer tokenizer) {
|
||||||
public void testGetContent() {
|
public void testGetContent() {
|
||||||
String input = "a b c \"d e\" f ";
|
String input = "a b c \"d e\" f ";
|
||||||
StrTokenizer tok = new StrTokenizer(input);
|
StrTokenizer tok = new StrTokenizer(input);
|
||||||
assertSame(input, tok.getContent());
|
assertEquals(input, tok.getContent());
|
||||||
|
|
||||||
tok = new StrTokenizer(input.toCharArray());
|
tok = new StrTokenizer(input.toCharArray());
|
||||||
assertEquals(input, tok.getContent());
|
assertEquals(input, tok.getContent());
|
||||||
|
@ -804,4 +805,29 @@ public void testIteration() {
|
||||||
assertEquals(false, tkn.hasNext());
|
assertEquals(false, tkn.hasNext());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------
|
||||||
|
public void testTokenizeSubclassInputChange() {
|
||||||
|
StrTokenizer tkn = new StrTokenizer("a b c d e") {
|
||||||
|
protected List tokenize(char[] chars, int offset, int count) {
|
||||||
|
return super.tokenize("w x y z".toCharArray(), 2, 5);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assertEquals("x", tkn.next());
|
||||||
|
assertEquals("y", tkn.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
//-----------------------------------------------------------------------
|
||||||
|
public void testTokenizeSubclassOutputChange() {
|
||||||
|
StrTokenizer tkn = new StrTokenizer("a b c") {
|
||||||
|
protected List tokenize(char[] chars, int offset, int count) {
|
||||||
|
List list = super.tokenize(chars, offset, count);
|
||||||
|
Collections.reverse(list);
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assertEquals("c", tkn.next());
|
||||||
|
assertEquals("b", tkn.next());
|
||||||
|
assertEquals("a", tkn.next());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue