diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index da411459..a3a59efc 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -22,6 +22,7 @@ import java.io.Reader; import java.util.ArrayList; import java.util.List; +import static org.apache.commons.csv.CSVParser.Token.Type.*; /** * Parses CSV files according to the specified configuration. @@ -54,19 +55,6 @@ public class CSVParser { /** length of the initial token (content-)buffer */ private static final int INITIAL_TOKEN_LENGTH = 50; - // the token types - /** Token has no valid content, i.e. is in its initialized state. */ - static final int TT_INVALID = -1; - - /** Token with content, at beginning or in the middle of a line. */ - static final int TT_TOKEN = 0; - - /** Token (which can have content) when end of file is reached. */ - static final int TT_EOF = 1; - - /** Token with content when end of a line is reached. */ - static final int TT_EORECORD = 2; - /** Immutable empty String array. */ private static final String[] EMPTY_STRING_ARRAY = new String[0]; @@ -91,22 +79,33 @@ public class CSVParser { * It is used as contract between the lexer and the parser. */ static class Token { - /** - * Token type, see TT_xxx constants. - */ - int type = TT_INVALID; - /** - * The content buffer. - */ + + enum Type { + /** Token has no valid content, i.e. is in its initialized state. */ + INVALID, + + /** Token with content, at beginning or in the middle of a line. */ + TOKEN, + + /** Token (which can have content) when end of file is reached. */ + EOF, + + /** Token with content when end of a line is reached. */ + EORECORD + } + + /** Token type */ + Type type = INVALID; + + /** The content buffer. */ CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH); - /** - * Token ready flag: indicates a valid token with content (ready for the parser). - */ + + /** Token ready flag: indicates a valid token with content (ready for the parser). */ boolean isReady; Token reset() { content.clear(); - type = TT_INVALID; + type = INVALID; isReady = false; return this; } @@ -180,26 +179,26 @@ public class CSVParser { reusableToken.reset(); nextToken(reusableToken); switch (reusableToken.type) { - case TT_TOKEN: + case TOKEN: record.add(reusableToken.content.toString()); break; - case TT_EORECORD: + case EORECORD: record.add(reusableToken.content.toString()); break; - case TT_EOF: + case EOF: if (reusableToken.isReady) { record.add(reusableToken.content.toString()); } else { ret = null; } break; - case TT_INVALID: + case INVALID: default: // error: throw IOException throw new IOException("(line " + getLineNumber() + ") invalid parse sequence"); // unreachable: break; } - if (reusableToken.type != TT_TOKEN) { + if (reusableToken.type != TOKEN) { break; } } @@ -272,19 +271,19 @@ public class CSVParser { c = in.readAgain(); // reached end of file without any content (empty line at the end) if (isEndOfFile(c)) { - tkn.type = TT_EOF; + tkn.type = EOF; return tkn; } } - // did we reach eof during the last iteration already ? TT_EOF + // did we reach eof during the last iteration already ? EOF if (isEndOfFile(lastChar) || (lastChar != format.getDelimiter() && isEndOfFile(c))) { - tkn.type = TT_EOF; + tkn.type = EOF; return tkn; } // important: make sure a new char gets consumed in each iteration - while (!tkn.isReady && tkn.type != TT_EOF) { + while (!tkn.isReady && tkn.type != EOF) { // ignore whitespaces at beginning of a token while (format.isLeadingSpacesIgnored() && isWhitespace(c) && !eol) { wsBuf.append((char) c); @@ -297,21 +296,21 @@ public class CSVParser { in.readLine(); tkn = nextToken(tkn.reset()); } else if (c == format.getDelimiter()) { - // empty token return TT_TOKEN("") - tkn.type = TT_TOKEN; + // empty token return TOKEN("") + tkn.type = TOKEN; tkn.isReady = true; } else if (eol) { - // empty token return TT_EORECORD("") + // empty token return EORECORD("") //noop: tkn.content.append(""); - tkn.type = TT_EORECORD; + tkn.type = EORECORD; tkn.isReady = true; } else if (c == format.getEncapsulator()) { // consume encapsulated token encapsulatedTokenLexer(tkn, c); } else if (isEndOfFile(c)) { - // end of file return TT_EOF() + // end of file return EOF() //noop: tkn.content.append(""); - tkn.type = TT_EOF; + tkn.type = EOF; tkn.isReady = true; } else { // next token must be a simple token @@ -332,9 +331,9 @@ public class CSVParser { * A simple token might contain escaped delimiters (as \, or \;). The * token is finished when one of the following conditions become true: * * * @param tkn the current token @@ -346,17 +345,17 @@ public class CSVParser { for (; ;) { if (isEndOfLine(c)) { // end of record - tkn.type = TT_EORECORD; + tkn.type = EORECORD; tkn.isReady = true; break; } else if (isEndOfFile(c)) { // end of file - tkn.type = TT_EOF; + tkn.type = EOF; tkn.isReady = true; break; } else if (c == format.getDelimiter()) { // end of token - tkn.type = TT_TOKEN; + tkn.type = TOKEN; tkn.isReady = true; break; } else if (c == '\\' && format.isUnicodeEscapesInterpreted() && in.lookAhead() == 'u') { @@ -414,16 +413,16 @@ public class CSVParser { for (; ;) { c = in.read(); if (c == format.getDelimiter()) { - tkn.type = TT_TOKEN; + tkn.type = TOKEN; tkn.isReady = true; return tkn; } else if (isEndOfFile(c)) { - tkn.type = TT_EOF; + tkn.type = EOF; tkn.isReady = true; return tkn; } else if (isEndOfLine(c)) { // ok eo token reached - tkn.type = TT_EORECORD; + tkn.type = EORECORD; tkn.isReady = true; return tkn; } else if (!isWhitespace(c)) { diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index d7c85c06..ff608dee 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.commons.csv; import java.io.IOException; @@ -23,6 +24,8 @@ import java.util.Arrays; import junit.framework.TestCase; +import static org.apache.commons.csv.CSVParser.Token.Type.*; + /** * CSVParserTest * @@ -60,7 +63,7 @@ public class CSVParserTest extends TestCase { */ public String testNextToken() throws IOException { Token t = super.nextToken(); - return Integer.toString(t.type) + ";" + t.content + ";"; + return t.type.name() + ";" + t.content + ";"; } } @@ -72,16 +75,16 @@ public class CSVParserTest extends TestCase { public void testNextToken1() throws IOException { String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,"; TestCSVParser parser = new TestCSVParser(new StringReader(code)); - assertEquals(CSVParser.TT_TOKEN + ";abc;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";def;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";hijk;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";lmnop;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";qrst;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";uv;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";wxy;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";z;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); + assertEquals(TOKEN + ";abc;", parser.testNextToken()); + assertEquals(TOKEN + ";def;", parser.testNextToken()); + assertEquals(TOKEN + ";hijk;", parser.testNextToken()); + assertEquals(TOKEN + ";lmnop;", parser.testNextToken()); + assertEquals(TOKEN + ";qrst;", parser.testNextToken()); + assertEquals(TOKEN + ";uv;", parser.testNextToken()); + assertEquals(TOKEN + ";wxy;", parser.testNextToken()); + assertEquals(TOKEN + ";z;", parser.testNextToken()); + assertEquals(TOKEN + ";;", parser.testNextToken()); + assertEquals(EOF + ";;", parser.testNextToken()); } // multiline including comments (and empty lines) @@ -99,19 +102,19 @@ public class CSVParserTest extends TestCase { TestCSVParser parser = new TestCSVParser(new StringReader(code), format); - assertEquals(CSVParser.TT_TOKEN + ";1;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";2;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";3;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";b x;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";c;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";d;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";e;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); + assertEquals(TOKEN + ";1;", parser.testNextToken()); + assertEquals(TOKEN + ";2;", parser.testNextToken()); + assertEquals(TOKEN + ";3;", parser.testNextToken()); + assertEquals(EORECORD + ";;", parser.testNextToken()); + assertEquals(TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + ";b x;", parser.testNextToken()); + assertEquals(EORECORD + ";c;", parser.testNextToken()); + assertEquals(EORECORD + ";;", parser.testNextToken()); + assertEquals(TOKEN + ";d;", parser.testNextToken()); + assertEquals(TOKEN + ";e;", parser.testNextToken()); + assertEquals(EORECORD + ";;", parser.testNextToken()); + assertEquals(EOF + ";;", parser.testNextToken()); + assertEquals(EOF + ";;", parser.testNextToken()); } @@ -124,15 +127,15 @@ public class CSVParserTest extends TestCase { CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#'); TestCSVParser parser = new TestCSVParser(new StringReader(code), format); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + ";a;", parser.testNextToken()); // an unquoted single backslash is not an escape char - assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); + assertEquals(TOKEN + ";\\;", parser.testNextToken()); + assertEquals(TOKEN + ";;", parser.testNextToken()); + assertEquals(EORECORD + ";b;", parser.testNextToken()); // an unquoted single backslash is not an escape char - assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); + assertEquals(TOKEN + ";\\;", parser.testNextToken()); + assertEquals(TOKEN + ";;", parser.testNextToken()); + assertEquals(EOF + ";;", parser.testNextToken()); } // encapsulator tokenizer (sinle line) @@ -145,19 +148,19 @@ public class CSVParserTest extends TestCase { String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b"; TestCSVParser parser = new TestCSVParser(new StringReader(code)); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";foo;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + "; foo;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";foo ;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + "; foo ;", parser.testNextToken()); -// assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";b;", parser.testNextToken()); + assertEquals(TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + ";foo;", parser.testNextToken()); + assertEquals(EORECORD + ";b;", parser.testNextToken()); + assertEquals(TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + "; foo;", parser.testNextToken()); + assertEquals(EORECORD + ";b;", parser.testNextToken()); + assertEquals(TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + ";foo ;", parser.testNextToken()); + assertEquals(EORECORD + ";b;", parser.testNextToken()); + assertEquals(TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + "; foo ;", parser.testNextToken()); +// assertEquals(EORECORD + ";b;", parser.testNextToken()); + assertEquals(EOF + ";b;", parser.testNextToken()); } // encapsulator tokenizer (multi line, delimiter in string) @@ -165,12 +168,12 @@ public class CSVParserTest extends TestCase { String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\""; TestCSVParser parser = new TestCSVParser(new StringReader(code)); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";foo\n;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;", + assertEquals(TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + ";foo\n;", parser.testNextToken()); + assertEquals(EORECORD + ";b;", parser.testNextToken()); + assertEquals(EORECORD + ";foo\n baar ,,,;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken()); + assertEquals(EOF + ";\n\t \n;", parser.testNextToken()); } @@ -183,9 +186,9 @@ public class CSVParserTest extends TestCase { */ String code = "a;'b and '' more\n'\n!comment;;;;\n;;"; TestCSVParser parser = new TestCSVParser(new StringReader(code), new CSVFormat(';', '\'', '!')); - assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); + assertEquals(TOKEN + ";a;", parser.testNextToken()); assertEquals( - CSVParser.TT_EORECORD + ";b and ' more\n;", + EORECORD + ";b and ' more\n;", parser.testNextToken()); } @@ -209,13 +212,11 @@ public class CSVParserTest extends TestCase { public void testGetLine() throws IOException { CSVParser parser = new CSVParser(new StringReader(code)); - String[] tmp = null; - for (int i = 0; i < res.length; i++) { - tmp = parser.getLine(); - assertTrue(Arrays.equals(res[i], tmp)); + for (String[] re : res) { + assertTrue(Arrays.equals(re, parser.getLine())); } - tmp = parser.getLine(); - assertTrue(tmp == null); + + assertTrue(parser.getLine() == null); } public void testGetAllValues() throws IOException { @@ -282,9 +283,8 @@ public class CSVParserTest extends TestCase { {""}, // Excel format does not ignore empty lines {"world", ""} }; - String code; - for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) { - code = codes[codeIndex]; + + for (String code : codes) { CSVParser parser = new CSVParser(new StringReader(code), CSVFormat.EXCEL); String[][] tmp = parser.getAllValues(); assertEquals(res.length, tmp.length); @@ -558,11 +558,11 @@ public class CSVParserTest extends TestCase { public void testDelimiterIsWhitespace() throws IOException { String code = "one\ttwo\t\tfour \t five\t six"; TestCSVParser parser = new TestCSVParser(new StringReader(code), CSVFormat.TDF); - assertEquals(CSVParser.TT_TOKEN + ";one;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";two;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";four;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";five;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";six;", parser.testNextToken()); + assertEquals(TOKEN + ";one;", parser.testNextToken()); + assertEquals(TOKEN + ";two;", parser.testNextToken()); + assertEquals(TOKEN + ";;", parser.testNextToken()); + assertEquals(TOKEN + ";four;", parser.testNextToken()); + assertEquals(TOKEN + ";five;", parser.testNextToken()); + assertEquals(EOF + ";six;", parser.testNextToken()); } }