diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index b22cfe01..b85b9768 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -25,7 +25,9 @@ import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; -import static org.apache.commons.csv.CSVParser.Token.Type.*; +import org.apache.commons.csv.CSVLexer.Token; + +import static org.apache.commons.csv.CSVLexer.Token.Type.*; /** * Parses CSV files according to the specified configuration. @@ -59,65 +61,16 @@ import static org.apache.commons.csv.CSVParser.Token.Type.*; */ public class CSVParser implements Iterable { - /** length of the initial token (content-)buffer */ - private static final int INITIAL_TOKEN_LENGTH = 50; - /** Immutable empty String array. */ private static final String[] EMPTY_STRING_ARRAY = new String[0]; - /** The input stream */ - private final ExtendedBufferedReader in; - - private final CSVFormat format; - + private CSVLexer lexer; + // the following objects are shared to reduce garbage /** A record buffer for getLine(). Grows as necessary and is reused. */ private final List record = new ArrayList(); private final Token reusableToken = new Token(); - private final CharBuffer wsBuf = new CharBuffer(); - - /** - * Token is an internal token representation. - *

- * It is used as contract between the lexer and the parser. - */ - static class Token { - - enum Type { - /** Token has no valid content, i.e. is in its initialized state. */ - INVALID, - - /** Token with content, at beginning or in the middle of a line. */ - TOKEN, - - /** Token (which can have content) when end of file is reached. */ - EOF, - - /** Token with content when end of a line is reached. */ - EORECORD - } - - /** Token type */ - Type type = INVALID; - - /** The content buffer. */ - CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH); - - /** Token ready flag: indicates a valid token with content (ready for the parser). */ - boolean isReady; - - Token reset() { - content.clear(); - type = INVALID; - isReady = false; - return this; - } - } - - // ====================================================== - // the constructor - // ====================================================== /** * CSV parser using the default {@link CSVFormat}. @@ -139,8 +92,7 @@ public class CSVParser implements Iterable { input = new UnicodeUnescapeReader(input); } - this.in = new ExtendedBufferedReader(input); - this.format = format; + this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input)); } /** @@ -153,9 +105,6 @@ public class CSVParser implements Iterable { this(new StringReader(input), format); } - // ====================================================== - // the parser - // ====================================================== /** * Parses the CSV according to the given format and returns the content @@ -191,7 +140,7 @@ public class CSVParser implements Iterable { record.clear(); while (true) { reusableToken.reset(); - nextToken(reusableToken); + lexer.nextToken(reusableToken); switch (reusableToken.type) { case TOKEN: record.add(reusableToken.content.toString()); @@ -274,12 +223,69 @@ public class CSVParser implements Iterable { * @return current line number */ public int getLineNumber() { - return in.getLineNumber(); + return lexer.getLineNumber(); + 
} +} + + +class CSVLexer { + + /** length of the initial token (content-)buffer */ + private static final int INITIAL_TOKEN_LENGTH = 50; + + private final CharBuffer wsBuf = new CharBuffer(); + + private CSVFormat format; + + /** The input stream */ + private ExtendedBufferedReader in; + + /** + * Token is an internal token representation. + *

+ * It is used as contract between the lexer and the parser. + */ + static class Token { + + enum Type { + /** Token has no valid content, i.e. is in its initialized state. */ + INVALID, + + /** Token with content, at beginning or in the middle of a line. */ + TOKEN, + + /** Token (which can have content) when end of file is reached. */ + EOF, + + /** Token with content when end of a line is reached. */ + EORECORD + } + + /** Token type */ + Type type = INVALID; + + /** The content buffer. */ + CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH); + + /** Token ready flag: indicates a valid token with content (ready for the parser). */ + boolean isReady; + + Token reset() { + content.clear(); + type = INVALID; + isReady = false; + return this; + } } - // ====================================================== - // the lexer(s) - // ====================================================== + CSVLexer(CSVFormat format, ExtendedBufferedReader in) { + this.format = format; + this.in = in; + } + + public int getLineNumber() { + return in.getLineNumber(); + } /** * Returns the next token. @@ -503,19 +509,6 @@ public class CSVParser implements Iterable { } } - /** - * Obtain the specified CSV format. - * - * @return format currently being used - */ - public CSVFormat getFormat() { - return this.format; - } - - // ====================================================== - // Character class checker - // ====================================================== - /** * @return true if the given char is a whitespace character */ diff --git a/src/test/java/org/apache/commons/csv/CSVLexerTest.java b/src/test/java/org/apache/commons/csv/CSVLexerTest.java new file mode 100644 index 00000000..cc9cd2e9 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/CSVLexerTest.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.csv; + +import java.io.IOException; +import java.io.StringReader; + +import junit.framework.TestCase; +import org.apache.commons.csv.CSVLexer.Token; + +import static org.apache.commons.csv.CSVLexer.Token.Type.*; + +public class CSVLexerTest extends TestCase { + + private CSVLexer getLexer(String input, CSVFormat format) { + return new CSVLexer(format, new ExtendedBufferedReader(new StringReader(input))); + } + + private void assertTokenEquals(Token.Type expectedType, String expectedContent, Token token) { + assertEquals("Token type", expectedType, token.type); + assertEquals("Token content", expectedContent, token.content.toString()); + } + + // Single line (without comment) + public void testNextToken1() throws IOException { + String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,"; + CSVLexer parser = getLexer(code, CSVFormat.DEFAULT); + assertTokenEquals(TOKEN, "abc", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "def", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "hijk", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "lmnop", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "qrst", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "uv", parser.nextToken(new Token())); + 
assertTokenEquals(TOKEN, "wxy", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "z", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "", parser.nextToken(new Token())); + assertTokenEquals(EOF, "", parser.nextToken(new Token())); + } + + // multiline including comments (and empty lines) + public void testNextToken2() throws IOException { + /* file: 1,2,3, + * a,b x,c + * + * # this is a comment + * d,e, + * + */ + String code = "1,2,3,\na,b x,c\n#foo\n\nd,e,\n\n"; + CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#'); + + CSVLexer parser = getLexer(code, format); + + + assertTokenEquals(TOKEN, "1", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "2", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "3", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "b x", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "c", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "d", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "e", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "", parser.nextToken(new Token())); + assertTokenEquals(EOF, "", parser.nextToken(new Token())); + assertTokenEquals(EOF, "", parser.nextToken(new Token())); + + } + + // simple token with escaping + public void testNextToken3() throws IOException { + /* file: a,\,,b + * \,, + */ + String code = "a,\\,,b\n\\,,"; + CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#'); + CSVLexer parser = getLexer(code, format); + + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + // an unquoted single backslash is not an escape char + assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "b", 
parser.nextToken(new Token())); + // an unquoted single backslash is not an escape char + assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "", parser.nextToken(new Token())); + assertTokenEquals(EOF, "", parser.nextToken(new Token())); + } + + // encapsulator tokenizer (single line) + public void testNextToken4() throws IOException { + /* file: a,"foo",b + * a, " foo",b + * a,"foo " ,b // whitespace after closing encapsulator + * a, " foo " ,b + */ + String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b"; + CSVLexer parser = getLexer(code, CSVFormat.DEFAULT); + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "foo", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "b", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, " foo", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "b", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "foo ", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "b", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, " foo ", parser.nextToken(new Token())); +// assertTokenEquals(EORECORD, "b", parser.nextToken(new Token())); + assertTokenEquals(EOF, "b", parser.nextToken(new Token())); + } + + // encapsulator tokenizer (multi line, delimiter in string) + public void testNextToken5() throws IOException { + String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\""; + CSVLexer parser = getLexer(code, CSVFormat.DEFAULT); + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "foo\n", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "b", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "foo\n baar ,,,", parser.nextToken(new
Token())); + assertTokenEquals(EOF, "\n\t \n", parser.nextToken(new Token())); + + } + + // change delimiters, comment, encapsulator + public void testNextToken6() throws IOException { + /* file: a;'b and '' more + * ' + * !comment;;;; + * ;; + */ + String code = "a;'b and '' more\n'\n!comment;;;;\n;;"; + CSVFormat format = new CSVFormat(';', '\'', '!'); + CSVLexer parser = getLexer(code, format); + assertTokenEquals(TOKEN, "a", parser.nextToken(new Token())); + assertTokenEquals(EORECORD, "b and ' more\n", parser.nextToken(new Token())); + } + + // From SANDBOX-153 + public void testDelimiterIsWhitespace() throws IOException { + String code = "one\ttwo\t\tfour \t five\t six"; + CSVLexer parser = getLexer(code, CSVFormat.TDF); + assertTokenEquals(TOKEN, "one", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "two", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "four", parser.nextToken(new Token())); + assertTokenEquals(TOKEN, "five", parser.nextToken(new Token())); + assertTokenEquals(EOF, "six", parser.nextToken(new Token())); + } +} diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 34db58f6..01b50652 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -28,8 +28,6 @@ import java.util.NoSuchElementException; import junit.framework.TestCase; -import static org.apache.commons.csv.CSVParser.Token.Type.*; - /** * CSVParserTest * @@ -41,168 +39,7 @@ import static org.apache.commons.csv.CSVParser.Token.Type.*; */ public class CSVParserTest extends TestCase { - /** - * TestCSVParser. - */ - class TestCSVParser extends CSVParser { - /** - * Test parser to investigate the type of the internal Token.
- * - * @param in a Reader - */ - TestCSVParser(Reader in) { - super(in); - } - - TestCSVParser(Reader in, CSVFormat format) { - super(in, format); - } - - /** - * Calls super.nextToken() and prints out a String representation of token - * type and content. - * - * @return String representation of token type and content - * @throws IOException like {@link CSVParser#nextToken(Token)} - */ - public String testNextToken() throws IOException { - Token t = super.nextToken(new Token()); - return t.type.name() + ";" + t.content + ";"; - } - } - - // ====================================================== - // lexer tests - // ====================================================== - - // Single line (without comment) - public void testNextToken1() throws IOException { - String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,"; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); - assertEquals(TOKEN + ";abc;", parser.testNextToken()); - assertEquals(TOKEN + ";def;", parser.testNextToken()); - assertEquals(TOKEN + ";hijk;", parser.testNextToken()); - assertEquals(TOKEN + ";lmnop;", parser.testNextToken()); - assertEquals(TOKEN + ";qrst;", parser.testNextToken()); - assertEquals(TOKEN + ";uv;", parser.testNextToken()); - assertEquals(TOKEN + ";wxy;", parser.testNextToken()); - assertEquals(TOKEN + ";z;", parser.testNextToken()); - assertEquals(TOKEN + ";;", parser.testNextToken()); - assertEquals(EOF + ";;", parser.testNextToken()); - } - - // multiline including comments (and empty lines) - public void testNextToken2() throws IOException { - /* file: 1,2,3, - * a,b x,c - * - * # this is a comment - * d,e, - * - */ - String code = "1,2,3,\na,b x,c\n#foo\n\nd,e,\n\n"; - CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#'); - - TestCSVParser parser = new TestCSVParser(new StringReader(code), format); - - - assertEquals(TOKEN + ";1;", parser.testNextToken()); - assertEquals(TOKEN + ";2;", parser.testNextToken()); - assertEquals(TOKEN + ";3;", 
parser.testNextToken()); - assertEquals(EORECORD + ";;", parser.testNextToken()); - assertEquals(TOKEN + ";a;", parser.testNextToken()); - assertEquals(TOKEN + ";b x;", parser.testNextToken()); - assertEquals(EORECORD + ";c;", parser.testNextToken()); - assertEquals(EORECORD + ";;", parser.testNextToken()); - assertEquals(TOKEN + ";d;", parser.testNextToken()); - assertEquals(TOKEN + ";e;", parser.testNextToken()); - assertEquals(EORECORD + ";;", parser.testNextToken()); - assertEquals(EOF + ";;", parser.testNextToken()); - assertEquals(EOF + ";;", parser.testNextToken()); - - } - - // simple token with escaping - public void testNextToken3() throws IOException { - /* file: a,\,,b - * \,, - */ - String code = "a,\\,,b\n\\,,"; - CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#'); - TestCSVParser parser = new TestCSVParser(new StringReader(code), format); - - assertEquals(TOKEN + ";a;", parser.testNextToken()); - // an unquoted single backslash is not an escape char - assertEquals(TOKEN + ";\\;", parser.testNextToken()); - assertEquals(TOKEN + ";;", parser.testNextToken()); - assertEquals(EORECORD + ";b;", parser.testNextToken()); - // an unquoted single backslash is not an escape char - assertEquals(TOKEN + ";\\;", parser.testNextToken()); - assertEquals(TOKEN + ";;", parser.testNextToken()); - assertEquals(EOF + ";;", parser.testNextToken()); - } - - // encapsulator tokenizer (sinle line) - public void testNextToken4() throws IOException { - /* file: a,"foo",b - * a, " foo",b - * a,"foo " ,b // whitespace after closing encapsulator - * a, " foo " ,b - */ - String code = - "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b"; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); - assertEquals(TOKEN + ";a;", parser.testNextToken()); - assertEquals(TOKEN + ";foo;", parser.testNextToken()); - assertEquals(EORECORD + ";b;", parser.testNextToken()); - assertEquals(TOKEN + ";a;", parser.testNextToken()); - assertEquals(TOKEN + "; foo;", 
parser.testNextToken()); - assertEquals(EORECORD + ";b;", parser.testNextToken()); - assertEquals(TOKEN + ";a;", parser.testNextToken()); - assertEquals(TOKEN + ";foo ;", parser.testNextToken()); - assertEquals(EORECORD + ";b;", parser.testNextToken()); - assertEquals(TOKEN + ";a;", parser.testNextToken()); - assertEquals(TOKEN + "; foo ;", parser.testNextToken()); -// assertEquals(EORECORD + ";b;", parser.testNextToken()); - assertEquals(EOF + ";b;", parser.testNextToken()); - } - - // encapsulator tokenizer (multi line, delimiter in string) - public void testNextToken5() throws IOException { - String code = - "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\""; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); - assertEquals(TOKEN + ";a;", parser.testNextToken()); - assertEquals(TOKEN + ";foo\n;", parser.testNextToken()); - assertEquals(EORECORD + ";b;", parser.testNextToken()); - assertEquals(EORECORD + ";foo\n baar ,,,;", - parser.testNextToken()); - assertEquals(EOF + ";\n\t \n;", parser.testNextToken()); - - } - - // change delimiters, comment, encapsulater - public void testNextToken6() throws IOException { - /* file: a;'b and \' more - * ' - * !comment;;;; - * ;; - */ - String code = "a;'b and '' more\n'\n!comment;;;;\n;;"; - TestCSVParser parser = new TestCSVParser(new StringReader(code), new CSVFormat(';', '\'', '!')); - assertEquals(TOKEN + ";a;", parser.testNextToken()); - assertEquals( - EORECORD + ";b and ' more\n;", - parser.testNextToken()); - } - - - // ====================================================== - // parser tests - // ====================================================== - - String code = - "a,b,c,d\n" + String code = "a,b,c,d\n" + " a , b , 1 2 \n" + "\"foo baar\", b,\n" // + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n"; @@ -455,8 +292,8 @@ public class CSVParserTest extends TestCase { + ""; String[][] res = { {" ", " ", " "}, // 1 - {" \t ", " ", " "}, // 2 - {" / ", " , ", " ,"}, //3 + {" \t ", " ", " "}, // 2 + {" / ", " , ", 
" ,"}, // 3 }; @@ -552,18 +389,6 @@ public class CSVParserTest extends TestCase { assertEquals(3, data.length); } - // From SANDBOX-153 - public void testDelimiterIsWhitespace() throws IOException { - String code = "one\ttwo\t\tfour \t five\t six"; - TestCSVParser parser = new TestCSVParser(new StringReader(code), CSVFormat.TDF); - assertEquals(TOKEN + ";one;", parser.testNextToken()); - assertEquals(TOKEN + ";two;", parser.testNextToken()); - assertEquals(TOKEN + ";;", parser.testNextToken()); - assertEquals(TOKEN + ";four;", parser.testNextToken()); - assertEquals(TOKEN + ";five;", parser.testNextToken()); - assertEquals(EOF + ";six;", parser.testNextToken()); - } - public void testForEach() { List records = new ArrayList();