From f047581f9526aad1c9c9e624710a4e860f88ecaa Mon Sep 17 00:00:00 2001 From: Henri Yandell Date: Mon, 6 Mar 2006 05:11:21 +0000 Subject: [PATCH] Javadoc improvements, more unit tests, change of API to a chain style, some bugfixes git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/sandbox/csv/trunk@383468 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/commons/csv/CSVParser.java | 198 +++++++----- .../org/apache/commons/csv/CSVParserTest.java | 288 +++++++++++++++--- 2 files changed, 365 insertions(+), 121 deletions(-) diff --git a/src/java/org/apache/commons/csv/CSVParser.java b/src/java/org/apache/commons/csv/CSVParser.java index 44b5d447..0b487588 100644 --- a/src/java/org/apache/commons/csv/CSVParser.java +++ b/src/java/org/apache/commons/csv/CSVParser.java @@ -34,7 +34,13 @@ import java.util.Vector; *

Parsing of a csv-string having ';' as separator:

*
  *  String[][] data = 
- *         (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues();
+ *      (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues();
+ * 
+ * + *

The API allows chained method calls, if you like this coding style:

+ *
+ *  String[][] data = (new CSVParser(new StringReader("a;b\nc;d"),';'))
+ *      .setExcelStrategy().setIgnoreEmptyLines(true).getAllValues();
  * 
* *

@@ -45,14 +51,18 @@ import java.util.Vector; * for more details

*/ public class CSVParser { - + /** length of the initial token (content-)buffer */ private static final int INITIAL_TOKEN_LENGTH = 50; // the token types + /** Token has no valid content, i.e. is in its initialized state. */ protected static final int TT_INVALID = -1; + /** Token with content, at beginning or in the middle of a line. */ protected static final int TT_TOKEN = 0; + /** Token (which can have content) when end of file is reached. */ protected static final int TT_EOF = 1; + /** Token with content when end of a line is reached. */ protected static final int TT_EORECORD = 2; // the csv definition @@ -72,12 +82,13 @@ public class CSVParser { * It is used as contract between the lexer and the parser. */ class Token { - // token type see TT_xxx constants + /** Token type, see TT_xxx constants. */ int type; - // the content buffer + /** The content buffer. */ StringBuffer content; - // token ready flag: indicates a valid token (ready for the parser) + /** Token ready flag: indicates a valid token with content (ready for the parser). */ boolean isReady; + /** Initializes an empty token. */ Token() { content = new StringBuffer(INITIAL_TOKEN_LENGTH); type = TT_INVALID; @@ -92,6 +103,7 @@ public class CSVParser { /** * Parses the given String according to the default CSV strategy. * + * @param s CSV String to be parsed. * @return parsed String matrix (which is never null) * @throws IOException in case of error * @see #setCSVStrategy() @@ -100,7 +112,13 @@ public class CSVParser { if (s == null) { throw new IllegalArgumentException("Null argument not allowed."); } - return (new CSVParser(new StringReader(s))).getAllValues(); + String[][] result = (new CSVParser(new StringReader(s))).getAllValues(); + if (result == null) { + // since CSVStrategy ignores empty lines an empty array is returned + // (i.e. 
not "result = new String[][] {{""}};") + result = new String[0][0]; + } + return result; } /** @@ -109,6 +127,7 @@ public class CSVParser { * Parsing empty string will be handled as valid records containing zero * elements, so the following property holds: parseLine("").length == 0. * + * @param s CSV String to be parsed. * @return parsed String vector (which is never null) * @throws IOException in case of error * @see #setCSVStrategy() @@ -166,8 +185,8 @@ public class CSVParser { * Customized csv parser. * * The parser parses according to the given CSV dialect settings. - * Leading whitespaces are truncated whereas unicode escapes are - * not interpreted. + * Leading whitespaces are truncated, unicode escapes are + * not interpreted and empty lines are ignored. * * @param input a Reader based on "csv-formatted" input * @param delimiter a Char used for value separation @@ -201,6 +220,7 @@ public class CSVParser { * the stream. * * @return matrix of records x values ('null' when end of file) + * @throws IOException on parse error or input read-failure */ public String[][] getAllValues() throws IOException { Vector records = new Vector(); @@ -221,7 +241,7 @@ public class CSVParser { * and returns the next csv-value as string. 
* * @return next value in the input stream ('null' when end of file) - * @throws IOException + * @throws IOException on parse error or input read-failure */ public String nextValue() throws IOException { Token tkn = nextToken(); @@ -266,7 +286,11 @@ public class CSVParser { record.add(tkn.content.toString()); break; case TT_EOF: - ret = null; + if (tkn.isReady) { + record.add(tkn.content.toString()); + } else { + ret = null; + } break; case TT_INVALID: default: @@ -290,9 +314,8 @@ public class CSVParser { * number does not correspond to the record-number * * @return current line number - * @throws IOException */ - public int getLineNumber() throws IOException { + public int getLineNumber() { return in.getLineNumber(); } @@ -301,15 +324,17 @@ public class CSVParser { // ====================================================== /** - * Returns the next token - * - * a token coresponds to a term, a record change - * or and end-of-file indicator - */ + * Returns the next token. + * + * A token corresponds to a term, a record change or an + * end-of-file indicator. + * + * @return the next token found + * @throws IOException on stream access error + */ protected Token nextToken() throws IOException { Token tkn = new Token(); StringBuffer wsBuf = new StringBuffer(); - // boolean skipEmptyLines = false; // get the last read char (required for empty line detection) int lastChar = in.readAgain(); @@ -342,7 +367,7 @@ public class CSVParser { } // did we reached eof during the last iteration already ? 
TT_EOF - if (isEndOfFile(lastChar)) { + if (isEndOfFile(lastChar) || (lastChar != delimiter && isEndOfFile(c))) { tkn.type = TT_EOF; return tkn; } @@ -375,8 +400,7 @@ public class CSVParser { } else if (isEndOfFile(c)) { // end of file return TT_EOF() tkn.content.append(""); - tkn.type = TT_EORECORD; - // tkn.type = TT_EOF; + tkn.type = TT_EOF; tkn.isReady = true; } else { // next token must be a simple token @@ -417,23 +441,15 @@ public class CSVParser { tkn.isReady = true; } else if (isEndOfFile(c)) { // end of file - // tkn.type = TT_EOF; - tkn.type = TT_EORECORD; + tkn.type = TT_EOF; tkn.isReady = true; } else if (c == delimiter) { // end of token tkn.type = TT_TOKEN; tkn.isReady = true; - } else if (c == '\\') { - // handle escaped delimiters (remove escaping) - if (in.lookAhead() == this.delimiter) { - tkn.content.append((char) in.read()); - } else if (interpretUnicodeEscapes && in.lookAhead() == 'u') { - // interpret unicode escaped chars (like \u0070 -> p) - tkn.content.append((char) unicodeEscapeLexer(c)); - } else { - tkn.content.append((char) c); - } + } else if (c == '\\' && interpretUnicodeEscapes && in.lookAhead() == 'u') { + // interpret unicode escaped chars (like \u0070 -> p) + tkn.content.append((char) unicodeEscapeLexer(c)); } else if (isWhitespace(c)) { // gather whitespaces // (as long as they are not at the beginning of a token) @@ -484,7 +500,9 @@ public class CSVParser { c = in.read(); tkn.content.append((char) c); } else if (c == '\\' && in.lookAhead() == '\\') { - // doubled escape character -> add single escape char to stream + // doubled escape char, it does not escape itself, only encapsulator + // -> add both escape chars to stream + tkn.content.append((char) c); c = in.read(); tkn.content.append((char) c); } else if ( @@ -493,16 +511,18 @@ public class CSVParser { && in.lookAhead() == 'u') { // interpret unicode escaped chars (like \u0070 -> p) tkn.content.append((char) unicodeEscapeLexer(c)); + } else if (c == '\\') { + // use a 
single escape character -> add it to stream + tkn.content.append((char) c); } else { - // token finish mark reached: ignore ws till delimiter + // token finish mark (encapsulator) reached: ignore whitespace till delimiter while (!tkn.isReady) { int n = in.lookAhead(); if (n == delimiter) { tkn.type = TT_TOKEN; tkn.isReady = true; } else if (isEndOfFile(n)) { - // tkn.type = TT_EOF; - tkn.type = TT_EORECORD; + tkn.type = TT_EOF; tkn.isReady = true; } else if (isEndOfLine(n)) { // ok eo token reached @@ -538,11 +558,11 @@ public class CSVParser { /** - * Decodes Unicode escapes + * Decodes Unicode escapes. * * Interpretation of "\\uXXXX" escape sequences - * where XXXX is a hex-number - * @param c + * where XXXX is a hex-number. + * @param c current char which is discarded because it's the "\\" of "\\uXXXX" * @return the decoded character * @throws IOException on wrong unicode escape sequence or read error */ @@ -576,29 +596,40 @@ public class CSVParser { * Sets the "Default CSV" settings. * * The default csv settings are relatively restrictive but implement - * something like the "least-common-basis" of CSV. - * - * Values are separated by ',' (as the C in "CSV"). Complex values must - * be surrounded by '"'. Comments are not supported. Leading whitespaces - * are ignored, unicode escapes are not interpreted and empty lines - * are skiped. + * something like the "least-common-basis" of CSV: + * + * @return current instance of CSVParser to allow chained method calls */ - public void setCSVStrategy() { + public CSVParser setCSVStrategy() { setStrategy(',', '"', (char) 0, true, false, true); + return this; } /** - * Sets the "Excel CSV" settings. - * - * There are companies out there which interpret "C" as an abbreviation for - * "Semicolon". For these companies the following settings might be - * appropriate: - *

- * Delimiter Semicolon ';', Complex-values surrounded by '"', leading - * whitespaces are not ignored and unicode escapes are not interpreted. + * Sets the "Excel CSV" settings. There are companies out there which + * interpret "C" as an abbreviation for "Semicolon". For these companies the + * following settings might be appropriate: + *

+ * + * @return current instance of CSVParser to allow chained method calls */ - public void setExcelStrategy() { + public CSVParser setExcelStrategy() { setStrategy(';', '"', (char) 0, false, false, false); + return this; } /** @@ -612,8 +643,9 @@ public class CSVParser { * @param interpretUnicodeEscapes TRUE when unicode escapes should be * interpreted * @param ignoreEmptyLines TRUE when the parser should skip emtpy lines + * @return current instance of CSVParser to allow chained method calls */ - public void setStrategy( + public CSVParser setStrategy( char delimiter, char encapsulator, char commentStart, @@ -626,15 +658,18 @@ public class CSVParser { this.setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace); this.setUnicodeEscapeInterpretation(interpretUnicodeEscapes); this.setIgnoreEmptyLines(ignoreEmptyLines); + return this; } /** - * Set the desired delimiter + * Set the desired delimiter. * * @param c a Char used for value separation + * @return current instance of CSVParser to allow chained method calls */ - public void setDelimiter(char c) { + public CSVParser setDelimiter(char c) { this.delimiter = c; + return this; } /** @@ -647,12 +682,14 @@ public class CSVParser { } /** - * Set the desired encapsulator + * Set the desired encapsulator. * * @param c a Char used as value encapsulation marker + * @return current instance of CSVParser to allow chained method calls */ - public void setEncapsulator(char c) { + public CSVParser setEncapsulator(char c) { this.encapsulator = c; + return this; } /** @@ -665,16 +702,18 @@ public class CSVParser { } /** - * Set the desired comment start character + * Set the desired comment start character. * * @param c a Char used for comment identification + * @return current instance of CSVParser to allow chained method calls */ - public void setCommentStart(char c) { + public CSVParser setCommentStart(char c) { this.commentStart = c; + return this; } /** - * Gets the comment identifier + * Gets the comment identifier. 
* * @return the comment identifier character */ @@ -683,16 +722,18 @@ public class CSVParser { } /** - * Enables unicode escape interpretation + * Enables unicode escape interpretation. * * @param b TRUE when interpretation should be enabled + * @return current instance of CSVParser to allow chained method calls */ - public void setUnicodeEscapeInterpretation(boolean b) { + public CSVParser setUnicodeEscapeInterpretation(boolean b) { this.interpretUnicodeEscapes = b; + return this; } /** - * Shows wether unicode interpretation is enabled + * Shows whether unicode interpretation is enabled. * * @return TRUE when unicode interpretation is enabled */ @@ -704,16 +745,18 @@ public class CSVParser { * Sets the ignore-leading-whitespaces behaviour. * * Should the lexer ignore leading whitespaces when parsing non - * encapsulated tokens + * encapsulated tokens. * * @param b TRUE when leading whitespaces should be ignored + * @return current instance of CSVParser to allow chained method calls */ - public void setIgnoreLeadingWhitespaces(boolean b) { + public CSVParser setIgnoreLeadingWhitespaces(boolean b) { this.ignoreLeadingWhitespaces = b; + return this; } /** - * Shows wether unicode interpretation is enabled + * Shows whether unicode interpretation is enabled. * * @return TRUE when unicode interpretation is enabled */ @@ -726,10 +769,21 @@ public class CSVParser { * * When set to 'true' empty lines in the input will be ignored. * - * @param b + * @param b TRUE when empty lines in the input should be ignored + * @return current instance of CSVParser to allow chained method calls */ - public void setIgnoreEmptyLines(boolean b) { + public CSVParser setIgnoreEmptyLines(boolean b) { this.ignoreEmptyLines = b; + return this; + } + + /** + * Shows whether empty lines in the input are ignored. 
+ * + * @return TRUE when empty lines in the input are ignored + */ + public boolean getIgnoreEmptyLines() { + return this.ignoreEmptyLines; } // ====================================================== diff --git a/src/test/org/apache/commons/csv/CSVParserTest.java b/src/test/org/apache/commons/csv/CSVParserTest.java index 164992ac..d53e79a4 100644 --- a/src/test/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/org/apache/commons/csv/CSVParserTest.java @@ -36,12 +36,22 @@ import junit.framework.TestSuite; public class CSVParserTest extends TestCase { /** - * TestCSVParser + * TestCSVParser. */ class TestCSVParser extends CSVParser { + /** + * Test parser to investigate the type of the internal Token. + * @param in a Reader + */ TestCSVParser(Reader in) { super(in); } + /** + * Calls super.nextToken() and prints out a String representation of token + * type and content. + * @return String representation of token type and content + * @throws IOException like {@link CSVParser#nextToken()} + */ public String testNextToken() throws IOException { Token t = super.nextToken(); String tmp = Integer.toString(t.type) + ";" + t.content + ";"; @@ -51,13 +61,17 @@ public class CSVParserTest extends TestCase { } /** - * Constructor for CSVParserTest. - * @param arg0 + * Constructor for JUnit. + * @param name Name to be used in JUnit Test Environment */ - public CSVParserTest(String arg0) { - super(arg0); + public CSVParserTest(String name) { + super(name); } + /** + * Returns a Test suite for JUnit. 
+ * @return Test suite for JUnit + */ public static Test suite() { return new TestSuite(CSVParserTest.class); } @@ -95,23 +109,40 @@ public class CSVParserTest extends TestCase { public void testSetCSVStrategy() { CSVParser parser = new CSVParser(new StringReader("hello world")); // default settings - assertEquals(parser.getCommentStart(), '\0'); - assertEquals(parser.getEncapsulator(), '"'); assertEquals(parser.getDelimiter(), ','); + assertEquals(parser.getEncapsulator(), '"'); + assertEquals(parser.getCommentStart(), '\0'); + assertEquals(true, parser.getIgnoreLeadingWhitespaces()); + assertEquals(false, parser.getUnicodeEscapeInterpretation()); + assertEquals(true, parser.getIgnoreEmptyLines()); // explicit csv settings parser.setCSVStrategy(); - assertEquals(parser.getCommentStart(), '\0'); - assertEquals(parser.getEncapsulator(), '"'); assertEquals(parser.getDelimiter(), ','); + assertEquals(parser.getEncapsulator(), '"'); + assertEquals(parser.getCommentStart(), '\0'); + assertEquals(true, parser.getIgnoreLeadingWhitespaces()); + assertEquals(false, parser.getUnicodeEscapeInterpretation()); + assertEquals(true, parser.getIgnoreEmptyLines()); } + public void testSetExcelStrategy() { + CSVParser parser = new CSVParser(new StringReader("hello world")); + // explicit Excel settings + parser.setExcelStrategy(); + assertEquals(parser.getDelimiter(), ';'); + assertEquals(parser.getEncapsulator(), '"'); + assertEquals(parser.getCommentStart(), '\0'); + assertEquals(false, parser.getIgnoreLeadingWhitespaces()); + assertEquals(false, parser.getUnicodeEscapeInterpretation()); + assertEquals(false, parser.getIgnoreEmptyLines()); + } // ====================================================== // lexer tests // ====================================================== - // single line (without comment) + // Single line (without comment) public void testNextToken1() throws IOException { String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,"; TestCSVParser parser = new 
TestCSVParser(new StringReader(code)); @@ -126,14 +157,13 @@ public class CSVParserTest extends TestCase { assertEquals(CSVParser.TT_TOKEN + ";wxy;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";z;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); } // multiline including comments (and empty lines) public void testNextToken2() throws IOException { /* file: 1,2,3, - * a,b,c + * a,b x,c * * # this is a comment * d,e, @@ -172,10 +202,13 @@ public class CSVParserTest extends TestCase { parser.setCommentStart('#'); System.out.println("---------\n" + code + "\n-------------"); assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken()); + // an unquoted single backslash is not an escape char + assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken()); + assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken()); + // an unquoted single backslash is not an escape char + assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken()); + assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); } @@ -183,7 +216,7 @@ public class CSVParserTest extends TestCase { public void testNextToken4() throws IOException { /* file: a,"foo",b * a, " foo",b - * a,"foo " ,b + * a,"foo " ,b // whitespace after closing encapsulator * a, " foo " ,b */ String code = @@ -202,28 +235,29 @@ public class CSVParserTest extends TestCase { assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";a;", 
parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + "; foo ;", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); +// assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); + assertEquals(CSVParser.TT_EOF + ";b;", parser.testNextToken()); } // encapsulator tokenizer (multi line, delimiter in string) public void testNextToken5() throws IOException { String code = - "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\",\"\"\"\""; + "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\"" + + ",\"\\,\"" + + ",\"\"\"\""; TestCSVParser parser = new TestCSVParser(new StringReader(code)); parser.setCSVStrategy(); System.out.println("---------\n" + code + "\n-------------"); assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";foo\n;", parser.testNextToken()); assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); - assertEquals( - CSVParser.TT_EORECORD + ";foo\n baar ,,,;", - parser.testNextToken()); + assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;", + parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken()); - assertEquals(CSVParser.TT_EORECORD + ";\";", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); - + // escape char in quoted input only escapes delimiter + assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken()); + assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken()); } // change delimiters, comment, encapsulater @@ -259,11 +293,10 @@ public class CSVParserTest extends TestCase { {"a", "b", "c", "d"}, {"a", "b", "1 2"}, {"foo baar", "b", ""}, - {"foo\n,,\n\",,\n\"", "d", "e"}, - {""} + {"foo\n,,\n\",,\n\"", "d", "e"} }; public void testGetLine() throws IOException { - TestCSVParser parser = new 
TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); System.out.println("---------\n" + code + "\n-------------"); String[] tmp = null; for (int i = 0; i < res.length; i++) { @@ -275,7 +308,7 @@ public class CSVParserTest extends TestCase { } public void testNextValue() throws IOException { - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); System.out.println("---------\n" + code + "\n-------------"); String tmp = null; for (int i = 0; i < res.length; i++) { @@ -289,7 +322,7 @@ public class CSVParserTest extends TestCase { } public void testGetAllValues() throws IOException { - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); System.out.println("---------\n" + code + "\n-------------"); String[][] tmp = parser.getAllValues(); assertEquals(res.length, tmp.length); @@ -299,7 +332,7 @@ public class CSVParserTest extends TestCase { } } - public void testExcelStrategyTest() throws IOException { + public void testExcelStrategy1() throws IOException { String code = "value1;value2;value3;value4\r\na;b;c;d\r\n x;;;" + "\r\n\r\n\"\"\"hello\"\"\";\" \"\"world\"\"\";\"abc\ndef\";\r\n"; @@ -308,10 +341,9 @@ public class CSVParserTest extends TestCase { {"a", "b", "c", "d"}, {" x", "", "", ""}, {""}, - {"\"hello\"", " \"world\"", "abc\ndef", ""}, - {""} + {"\"hello\"", " \"world\"", "abc\ndef", ""} }; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); parser.setExcelStrategy(); System.out.println("---------\n" + code + "\n-------------"); String[][] tmp = parser.getAllValues(); @@ -322,17 +354,16 @@ public class CSVParserTest extends TestCase { } } - public void testExcelStrategyTest2() throws Exception { + public void testExcelStrategy2() throws Exception { String code = 
"foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n"; String[][] res = { {"foo", "baar"}, {""}, {"hello", ""}, {""}, - {"world", ""}, - {""} + {"world", ""} }; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); parser.setExcelStrategy(); System.out.println("---------\n" + code + "\n-------------"); String[][] tmp = parser.getAllValues(); @@ -344,7 +375,166 @@ public class CSVParserTest extends TestCase { } assertTrue(Arrays.equals(res[i], tmp[i])); } - //assertTrue(false); + } + + public void testEndOfFileBehaviourExcel() throws Exception { + String[] codes = { + "hello;\r\n\r\nworld;\r\n", + "hello;\r\n\r\nworld;", + "hello;\r\n\r\nworld;\"\"\r\n", + "hello;\r\n\r\nworld;\"\"", + "hello;\r\n\r\nworld;\n", + "hello;\r\n\r\nworld;", + "hello;\r\n\r\nworld;\"\"\n", + "hello;\r\n\r\nworld;\"\"" + }; + String[][] res = { + {"hello", ""}, + {""}, // ExcelStrategy does not ignore empty lines + {"world", ""} + }; + String code; + for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) { + code = codes[codeIndex]; + CSVParser parser = new CSVParser(new StringReader(code)); + parser.setExcelStrategy(); + System.out.println("---------\n" + code + "\n-------------"); + String[][] tmp = parser.getAllValues(); + assertEquals(res.length, tmp.length); + assertTrue(tmp.length > 0); + for (int i = 0; i < res.length; i++) { + for (int j = 0; j < tmp[i].length; j++) { + System.out.println("'" + tmp[i][j] + "'"); + } + assertTrue(Arrays.equals(res[i], tmp[i])); + } + } + } + + public void testEndOfFileBehaviorCSV() throws Exception { + String[] codes = { + "hello,\r\n\r\nworld,\r\n", + "hello,\r\n\r\nworld,", + "hello,\r\n\r\nworld,\"\"\r\n", + "hello,\r\n\r\nworld,\"\"", + "hello,\r\n\r\nworld,\n", + "hello,\r\n\r\nworld,", + "hello,\r\n\r\nworld,\"\"\n", + "hello,\r\n\r\nworld,\"\"" + }; + String[][] res = { + {"hello", ""}, // CSV Strategy ignores empty lines + {"world", ""} + }; + String code; + for (int 
codeIndex = 0; codeIndex < codes.length; codeIndex++) { + code = codes[codeIndex]; + CSVParser parser = new CSVParser(new StringReader(code)); + parser.setCSVStrategy(); + System.out.println("---------\n" + code + "\n-------------"); + String[][] tmp = parser.getAllValues(); + assertEquals(res.length, tmp.length); + assertTrue(tmp.length > 0); + for (int i = 0; i < res.length; i++) { + for (int j = 0; j < tmp[i].length; j++) { + System.out.println("'" + tmp[i][j] + "'"); + } + assertTrue(Arrays.equals(res[i], tmp[i])); + } + } + } + + public void testEmptyLineBehaviourExcel() throws Exception { + String[] codes = { + "hello;\r\n\r\n\r\n", + "hello;\n\n\n", + "hello;\"\"\r\n\r\n\r\n", + "hello;\"\"\n\n\n" + }; + String[][] res = { + {"hello", ""}, + {""}, // ExcelStrategy does not ignore empty lines + {""} + }; + String code; + for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) { + code = codes[codeIndex]; + CSVParser parser = new CSVParser(new StringReader(code)); + parser.setExcelStrategy(); + System.out.println("---------\n" + code + "\n-------------"); + String[][] tmp = parser.getAllValues(); + assertEquals(res.length, tmp.length); + assertTrue(tmp.length > 0); + for (int i = 0; i < res.length; i++) { + for (int j = 0; j < tmp[i].length; j++) { + System.out.println("'" + tmp[i][j] + "'"); + } + assertTrue(Arrays.equals(res[i], tmp[i])); + } + } + } + + public void testEmptyLineBehaviourCSV() throws Exception { + String[] codes = { + "hello,\r\n\r\n\r\n", + "hello,\n\n\n", + "hello,\"\"\r\n\r\n\r\n", + "hello,\"\"\n\n\n" + }; + String[][] res = { + {"hello", ""} // CSV Strategy ignores empty lines + }; + String code; + for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) { + code = codes[codeIndex]; + CSVParser parser = new CSVParser(new StringReader(code)); + parser.setCSVStrategy(); + System.out.println("---------\n" + code + "\n-------------"); + String[][] tmp = parser.getAllValues(); + assertEquals(res.length, tmp.length); + 
assertTrue(tmp.length > 0); + for (int i = 0; i < res.length; i++) { + for (int j = 0; j < tmp[i].length; j++) { + System.out.println("'" + tmp[i][j] + "'"); + } + assertTrue(Arrays.equals(res[i], tmp[i])); + } + } + } + + public void testBackslashEscaping() throws IOException { + String code = + "one,two,three\n" + + "on\\\"e,two\n" + + "on\"e,two\n" + + "one,\"tw\\\"o\"\n" + + "one,\"t\\,wo\"\n" + + "one,two,\"th,ree\"\n" + + "\"a\\\\\"\n" + + "a\\,b\n" + + "\"a\\\\,b\""; + String[][] res = { + { "one", "two", "three" }, + { "on\\\"e", "two" }, + { "on\"e", "two" }, + { "one", "tw\"o" }, + { "one", "t\\,wo" }, // backslash in quotes only escapes a delimiter (",") + { "one", "two", "th,ree" }, + { "a\\\\" }, // backslash in quotes only escapes a delimiter (",") + { "a\\", "b" }, // a backslash must be returned + { "a\\\\,b" } // backslash in quotes only escapes a delimiter (",") + }; + CSVParser parser = new CSVParser(new StringReader(code)); + System.out.println("---------\n" + code + "\n-------------"); + String[][] tmp = parser.getAllValues(); + assertEquals(res.length, tmp.length); + assertTrue(tmp.length > 0); + for (int i = 0; i < res.length; i++) { + for (int j = 0; j < tmp[i].length; j++) { + System.out.println("'" + tmp[i][j] + "'"); + } + assertTrue(Arrays.equals(res[i], tmp[i])); + } } // ====================================================== @@ -386,7 +576,8 @@ public class CSVParserTest extends TestCase { assertEquals(2, data[0].length); assertEquals(1, data[1].length); assertEquals("abc", data[0][0]); - assertEquals("def\\nghi", data[0][1]); + // an escape char in quotes only escapes a delimiter, not itself + assertEquals("def\\\\nghi", data[0][1]); assertEquals("jkl", data[1][0]); } @@ -402,9 +593,8 @@ public class CSVParserTest extends TestCase { public void testParse6() throws IOException { String[][] data = CSVParser.parse(""); - assertEquals(1, data.length); - assertEquals(1, data[0].length); - assertEquals("", data[0][0]); + // default strategy 
is CSV, which ignores empty lines + assertEquals(0, data.length); } public void testParse7() throws IOException { @@ -471,7 +661,7 @@ public class CSVParserTest extends TestCase { public void testUnicodeEscape() throws IOException { String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063"; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); System.out.println("---------\n" + code + "\n-------------"); parser.setUnicodeEscapeInterpretation(true); String[] data = parser.getLine(); @@ -482,7 +672,7 @@ public class CSVParserTest extends TestCase { public void testCarriageReturnLineFeedEndings() throws IOException { String code = "foo\r\nbaar,\r\nhello,world\r\n,kanu"; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); System.out.println("---------\n" + code + "\n-------------"); String[][] data = parser.getAllValues(); assertEquals(4, data.length); @@ -492,7 +682,7 @@ public class CSVParserTest extends TestCase { String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n"; //String code = "world\r\n\n"; //String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n"; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); System.out.println("---------\n" + code + "\n-------------"); String[][] data = parser.getAllValues(); // for (int i = 0; i < data.length; i++) { @@ -509,11 +699,11 @@ public class CSVParserTest extends TestCase { public void testLineTokenConsistency() throws IOException { String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n"; - TestCSVParser parser = new TestCSVParser(new StringReader(code)); + CSVParser parser = new CSVParser(new StringReader(code)); System.out.println("---------\n" + code + "\n-------------"); String[][] data = parser.getAllValues(); - parser = new TestCSVParser(new StringReader(code)); - TestCSVParser parser1 = new 
TestCSVParser(new StringReader(code)); + parser = new CSVParser(new StringReader(code)); + CSVParser parser1 = new CSVParser(new StringReader(code)); for (int i = 0; i < data.length; i++) { assertTrue(Arrays.equals(parser1.getLine(), data[i])); for (int j = 0; j < data[i].length; j++) {