diff --git a/src/java/org/apache/commons/csv/CSVParser.java b/src/java/org/apache/commons/csv/CSVParser.java
index 44b5d447..0b487588 100644
--- a/src/java/org/apache/commons/csv/CSVParser.java
+++ b/src/java/org/apache/commons/csv/CSVParser.java
@@ -34,7 +34,13 @@ import java.util.Vector;
*
Parsing of a csv-string having ';' as separator:
*
* String[][] data =
- * (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues();
+ * (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues();
+ *
+ *
+ * The API allows chained method calls, if you like this coding style:
+ *
+ * String[][] data = (new CSVParser(new StringReader("a;b\nc;d"),';'))
+ * .setExcelStrategy().setIgnoreEmptyLines(true).getAllValues();
*
*
*
@@ -45,14 +51,18 @@ import java.util.Vector;
* for more details
*/
public class CSVParser {
-
+
/** length of the initial token (content-)buffer */
private static final int INITIAL_TOKEN_LENGTH = 50;
// the token types
+ /** Token has no valid content, i.e. is in its initialized state. */
protected static final int TT_INVALID = -1;
+ /** Token with content, at beginning or in the middle of a line. */
protected static final int TT_TOKEN = 0;
+ /** Token (which can have content) when end of file is reached. */
protected static final int TT_EOF = 1;
+ /** Token with content when end of a line is reached. */
protected static final int TT_EORECORD = 2;
// the csv definition
@@ -72,12 +82,13 @@ public class CSVParser {
* It is used as contract between the lexer and the parser.
*/
class Token {
- // token type see TT_xxx constants
+ /** Token type, see TT_xxx constants. */
int type;
- // the content buffer
+ /** The content buffer. */
StringBuffer content;
- // token ready flag: indicates a valid token (ready for the parser)
+ /** Token ready flag: indicates a valid token with content (ready for the parser). */
boolean isReady;
+ /** Initializes an empty token. */
Token() {
content = new StringBuffer(INITIAL_TOKEN_LENGTH);
type = TT_INVALID;
@@ -92,6 +103,7 @@ public class CSVParser {
/**
* Parses the given String according to the default CSV strategy.
*
+ * @param s CSV String to be parsed.
* @return parsed String matrix (which is never null)
* @throws IOException in case of error
* @see #setCSVStrategy()
@@ -100,7 +112,13 @@ public class CSVParser {
if (s == null) {
throw new IllegalArgumentException("Null argument not allowed.");
}
- return (new CSVParser(new StringReader(s))).getAllValues();
+ String[][] result = (new CSVParser(new StringReader(s))).getAllValues();
+ if (result == null) {
+ // since CSVStrategy ignores empty lines an empty array is returned
+ // (i.e. not "result = new String[][] {{""}};")
+ result = new String[0][0];
+ }
+ return result;
}
/**
@@ -109,6 +127,7 @@ public class CSVParser {
* Parsing empty string will be handled as valid records containing zero
* elements, so the following property holds: parseLine("").length == 0.
*
+ * @param s CSV String to be parsed.
* @return parsed String vector (which is never null)
* @throws IOException in case of error
* @see #setCSVStrategy()
@@ -166,8 +185,8 @@ public class CSVParser {
* Customized csv parser.
*
* The parser parses according to the given CSV dialect settings.
- * Leading whitespaces are truncated whereas unicode escapes are
- * not interpreted.
+ * Leading whitespaces are truncated, unicode escapes are
+ * not interpreted and empty lines are ignored.
*
* @param input a Reader based on "csv-formatted" input
* @param delimiter a Char used for value separation
@@ -201,6 +220,7 @@ public class CSVParser {
* the stream.
*
* @return matrix of records x values ('null' when end of file)
+ * @throws IOException on parse error or input read-failure
*/
public String[][] getAllValues() throws IOException {
Vector records = new Vector();
@@ -221,7 +241,7 @@ public class CSVParser {
* and returns the next csv-value as string.
*
* @return next value in the input stream ('null' when end of file)
- * @throws IOException
+ * @throws IOException on parse error or input read-failure
*/
public String nextValue() throws IOException {
Token tkn = nextToken();
@@ -266,7 +286,11 @@ public class CSVParser {
record.add(tkn.content.toString());
break;
case TT_EOF:
- ret = null;
+ if (tkn.isReady) {
+ record.add(tkn.content.toString());
+ } else {
+ ret = null;
+ }
break;
case TT_INVALID:
default:
@@ -290,9 +314,8 @@ public class CSVParser {
* number does not correspond to the record-number
*
* @return current line number
- * @throws IOException
*/
- public int getLineNumber() throws IOException {
+ public int getLineNumber() {
return in.getLineNumber();
}
@@ -301,15 +324,17 @@ public class CSVParser {
// ======================================================
/**
- * Returns the next token
- *
- * a token coresponds to a term, a record change
- * or and end-of-file indicator
- */
+ * Returns the next token.
+ *
+ * A token corresponds to a term, a record change or an
+ * end-of-file indicator.
+ *
+ * @return the next token found
+ * @throws IOException on stream access error
+ */
protected Token nextToken() throws IOException {
Token tkn = new Token();
StringBuffer wsBuf = new StringBuffer();
- // boolean skipEmptyLines = false;
// get the last read char (required for empty line detection)
int lastChar = in.readAgain();
@@ -342,7 +367,7 @@ public class CSVParser {
}
// did we reached eof during the last iteration already ? TT_EOF
- if (isEndOfFile(lastChar)) {
+ if (isEndOfFile(lastChar) || (lastChar != delimiter && isEndOfFile(c))) {
tkn.type = TT_EOF;
return tkn;
}
@@ -375,8 +400,7 @@ public class CSVParser {
} else if (isEndOfFile(c)) {
// end of file return TT_EOF()
tkn.content.append("");
- tkn.type = TT_EORECORD;
- // tkn.type = TT_EOF;
+ tkn.type = TT_EOF;
tkn.isReady = true;
} else {
// next token must be a simple token
@@ -417,23 +441,15 @@ public class CSVParser {
tkn.isReady = true;
} else if (isEndOfFile(c)) {
// end of file
- // tkn.type = TT_EOF;
- tkn.type = TT_EORECORD;
+ tkn.type = TT_EOF;
tkn.isReady = true;
} else if (c == delimiter) {
// end of token
tkn.type = TT_TOKEN;
tkn.isReady = true;
- } else if (c == '\\') {
- // handle escaped delimiters (remove escaping)
- if (in.lookAhead() == this.delimiter) {
- tkn.content.append((char) in.read());
- } else if (interpretUnicodeEscapes && in.lookAhead() == 'u') {
- // interpret unicode escaped chars (like \u0070 -> p)
- tkn.content.append((char) unicodeEscapeLexer(c));
- } else {
- tkn.content.append((char) c);
- }
+ } else if (c == '\\' && interpretUnicodeEscapes && in.lookAhead() == 'u') {
+ // interpret unicode escaped chars (like \u0070 -> p)
+ tkn.content.append((char) unicodeEscapeLexer(c));
} else if (isWhitespace(c)) {
// gather whitespaces
// (as long as they are not at the beginning of a token)
@@ -484,7 +500,9 @@ public class CSVParser {
c = in.read();
tkn.content.append((char) c);
} else if (c == '\\' && in.lookAhead() == '\\') {
- // doubled escape character -> add single escape char to stream
+ // doubled escape char, it does not escape itself, only encapsulator
+ // -> add both escape chars to stream
+ tkn.content.append((char) c);
c = in.read();
tkn.content.append((char) c);
} else if (
@@ -493,16 +511,18 @@ public class CSVParser {
&& in.lookAhead() == 'u') {
// interpret unicode escaped chars (like \u0070 -> p)
tkn.content.append((char) unicodeEscapeLexer(c));
+ } else if (c == '\\') {
+ // use a single escape character -> add it to stream
+ tkn.content.append((char) c);
} else {
- // token finish mark reached: ignore ws till delimiter
+ // token finish mark (encapsulator) reached: ignore whitespace till delimiter
while (!tkn.isReady) {
int n = in.lookAhead();
if (n == delimiter) {
tkn.type = TT_TOKEN;
tkn.isReady = true;
} else if (isEndOfFile(n)) {
- // tkn.type = TT_EOF;
- tkn.type = TT_EORECORD;
+ tkn.type = TT_EOF;
tkn.isReady = true;
} else if (isEndOfLine(n)) {
// ok eo token reached
@@ -538,11 +558,11 @@ public class CSVParser {
/**
- * Decodes Unicode escapes
+ * Decodes Unicode escapes.
*
* Interpretation of "\\uXXXX" escape sequences
- * where XXXX is a hex-number
- * @param c
+ * where XXXX is a hex-number.
+ * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
* @return the decoded character
* @throws IOException on wrong unicode escape sequence or read error
*/
@@ -576,29 +596,40 @@ public class CSVParser {
* Sets the "Default CSV" settings.
*
* The default csv settings are relatively restrictive but implement
- * something like the "least-common-basis" of CSV.
- *
- * Values are separated by ',' (as the C in "CSV"). Complex values must
- * be surrounded by '"'. Comments are not supported. Leading whitespaces
- * are ignored, unicode escapes are not interpreted and empty lines
- * are skiped.
+ * something like the "least-common-basis" of CSV:
+ *
+ * - Delimiter of values is comma ',' (as the C in "CSV")
+ * - Complex values encapsulated by '"'
+ * - Comments are not supported
+ * - Leading whitespaces are ignored
+ * - Unicode escapes are not interpreted
+ * - empty lines are skipped
+ *
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setCSVStrategy() {
+ public CSVParser setCSVStrategy() {
setStrategy(',', '"', (char) 0, true, false, true);
+ return this;
}
/**
- * Sets the "Excel CSV" settings.
- *
- * There are companies out there which interpret "C" as an abbreviation for
- * "Semicolon". For these companies the following settings might be
- * appropriate:
- *
- * Delimiter Semicolon ';', Complex-values surrounded by '"', leading
- * whitespaces are not ignored and unicode escapes are not interpreted.
+ * Sets the "Excel CSV" settings. There are companies out there which
+ * interpret "C" as an abbreviation for "Semicolon". For these companies the
+ * following settings might be appropriate:
+ *
+ * - Delimiter of values is semicolon ';'
+ * - Complex values encapsulated by '"'
+ * - Comments are not supported
+ * - Leading whitespaces are not ignored
+ * - Unicode escapes are not interpreted
+ * - empty lines are not skipped
+ *
+ *
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setExcelStrategy() {
+ public CSVParser setExcelStrategy() {
setStrategy(';', '"', (char) 0, false, false, false);
+ return this;
}
/**
@@ -612,8 +643,9 @@ public class CSVParser {
* @param interpretUnicodeEscapes TRUE when unicode escapes should be
* interpreted
* @param ignoreEmptyLines TRUE when the parser should skip emtpy lines
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setStrategy(
+ public CSVParser setStrategy(
char delimiter,
char encapsulator,
char commentStart,
@@ -626,15 +658,18 @@ public class CSVParser {
this.setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
this.setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
this.setIgnoreEmptyLines(ignoreEmptyLines);
+ return this;
}
/**
- * Set the desired delimiter
+ * Set the desired delimiter.
*
* @param c a Char used for value separation
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setDelimiter(char c) {
+ public CSVParser setDelimiter(char c) {
this.delimiter = c;
+ return this;
}
/**
@@ -647,12 +682,14 @@ public class CSVParser {
}
/**
- * Set the desired encapsulator
+ * Set the desired encapsulator.
*
* @param c a Char used as value encapsulation marker
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setEncapsulator(char c) {
+ public CSVParser setEncapsulator(char c) {
this.encapsulator = c;
+ return this;
}
/**
@@ -665,16 +702,18 @@ public class CSVParser {
}
/**
- * Set the desired comment start character
+ * Set the desired comment start character.
*
* @param c a Char used for comment identification
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setCommentStart(char c) {
+ public CSVParser setCommentStart(char c) {
this.commentStart = c;
+ return this;
}
/**
- * Gets the comment identifier
+ * Gets the comment identifier.
*
* @return the comment identifier character
*/
@@ -683,16 +722,18 @@ public class CSVParser {
}
/**
- * Enables unicode escape interpretation
+ * Enables unicode escape interpretation.
*
* @param b TRUE when interpretation should be enabled
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setUnicodeEscapeInterpretation(boolean b) {
+ public CSVParser setUnicodeEscapeInterpretation(boolean b) {
this.interpretUnicodeEscapes = b;
+ return this;
}
/**
- * Shows wether unicode interpretation is enabled
+ * Shows whether unicode interpretation is enabled.
*
* @return TRUE when unicode interpretation is enabled
*/
@@ -704,16 +745,18 @@ public class CSVParser {
* Sets the ignore-leading-whitespaces behaviour.
*
* Should the lexer ignore leading whitespaces when parsing non
- * encapsulated tokens
+ * encapsulated tokens.
*
* @param b TRUE when leading whitespaces should be ignored
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setIgnoreLeadingWhitespaces(boolean b) {
+ public CSVParser setIgnoreLeadingWhitespaces(boolean b) {
this.ignoreLeadingWhitespaces = b;
+ return this;
}
/**
- * Shows wether unicode interpretation is enabled
+ * Shows whether unicode interpretation is enabled.
*
* @return TRUE when unicode interpretation is enabled
*/
@@ -726,10 +769,21 @@ public class CSVParser {
*
* When set to 'true' empty lines in the input will be ignored.
*
- * @param b
+ * @param b TRUE when empty lines in the input should be ignored
+ * @return current instance of CSVParser to allow chained method calls
*/
- public void setIgnoreEmptyLines(boolean b) {
+ public CSVParser setIgnoreEmptyLines(boolean b) {
this.ignoreEmptyLines = b;
+ return this;
+ }
+
+ /**
+ * Shows whether empty lines in the input are ignored.
+ *
+ * @return TRUE when empty lines in the input are ignored
+ */
+ public boolean getIgnoreEmptyLines() {
+ return this.ignoreEmptyLines;
}
// ======================================================
diff --git a/src/test/org/apache/commons/csv/CSVParserTest.java b/src/test/org/apache/commons/csv/CSVParserTest.java
index 164992ac..d53e79a4 100644
--- a/src/test/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/org/apache/commons/csv/CSVParserTest.java
@@ -36,12 +36,22 @@ import junit.framework.TestSuite;
public class CSVParserTest extends TestCase {
/**
- * TestCSVParser
+ * TestCSVParser.
*/
class TestCSVParser extends CSVParser {
+ /**
+ * Test parser to investigate the type of the internal Token.
+ * @param in a Reader
+ */
TestCSVParser(Reader in) {
super(in);
}
+ /**
+ * Calls super.nextToken() and prints out a String representation of token
+ * type and content.
+ * @return String representation of token type and content
+ * @throws IOException like {@link CSVParser#nextToken()}
+ */
public String testNextToken() throws IOException {
Token t = super.nextToken();
String tmp = Integer.toString(t.type) + ";" + t.content + ";";
@@ -51,13 +61,17 @@ public class CSVParserTest extends TestCase {
}
/**
- * Constructor for CSVParserTest.
- * @param arg0
+ * Constructor for JUnit.
+ * @param name Name to be used in JUnit Test Environment
*/
- public CSVParserTest(String arg0) {
- super(arg0);
+ public CSVParserTest(String name) {
+ super(name);
}
+ /**
+ * Returns a Test suite for JUnit.
+ * @return Test suite for JUnit
+ */
public static Test suite() {
return new TestSuite(CSVParserTest.class);
}
@@ -95,23 +109,40 @@ public class CSVParserTest extends TestCase {
public void testSetCSVStrategy() {
CSVParser parser = new CSVParser(new StringReader("hello world"));
// default settings
- assertEquals(parser.getCommentStart(), '\0');
- assertEquals(parser.getEncapsulator(), '"');
assertEquals(parser.getDelimiter(), ',');
+ assertEquals(parser.getEncapsulator(), '"');
+ assertEquals(parser.getCommentStart(), '\0');
+ assertEquals(true, parser.getIgnoreLeadingWhitespaces());
+ assertEquals(false, parser.getUnicodeEscapeInterpretation());
+ assertEquals(true, parser.getIgnoreEmptyLines());
// explicit csv settings
parser.setCSVStrategy();
- assertEquals(parser.getCommentStart(), '\0');
- assertEquals(parser.getEncapsulator(), '"');
assertEquals(parser.getDelimiter(), ',');
+ assertEquals(parser.getEncapsulator(), '"');
+ assertEquals(parser.getCommentStart(), '\0');
+ assertEquals(true, parser.getIgnoreLeadingWhitespaces());
+ assertEquals(false, parser.getUnicodeEscapeInterpretation());
+ assertEquals(true, parser.getIgnoreEmptyLines());
}
+ public void testSetExcelStrategy() {
+ CSVParser parser = new CSVParser(new StringReader("hello world"));
+ // explicit Excel settings
+ parser.setExcelStrategy();
+ assertEquals(parser.getDelimiter(), ';');
+ assertEquals(parser.getEncapsulator(), '"');
+ assertEquals(parser.getCommentStart(), '\0');
+ assertEquals(false, parser.getIgnoreLeadingWhitespaces());
+ assertEquals(false, parser.getUnicodeEscapeInterpretation());
+ assertEquals(false, parser.getIgnoreEmptyLines());
+ }
// ======================================================
// lexer tests
// ======================================================
- // single line (without comment)
+ // Single line (without comment)
public void testNextToken1() throws IOException {
String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,";
TestCSVParser parser = new TestCSVParser(new StringReader(code));
@@ -126,14 +157,13 @@ public class CSVParserTest extends TestCase {
assertEquals(CSVParser.TT_TOKEN + ";wxy;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";z;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
- assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
}
// multiline including comments (and empty lines)
public void testNextToken2() throws IOException {
/* file: 1,2,3,
- * a,b,c
+ * a,b x,c
*
* # this is a comment
* d,e,
@@ -172,10 +202,13 @@ public class CSVParserTest extends TestCase {
parser.setCommentStart('#');
System.out.println("---------\n" + code + "\n-------------");
assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
- assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken());
+ // an unquoted single backslash is not an escape char
+ assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken());
+ assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
- assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken());
- assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken());
+ // an unquoted single backslash is not an escape char
+ assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken());
+ assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
}
@@ -183,7 +216,7 @@ public class CSVParserTest extends TestCase {
public void testNextToken4() throws IOException {
/* file: a,"foo",b
* a, " foo",b
- * a,"foo " ,b
+ * a,"foo " ,b // whitespace after closing encapsulator
* a, " foo " ,b
*/
String code =
@@ -202,28 +235,29 @@ public class CSVParserTest extends TestCase {
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + "; foo ;", parser.testNextToken());
- assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
- assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
+// assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
+ assertEquals(CSVParser.TT_EOF + ";b;", parser.testNextToken());
}
// encapsulator tokenizer (multi line, delimiter in string)
public void testNextToken5() throws IOException {
String code =
- "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\",\"\"\"\"";
+ "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\""
+ + ",\"\\,\""
+ + ",\"\"\"\"";
TestCSVParser parser = new TestCSVParser(new StringReader(code));
parser.setCSVStrategy();
System.out.println("---------\n" + code + "\n-------------");
assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";foo\n;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
- assertEquals(
- CSVParser.TT_EORECORD + ";foo\n baar ,,,;",
- parser.testNextToken());
+ assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;",
+ parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
- assertEquals(CSVParser.TT_EORECORD + ";\";", parser.testNextToken());
- assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
-
+ // escape char in quoted input only escapes the encapsulator, not the delimiter
+ assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
+ assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
}
// change delimiters, comment, encapsulater
@@ -259,11 +293,10 @@ public class CSVParserTest extends TestCase {
{"a", "b", "c", "d"},
{"a", "b", "1 2"},
{"foo baar", "b", ""},
- {"foo\n,,\n\",,\n\"", "d", "e"},
- {""}
+ {"foo\n,,\n\",,\n\"", "d", "e"}
};
public void testGetLine() throws IOException {
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
String[] tmp = null;
for (int i = 0; i < res.length; i++) {
@@ -275,7 +308,7 @@ public class CSVParserTest extends TestCase {
}
public void testNextValue() throws IOException {
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
String tmp = null;
for (int i = 0; i < res.length; i++) {
@@ -289,7 +322,7 @@ public class CSVParserTest extends TestCase {
}
public void testGetAllValues() throws IOException {
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
String[][] tmp = parser.getAllValues();
assertEquals(res.length, tmp.length);
@@ -299,7 +332,7 @@ public class CSVParserTest extends TestCase {
}
}
- public void testExcelStrategyTest() throws IOException {
+ public void testExcelStrategy1() throws IOException {
String code =
"value1;value2;value3;value4\r\na;b;c;d\r\n x;;;"
+ "\r\n\r\n\"\"\"hello\"\"\";\" \"\"world\"\"\";\"abc\ndef\";\r\n";
@@ -308,10 +341,9 @@ public class CSVParserTest extends TestCase {
{"a", "b", "c", "d"},
{" x", "", "", ""},
{""},
- {"\"hello\"", " \"world\"", "abc\ndef", ""},
- {""}
+ {"\"hello\"", " \"world\"", "abc\ndef", ""}
};
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
parser.setExcelStrategy();
System.out.println("---------\n" + code + "\n-------------");
String[][] tmp = parser.getAllValues();
@@ -322,17 +354,16 @@ public class CSVParserTest extends TestCase {
}
}
- public void testExcelStrategyTest2() throws Exception {
+ public void testExcelStrategy2() throws Exception {
String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n";
String[][] res = {
{"foo", "baar"},
{""},
{"hello", ""},
{""},
- {"world", ""},
- {""}
+ {"world", ""}
};
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
parser.setExcelStrategy();
System.out.println("---------\n" + code + "\n-------------");
String[][] tmp = parser.getAllValues();
@@ -344,7 +375,166 @@ public class CSVParserTest extends TestCase {
}
assertTrue(Arrays.equals(res[i], tmp[i]));
}
- //assertTrue(false);
+ }
+
+ public void testEndOfFileBehaviourExcel() throws Exception {
+ String[] codes = {
+ "hello;\r\n\r\nworld;\r\n",
+ "hello;\r\n\r\nworld;",
+ "hello;\r\n\r\nworld;\"\"\r\n",
+ "hello;\r\n\r\nworld;\"\"",
+ "hello;\r\n\r\nworld;\n",
+ "hello;\r\n\r\nworld;",
+ "hello;\r\n\r\nworld;\"\"\n",
+ "hello;\r\n\r\nworld;\"\""
+ };
+ String[][] res = {
+ {"hello", ""},
+ {""}, // ExcelStrategy does not ignore empty lines
+ {"world", ""}
+ };
+ String code;
+ for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+ code = codes[codeIndex];
+ CSVParser parser = new CSVParser(new StringReader(code));
+ parser.setExcelStrategy();
+ System.out.println("---------\n" + code + "\n-------------");
+ String[][] tmp = parser.getAllValues();
+ assertEquals(res.length, tmp.length);
+ assertTrue(tmp.length > 0);
+ for (int i = 0; i < res.length; i++) {
+ for (int j = 0; j < tmp[i].length; j++) {
+ System.out.println("'" + tmp[i][j] + "'");
+ }
+ assertTrue(Arrays.equals(res[i], tmp[i]));
+ }
+ }
+ }
+
+ public void testEndOfFileBehaviorCSV() throws Exception {
+ String[] codes = {
+ "hello,\r\n\r\nworld,\r\n",
+ "hello,\r\n\r\nworld,",
+ "hello,\r\n\r\nworld,\"\"\r\n",
+ "hello,\r\n\r\nworld,\"\"",
+ "hello,\r\n\r\nworld,\n",
+ "hello,\r\n\r\nworld,",
+ "hello,\r\n\r\nworld,\"\"\n",
+ "hello,\r\n\r\nworld,\"\""
+ };
+ String[][] res = {
+ {"hello", ""}, // CSV Strategy ignores empty lines
+ {"world", ""}
+ };
+ String code;
+ for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+ code = codes[codeIndex];
+ CSVParser parser = new CSVParser(new StringReader(code));
+ parser.setCSVStrategy();
+ System.out.println("---------\n" + code + "\n-------------");
+ String[][] tmp = parser.getAllValues();
+ assertEquals(res.length, tmp.length);
+ assertTrue(tmp.length > 0);
+ for (int i = 0; i < res.length; i++) {
+ for (int j = 0; j < tmp[i].length; j++) {
+ System.out.println("'" + tmp[i][j] + "'");
+ }
+ assertTrue(Arrays.equals(res[i], tmp[i]));
+ }
+ }
+ }
+
+ public void testEmptyLineBehaviourExcel() throws Exception {
+ String[] codes = {
+ "hello;\r\n\r\n\r\n",
+ "hello;\n\n\n",
+ "hello;\"\"\r\n\r\n\r\n",
+ "hello;\"\"\n\n\n"
+ };
+ String[][] res = {
+ {"hello", ""},
+ {""}, // ExcelStrategy does not ignore empty lines
+ {""}
+ };
+ String code;
+ for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+ code = codes[codeIndex];
+ CSVParser parser = new CSVParser(new StringReader(code));
+ parser.setExcelStrategy();
+ System.out.println("---------\n" + code + "\n-------------");
+ String[][] tmp = parser.getAllValues();
+ assertEquals(res.length, tmp.length);
+ assertTrue(tmp.length > 0);
+ for (int i = 0; i < res.length; i++) {
+ for (int j = 0; j < tmp[i].length; j++) {
+ System.out.println("'" + tmp[i][j] + "'");
+ }
+ assertTrue(Arrays.equals(res[i], tmp[i]));
+ }
+ }
+ }
+
+ public void testEmptyLineBehaviourCSV() throws Exception {
+ String[] codes = {
+ "hello,\r\n\r\n\r\n",
+ "hello,\n\n\n",
+ "hello,\"\"\r\n\r\n\r\n",
+ "hello,\"\"\n\n\n"
+ };
+ String[][] res = {
+ {"hello", ""} // CSV Strategy ignores empty lines
+ };
+ String code;
+ for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
+ code = codes[codeIndex];
+ CSVParser parser = new CSVParser(new StringReader(code));
+ parser.setCSVStrategy();
+ System.out.println("---------\n" + code + "\n-------------");
+ String[][] tmp = parser.getAllValues();
+ assertEquals(res.length, tmp.length);
+ assertTrue(tmp.length > 0);
+ for (int i = 0; i < res.length; i++) {
+ for (int j = 0; j < tmp[i].length; j++) {
+ System.out.println("'" + tmp[i][j] + "'");
+ }
+ assertTrue(Arrays.equals(res[i], tmp[i]));
+ }
+ }
+ }
+
+ public void testBackslashEscaping() throws IOException {
+ String code =
+ "one,two,three\n"
+ + "on\\\"e,two\n"
+ + "on\"e,two\n"
+ + "one,\"tw\\\"o\"\n"
+ + "one,\"t\\,wo\"\n"
+ + "one,two,\"th,ree\"\n"
+ + "\"a\\\\\"\n"
+ + "a\\,b\n"
+ + "\"a\\\\,b\"";
+ String[][] res = {
+ { "one", "two", "three" },
+ { "on\\\"e", "two" },
+ { "on\"e", "two" },
+ { "one", "tw\"o" },
+ { "one", "t\\,wo" }, // backslash in quotes only escapes the encapsulator, not the delimiter
+ { "one", "two", "th,ree" },
+ { "a\\\\" }, // backslash in quotes does not escape itself, both are kept
+ { "a\\", "b" }, // an unquoted backslash must be returned as-is
+ { "a\\\\,b" } // quoted: doubled backslash and delimiter are both kept verbatim
+ };
+ CSVParser parser = new CSVParser(new StringReader(code));
+ System.out.println("---------\n" + code + "\n-------------");
+ String[][] tmp = parser.getAllValues();
+ assertEquals(res.length, tmp.length);
+ assertTrue(tmp.length > 0);
+ for (int i = 0; i < res.length; i++) {
+ for (int j = 0; j < tmp[i].length; j++) {
+ System.out.println("'" + tmp[i][j] + "'");
+ }
+ assertTrue(Arrays.equals(res[i], tmp[i]));
+ }
// ======================================================
@@ -386,7 +576,8 @@ public class CSVParserTest extends TestCase {
assertEquals(2, data[0].length);
assertEquals(1, data[1].length);
assertEquals("abc", data[0][0]);
- assertEquals("def\\nghi", data[0][1]);
+ // an escape char in quotes only escapes the encapsulator, not itself
+ assertEquals("def\\\\nghi", data[0][1]);
assertEquals("jkl", data[1][0]);
}
@@ -402,9 +593,8 @@ public class CSVParserTest extends TestCase {
public void testParse6() throws IOException {
String[][] data = CSVParser.parse("");
- assertEquals(1, data.length);
- assertEquals(1, data[0].length);
- assertEquals("", data[0][0]);
+ // default strategy is CSV, which ignores empty lines
+ assertEquals(0, data.length);
}
public void testParse7() throws IOException {
@@ -471,7 +661,7 @@ public class CSVParserTest extends TestCase {
public void testUnicodeEscape() throws IOException {
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
parser.setUnicodeEscapeInterpretation(true);
String[] data = parser.getLine();
@@ -482,7 +672,7 @@ public class CSVParserTest extends TestCase {
public void testCarriageReturnLineFeedEndings() throws IOException {
String code = "foo\r\nbaar,\r\nhello,world\r\n,kanu";
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
String[][] data = parser.getAllValues();
assertEquals(4, data.length);
@@ -492,7 +682,7 @@ public class CSVParserTest extends TestCase {
String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n";
//String code = "world\r\n\n";
//String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n";
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
String[][] data = parser.getAllValues();
// for (int i = 0; i < data.length; i++) {
@@ -509,11 +699,11 @@ public class CSVParserTest extends TestCase {
public void testLineTokenConsistency() throws IOException {
String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n";
- TestCSVParser parser = new TestCSVParser(new StringReader(code));
+ CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
String[][] data = parser.getAllValues();
- parser = new TestCSVParser(new StringReader(code));
- TestCSVParser parser1 = new TestCSVParser(new StringReader(code));
+ parser = new CSVParser(new StringReader(code));
+ CSVParser parser1 = new CSVParser(new StringReader(code));
for (int i = 0; i < data.length; i++) {
assertTrue(Arrays.equals(parser1.getLine(), data[i]));
for (int j = 0; j < data[i].length; j++) {