Javadoc improvements, more unit tests, change of API to a chain style, some bugfixes

git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/sandbox/csv/trunk@383468 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Henri Yandell 2006-03-06 05:11:21 +00:00
parent 58793330f1
commit f047581f95
2 changed files with 365 additions and 121 deletions

View File

@ -34,7 +34,13 @@ import java.util.Vector;
* <p>Parsing of a csv-string having ';' as separator:</p> * <p>Parsing of a csv-string having ';' as separator:</p>
* <pre> * <pre>
* String[][] data = * String[][] data =
* (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues(); * (new CSVParser(new StringReader("a;b\nc;d"),';')).getAllValues();
* </pre>
*
* <p>The API allows chained method calls, if you like this coding style:</p>
* <pre>
* String[][] data = (new CSVParser(new StringReader("a;b\nc;d"),';'))
* .setExcelStrategy().setIgnoreEmptyLines(true).getAllValues();
* </pre> * </pre>
* *
* <p> * <p>
@ -45,14 +51,18 @@ import java.util.Vector;
* for more details</p> * for more details</p>
*/ */
public class CSVParser { public class CSVParser {
/** length of the initial token (content-)buffer */ /** length of the initial token (content-)buffer */
private static final int INITIAL_TOKEN_LENGTH = 50; private static final int INITIAL_TOKEN_LENGTH = 50;
// the token types // the token types
/** Token has no valid content, i.e. is in its initialized state. */
protected static final int TT_INVALID = -1; protected static final int TT_INVALID = -1;
/** Token with content, at beginning or in the middle of a line. */
protected static final int TT_TOKEN = 0; protected static final int TT_TOKEN = 0;
/** Token (which can have content) when end of file is reached. */
protected static final int TT_EOF = 1; protected static final int TT_EOF = 1;
/** Token with content when end of a line is reached. */
protected static final int TT_EORECORD = 2; protected static final int TT_EORECORD = 2;
// the csv definition // the csv definition
@ -72,12 +82,13 @@ public class CSVParser {
* It is used as contract between the lexer and the parser. * It is used as contract between the lexer and the parser.
*/ */
class Token { class Token {
// token type see TT_xxx constants /** Token type, see TT_xxx constants. */
int type; int type;
// the content buffer /** The content buffer. */
StringBuffer content; StringBuffer content;
// token ready flag: indicates a valid token (ready for the parser) /** Token ready flag: indicates a valid token with content (ready for the parser). */
boolean isReady; boolean isReady;
/** Initializes an empty token. */
Token() { Token() {
content = new StringBuffer(INITIAL_TOKEN_LENGTH); content = new StringBuffer(INITIAL_TOKEN_LENGTH);
type = TT_INVALID; type = TT_INVALID;
@ -92,6 +103,7 @@ public class CSVParser {
/** /**
* Parses the given String according to the default CSV strategy. * Parses the given String according to the default CSV strategy.
* *
* @param s CSV String to be parsed.
* @return parsed String matrix (which is never null) * @return parsed String matrix (which is never null)
* @throws IOException in case of error * @throws IOException in case of error
* @see #setCSVStrategy() * @see #setCSVStrategy()
@ -100,7 +112,13 @@ public class CSVParser {
if (s == null) { if (s == null) {
throw new IllegalArgumentException("Null argument not allowed."); throw new IllegalArgumentException("Null argument not allowed.");
} }
return (new CSVParser(new StringReader(s))).getAllValues(); String[][] result = (new CSVParser(new StringReader(s))).getAllValues();
if (result == null) {
// since CSVStrategy ignores empty lines an empty array is returned
// (i.e. not "result = new String[][] {{""}};")
result = new String[0][0];
}
return result;
} }
/** /**
@ -109,6 +127,7 @@ public class CSVParser {
* Parsing empty string will be handled as valid records containing zero * Parsing empty string will be handled as valid records containing zero
* elements, so the following property holds: parseLine("").length == 0. * elements, so the following property holds: parseLine("").length == 0.
* *
* @param s CSV String to be parsed.
* @return parsed String vector (which is never null) * @return parsed String vector (which is never null)
* @throws IOException in case of error * @throws IOException in case of error
* @see #setCSVStrategy() * @see #setCSVStrategy()
@ -166,8 +185,8 @@ public class CSVParser {
* Customized csv parser. * Customized csv parser.
* *
* The parser parses according to the given CSV dialect settings. * The parser parses according to the given CSV dialect settings.
* Leading whitespaces are truncated whereas unicode escapes are * Leading whitespaces are truncated, unicode escapes are
* not interpreted. * not interpreted and empty lines are ignored.
* *
* @param input a Reader based on "csv-formatted" input * @param input a Reader based on "csv-formatted" input
* @param delimiter a Char used for value separation * @param delimiter a Char used for value separation
@ -201,6 +220,7 @@ public class CSVParser {
* the stream. * the stream.
* *
* @return matrix of records x values ('null' when end of file) * @return matrix of records x values ('null' when end of file)
* @throws IOException on parse error or input read-failure
*/ */
public String[][] getAllValues() throws IOException { public String[][] getAllValues() throws IOException {
Vector records = new Vector(); Vector records = new Vector();
@ -221,7 +241,7 @@ public class CSVParser {
* and returns the next csv-value as string. * and returns the next csv-value as string.
* *
* @return next value in the input stream ('null' when end of file) * @return next value in the input stream ('null' when end of file)
* @throws IOException * @throws IOException on parse error or input read-failure
*/ */
public String nextValue() throws IOException { public String nextValue() throws IOException {
Token tkn = nextToken(); Token tkn = nextToken();
@ -266,7 +286,11 @@ public class CSVParser {
record.add(tkn.content.toString()); record.add(tkn.content.toString());
break; break;
case TT_EOF: case TT_EOF:
ret = null; if (tkn.isReady) {
record.add(tkn.content.toString());
} else {
ret = null;
}
break; break;
case TT_INVALID: case TT_INVALID:
default: default:
@ -290,9 +314,8 @@ public class CSVParser {
* number does not correspond to the record-number * number does not correspond to the record-number
* *
* @return current line number * @return current line number
* @throws IOException
*/ */
public int getLineNumber() throws IOException { public int getLineNumber() {
return in.getLineNumber(); return in.getLineNumber();
} }
@ -301,15 +324,17 @@ public class CSVParser {
// ====================================================== // ======================================================
/** /**
* Returns the next token * Returns the next token.
* *
* a token coresponds to a term, a record change * A token corresponds to a term, a record change or an
* or and end-of-file indicator * end-of-file indicator.
*/ *
* @return the next token found
* @throws IOException on stream access error
*/
protected Token nextToken() throws IOException { protected Token nextToken() throws IOException {
Token tkn = new Token(); Token tkn = new Token();
StringBuffer wsBuf = new StringBuffer(); StringBuffer wsBuf = new StringBuffer();
// boolean skipEmptyLines = false;
// get the last read char (required for empty line detection) // get the last read char (required for empty line detection)
int lastChar = in.readAgain(); int lastChar = in.readAgain();
@ -342,7 +367,7 @@ public class CSVParser {
} }
// did we reached eof during the last iteration already ? TT_EOF // did we reached eof during the last iteration already ? TT_EOF
if (isEndOfFile(lastChar)) { if (isEndOfFile(lastChar) || (lastChar != delimiter && isEndOfFile(c))) {
tkn.type = TT_EOF; tkn.type = TT_EOF;
return tkn; return tkn;
} }
@ -375,8 +400,7 @@ public class CSVParser {
} else if (isEndOfFile(c)) { } else if (isEndOfFile(c)) {
// end of file return TT_EOF() // end of file return TT_EOF()
tkn.content.append(""); tkn.content.append("");
tkn.type = TT_EORECORD; tkn.type = TT_EOF;
// tkn.type = TT_EOF;
tkn.isReady = true; tkn.isReady = true;
} else { } else {
// next token must be a simple token // next token must be a simple token
@ -417,23 +441,15 @@ public class CSVParser {
tkn.isReady = true; tkn.isReady = true;
} else if (isEndOfFile(c)) { } else if (isEndOfFile(c)) {
// end of file // end of file
// tkn.type = TT_EOF; tkn.type = TT_EOF;
tkn.type = TT_EORECORD;
tkn.isReady = true; tkn.isReady = true;
} else if (c == delimiter) { } else if (c == delimiter) {
// end of token // end of token
tkn.type = TT_TOKEN; tkn.type = TT_TOKEN;
tkn.isReady = true; tkn.isReady = true;
} else if (c == '\\') { } else if (c == '\\' && interpretUnicodeEscapes && in.lookAhead() == 'u') {
// handle escaped delimiters (remove escaping) // interpret unicode escaped chars (like \u0070 -> p)
if (in.lookAhead() == this.delimiter) { tkn.content.append((char) unicodeEscapeLexer(c));
tkn.content.append((char) in.read());
} else if (interpretUnicodeEscapes && in.lookAhead() == 'u') {
// interpret unicode escaped chars (like \u0070 -> p)
tkn.content.append((char) unicodeEscapeLexer(c));
} else {
tkn.content.append((char) c);
}
} else if (isWhitespace(c)) { } else if (isWhitespace(c)) {
// gather whitespaces // gather whitespaces
// (as long as they are not at the beginning of a token) // (as long as they are not at the beginning of a token)
@ -484,7 +500,9 @@ public class CSVParser {
c = in.read(); c = in.read();
tkn.content.append((char) c); tkn.content.append((char) c);
} else if (c == '\\' && in.lookAhead() == '\\') { } else if (c == '\\' && in.lookAhead() == '\\') {
// doubled escape character -> add single escape char to stream // doubled escape char, it does not escape itself, only encapsulator
// -> add both escape chars to stream
tkn.content.append((char) c);
c = in.read(); c = in.read();
tkn.content.append((char) c); tkn.content.append((char) c);
} else if ( } else if (
@ -493,16 +511,18 @@ public class CSVParser {
&& in.lookAhead() == 'u') { && in.lookAhead() == 'u') {
// interpret unicode escaped chars (like \u0070 -> p) // interpret unicode escaped chars (like \u0070 -> p)
tkn.content.append((char) unicodeEscapeLexer(c)); tkn.content.append((char) unicodeEscapeLexer(c));
} else if (c == '\\') {
// use a single escape character -> add it to stream
tkn.content.append((char) c);
} else { } else {
// token finish mark reached: ignore ws till delimiter // token finish mark (encapsulator) reached: ignore whitespace till delimiter
while (!tkn.isReady) { while (!tkn.isReady) {
int n = in.lookAhead(); int n = in.lookAhead();
if (n == delimiter) { if (n == delimiter) {
tkn.type = TT_TOKEN; tkn.type = TT_TOKEN;
tkn.isReady = true; tkn.isReady = true;
} else if (isEndOfFile(n)) { } else if (isEndOfFile(n)) {
// tkn.type = TT_EOF; tkn.type = TT_EOF;
tkn.type = TT_EORECORD;
tkn.isReady = true; tkn.isReady = true;
} else if (isEndOfLine(n)) { } else if (isEndOfLine(n)) {
// ok eo token reached // ok eo token reached
@ -538,11 +558,11 @@ public class CSVParser {
/** /**
* Decodes Unicode escapes * Decodes Unicode escapes.
* *
* Interpretation of "\\uXXXX" escape sequences * Interpretation of "\\uXXXX" escape sequences
* where XXXX is a hex-number * where XXXX is a hex-number.
* @param c * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
* @return the decoded character * @return the decoded character
* @throws IOException on wrong unicode escape sequence or read error * @throws IOException on wrong unicode escape sequence or read error
*/ */
@ -576,29 +596,40 @@ public class CSVParser {
* Sets the "Default CSV" settings. * Sets the "Default CSV" settings.
* *
* The default csv settings are relatively restrictive but implement * The default csv settings are relatively restrictive but implement
* something like the "least-common-basis" of CSV. * something like the "least-common-basis" of CSV:
* * <ul>
* Values are separated by ',' (as the C in "CSV"). Complex values must * <li> Delimiter of values is comma ',' (as the C in "CSV") </li>
* be surrounded by '"'. Comments are not supported. Leading whitespaces * <li> Complex values encapsulated by '"' </li>
* are ignored, unicode escapes are not interpreted and empty lines * <li> Comments are not supported </li>
* are skiped. * <li> Leading whitespaces are ignored </li>
* <li> Unicode escapes are not interpreted </li>
* <li> empty lines are skipped </li>
* </ul>
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setCSVStrategy() { public CSVParser setCSVStrategy() {
setStrategy(',', '"', (char) 0, true, false, true); setStrategy(',', '"', (char) 0, true, false, true);
return this;
} }
/** /**
* Sets the "Excel CSV" settings. * Sets the "Excel CSV" settings. There are companies out there which
* * interpret "C" as an abbreviation for "Semicolon". For these companies the
* There are companies out there which interpret "C" as an abbreviation for * following settings might be appropriate:
* "Semicolon". For these companies the following settings might be * <ul>
* appropriate: * <li> Delimiter of values is semicolon ';' </li>
* <p> * <li> Complex values encapsulated by '"' </li>
* Delimiter Semicolon ';', Complex-values surrounded by '"', leading * <li> Comments are not supported </li>
* whitespaces are not ignored and unicode escapes are not interpreted. * <li> Leading whitespaces are not ignored </li>
* <li> Unicode escapes are not interpreted </li>
* <li> empty lines are not skipped </li>
* </ul>
*
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setExcelStrategy() { public CSVParser setExcelStrategy() {
setStrategy(';', '"', (char) 0, false, false, false); setStrategy(';', '"', (char) 0, false, false, false);
return this;
} }
/** /**
@ -612,8 +643,9 @@ public class CSVParser {
* @param interpretUnicodeEscapes TRUE when unicode escapes should be * @param interpretUnicodeEscapes TRUE when unicode escapes should be
* interpreted * interpreted
* @param ignoreEmptyLines TRUE when the parser should skip emtpy lines * @param ignoreEmptyLines TRUE when the parser should skip emtpy lines
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setStrategy( public CSVParser setStrategy(
char delimiter, char delimiter,
char encapsulator, char encapsulator,
char commentStart, char commentStart,
@ -626,15 +658,18 @@ public class CSVParser {
this.setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace); this.setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
this.setUnicodeEscapeInterpretation(interpretUnicodeEscapes); this.setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
this.setIgnoreEmptyLines(ignoreEmptyLines); this.setIgnoreEmptyLines(ignoreEmptyLines);
return this;
} }
/** /**
* Set the desired delimiter * Set the desired delimiter.
* *
* @param c a Char used for value separation * @param c a Char used for value separation
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setDelimiter(char c) { public CSVParser setDelimiter(char c) {
this.delimiter = c; this.delimiter = c;
return this;
} }
/** /**
@ -647,12 +682,14 @@ public class CSVParser {
} }
/** /**
* Set the desired encapsulator * Set the desired encapsulator.
* *
* @param c a Char used as value encapsulation marker * @param c a Char used as value encapsulation marker
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setEncapsulator(char c) { public CSVParser setEncapsulator(char c) {
this.encapsulator = c; this.encapsulator = c;
return this;
} }
/** /**
@ -665,16 +702,18 @@ public class CSVParser {
} }
/** /**
* Set the desired comment start character * Set the desired comment start character.
* *
* @param c a Char used for comment identification * @param c a Char used for comment identification
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setCommentStart(char c) { public CSVParser setCommentStart(char c) {
this.commentStart = c; this.commentStart = c;
return this;
} }
/** /**
* Gets the comment identifier * Gets the comment identifier.
* *
* @return the comment identifier character * @return the comment identifier character
*/ */
@ -683,16 +722,18 @@ public class CSVParser {
} }
/** /**
* Enables unicode escape interpretation * Enables unicode escape interpretation.
* *
* @param b TRUE when interpretation should be enabled * @param b TRUE when interpretation should be enabled
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setUnicodeEscapeInterpretation(boolean b) { public CSVParser setUnicodeEscapeInterpretation(boolean b) {
this.interpretUnicodeEscapes = b; this.interpretUnicodeEscapes = b;
return this;
} }
/** /**
* Shows wether unicode interpretation is enabled * Shows wether unicode interpretation is enabled.
* *
* @return TRUE when unicode interpretation is enabled * @return TRUE when unicode interpretation is enabled
*/ */
@ -704,16 +745,18 @@ public class CSVParser {
* Sets the ignore-leading-whitespaces behaviour. * Sets the ignore-leading-whitespaces behaviour.
* *
* Should the lexer ignore leading whitespaces when parsing non * Should the lexer ignore leading whitespaces when parsing non
* encapsulated tokens * encapsulated tokens.
* *
* @param b TRUE when leading whitespaces should be ignored * @param b TRUE when leading whitespaces should be ignored
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setIgnoreLeadingWhitespaces(boolean b) { public CSVParser setIgnoreLeadingWhitespaces(boolean b) {
this.ignoreLeadingWhitespaces = b; this.ignoreLeadingWhitespaces = b;
return this;
} }
/** /**
* Shows wether unicode interpretation is enabled * Shows whether unicode interpretation is enabled.
* *
* @return TRUE when unicode interpretation is enabled * @return TRUE when unicode interpretation is enabled
*/ */
@ -726,10 +769,21 @@ public class CSVParser {
* *
* When set to 'true' empty lines in the input will be ignored. * When set to 'true' empty lines in the input will be ignored.
* *
* @param b * @param b TRUE when empty lines in the input should be ignored
* @return current instance of CSVParser to allow chained method calls
*/ */
public void setIgnoreEmptyLines(boolean b) { public CSVParser setIgnoreEmptyLines(boolean b) {
this.ignoreEmptyLines = b; this.ignoreEmptyLines = b;
return this;
}
/**
* Shows whether empty lines in the input are ignored.
*
* @return TRUE when empty lines in the input are ignored
*/
public boolean getIgnoreEmptyLines() {
return this.ignoreEmptyLines;
} }
// ====================================================== // ======================================================

View File

@ -36,12 +36,22 @@ import junit.framework.TestSuite;
public class CSVParserTest extends TestCase { public class CSVParserTest extends TestCase {
/** /**
* TestCSVParser * TestCSVParser.
*/ */
class TestCSVParser extends CSVParser { class TestCSVParser extends CSVParser {
/**
* Test parser to investigate the type of the internal Token.
* @param in a Reader
*/
TestCSVParser(Reader in) { TestCSVParser(Reader in) {
super(in); super(in);
} }
/**
* Calls super.nextToken() and prints out a String representation of token
* type and content.
* @return String representation of token type and content
* @throws IOException like {@link CSVParser#nextToken()}
*/
public String testNextToken() throws IOException { public String testNextToken() throws IOException {
Token t = super.nextToken(); Token t = super.nextToken();
String tmp = Integer.toString(t.type) + ";" + t.content + ";"; String tmp = Integer.toString(t.type) + ";" + t.content + ";";
@ -51,13 +61,17 @@ public class CSVParserTest extends TestCase {
} }
/** /**
* Constructor for CSVParserTest. * Constructor for JUnit.
* @param arg0 * @param name Name to be used in JUnit Test Environment
*/ */
public CSVParserTest(String arg0) { public CSVParserTest(String name) {
super(arg0); super(name);
} }
/**
* Returns a Test suite for JUnit.
* @return Test suite for JUnit
*/
public static Test suite() { public static Test suite() {
return new TestSuite(CSVParserTest.class); return new TestSuite(CSVParserTest.class);
} }
@ -95,23 +109,40 @@ public class CSVParserTest extends TestCase {
public void testSetCSVStrategy() { public void testSetCSVStrategy() {
CSVParser parser = new CSVParser(new StringReader("hello world")); CSVParser parser = new CSVParser(new StringReader("hello world"));
// default settings // default settings
assertEquals(parser.getCommentStart(), '\0');
assertEquals(parser.getEncapsulator(), '"');
assertEquals(parser.getDelimiter(), ','); assertEquals(parser.getDelimiter(), ',');
assertEquals(parser.getEncapsulator(), '"');
assertEquals(parser.getCommentStart(), '\0');
assertEquals(true, parser.getIgnoreLeadingWhitespaces());
assertEquals(false, parser.getUnicodeEscapeInterpretation());
assertEquals(true, parser.getIgnoreEmptyLines());
// explicit csv settings // explicit csv settings
parser.setCSVStrategy(); parser.setCSVStrategy();
assertEquals(parser.getCommentStart(), '\0');
assertEquals(parser.getEncapsulator(), '"');
assertEquals(parser.getDelimiter(), ','); assertEquals(parser.getDelimiter(), ',');
assertEquals(parser.getEncapsulator(), '"');
assertEquals(parser.getCommentStart(), '\0');
assertEquals(true, parser.getIgnoreLeadingWhitespaces());
assertEquals(false, parser.getUnicodeEscapeInterpretation());
assertEquals(true, parser.getIgnoreEmptyLines());
} }
public void testSetExcelStrategy() {
CSVParser parser = new CSVParser(new StringReader("hello world"));
// explicit Excel settings
parser.setExcelStrategy();
assertEquals(parser.getDelimiter(), ';');
assertEquals(parser.getEncapsulator(), '"');
assertEquals(parser.getCommentStart(), '\0');
assertEquals(false, parser.getIgnoreLeadingWhitespaces());
assertEquals(false, parser.getUnicodeEscapeInterpretation());
assertEquals(false, parser.getIgnoreEmptyLines());
}
// ====================================================== // ======================================================
// lexer tests // lexer tests
// ====================================================== // ======================================================
// single line (without comment) // Single line (without comment)
public void testNextToken1() throws IOException { public void testNextToken1() throws IOException {
String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,"; String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,";
TestCSVParser parser = new TestCSVParser(new StringReader(code)); TestCSVParser parser = new TestCSVParser(new StringReader(code));
@ -126,14 +157,13 @@ public class CSVParserTest extends TestCase {
assertEquals(CSVParser.TT_TOKEN + ";wxy;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";wxy;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";z;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";z;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
} }
// multiline including comments (and empty lines) // multiline including comments (and empty lines)
public void testNextToken2() throws IOException { public void testNextToken2() throws IOException {
/* file: 1,2,3, /* file: 1,2,3,
* a,b,c * a,b x,c
* *
* # this is a comment * # this is a comment
* d,e, * d,e,
@ -172,10 +202,13 @@ public class CSVParserTest extends TestCase {
parser.setCommentStart('#'); parser.setCommentStart('#');
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken()); // an unquoted single backslash is not an escape char
assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";,;", parser.testNextToken()); // an unquoted single backslash is not an escape char
assertEquals(CSVParser.TT_EORECORD + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";\\;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";;", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken());
} }
@ -183,7 +216,7 @@ public class CSVParserTest extends TestCase {
public void testNextToken4() throws IOException { public void testNextToken4() throws IOException {
/* file: a,"foo",b /* file: a,"foo",b
* a, " foo",b * a, " foo",b
* a,"foo " ,b * a,"foo " ,b // whitespace after closing encapsulator
* a, " foo " ,b * a, " foo " ,b
*/ */
String code = String code =
@ -202,28 +235,29 @@ public class CSVParserTest extends TestCase {
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + "; foo ;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + "; foo ;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); // assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_EOF + ";b;", parser.testNextToken());
} }
// encapsulator tokenizer (multi line, delimiter in string) // encapsulator tokenizer (multi line, delimiter in string)
public void testNextToken5() throws IOException { public void testNextToken5() throws IOException {
String code = String code =
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\",\"\"\"\""; "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\""
+ ",\"\\,\""
+ ",\"\"\"\"";
TestCSVParser parser = new TestCSVParser(new StringReader(code)); TestCSVParser parser = new TestCSVParser(new StringReader(code));
parser.setCSVStrategy(); parser.setCSVStrategy();
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";a;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";foo\n;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";foo\n;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
assertEquals( assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;",
CSVParser.TT_EORECORD + ";foo\n baar ,,,;", parser.testNextToken());
parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";\";", parser.testNextToken()); // escape char in quoted input only escapes delimiter
assertEquals(CSVParser.TT_EOF + ";;", parser.testNextToken()); assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
} }
// change delimiters, comment, encapsulater // change delimiters, comment, encapsulater
@ -259,11 +293,10 @@ public class CSVParserTest extends TestCase {
{"a", "b", "c", "d"}, {"a", "b", "c", "d"},
{"a", "b", "1 2"}, {"a", "b", "1 2"},
{"foo baar", "b", ""}, {"foo baar", "b", ""},
{"foo\n,,\n\",,\n\"", "d", "e"}, {"foo\n,,\n\",,\n\"", "d", "e"}
{""}
}; };
public void testGetLine() throws IOException { public void testGetLine() throws IOException {
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String[] tmp = null; String[] tmp = null;
for (int i = 0; i < res.length; i++) { for (int i = 0; i < res.length; i++) {
@ -275,7 +308,7 @@ public class CSVParserTest extends TestCase {
} }
public void testNextValue() throws IOException { public void testNextValue() throws IOException {
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String tmp = null; String tmp = null;
for (int i = 0; i < res.length; i++) { for (int i = 0; i < res.length; i++) {
@ -289,7 +322,7 @@ public class CSVParserTest extends TestCase {
} }
public void testGetAllValues() throws IOException { public void testGetAllValues() throws IOException {
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String[][] tmp = parser.getAllValues(); String[][] tmp = parser.getAllValues();
assertEquals(res.length, tmp.length); assertEquals(res.length, tmp.length);
@ -299,7 +332,7 @@ public class CSVParserTest extends TestCase {
} }
} }
public void testExcelStrategyTest() throws IOException { public void testExcelStrategy1() throws IOException {
String code = String code =
"value1;value2;value3;value4\r\na;b;c;d\r\n x;;;" "value1;value2;value3;value4\r\na;b;c;d\r\n x;;;"
+ "\r\n\r\n\"\"\"hello\"\"\";\" \"\"world\"\"\";\"abc\ndef\";\r\n"; + "\r\n\r\n\"\"\"hello\"\"\";\" \"\"world\"\"\";\"abc\ndef\";\r\n";
@ -308,10 +341,9 @@ public class CSVParserTest extends TestCase {
{"a", "b", "c", "d"}, {"a", "b", "c", "d"},
{" x", "", "", ""}, {" x", "", "", ""},
{""}, {""},
{"\"hello\"", " \"world\"", "abc\ndef", ""}, {"\"hello\"", " \"world\"", "abc\ndef", ""}
{""}
}; };
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
parser.setExcelStrategy(); parser.setExcelStrategy();
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String[][] tmp = parser.getAllValues(); String[][] tmp = parser.getAllValues();
@ -322,17 +354,16 @@ public class CSVParserTest extends TestCase {
} }
} }
public void testExcelStrategyTest2() throws Exception { public void testExcelStrategy2() throws Exception {
String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n"; String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n";
String[][] res = { String[][] res = {
{"foo", "baar"}, {"foo", "baar"},
{""}, {""},
{"hello", ""}, {"hello", ""},
{""}, {""},
{"world", ""}, {"world", ""}
{""}
}; };
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
parser.setExcelStrategy(); parser.setExcelStrategy();
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String[][] tmp = parser.getAllValues(); String[][] tmp = parser.getAllValues();
@ -344,7 +375,166 @@ public class CSVParserTest extends TestCase {
} }
assertTrue(Arrays.equals(res[i], tmp[i])); assertTrue(Arrays.equals(res[i], tmp[i]));
} }
//assertTrue(false); }
/**
 * Checks that, with the Excel strategy, every listed variant of the input
 * (last line ended by CRLF, by LF or by end of input; trailing field empty
 * or an empty quoted string) is parsed into the same three records.
 *
 * @throws Exception if the parser fails while reading the input
 */
public void testEndOfFileBehaviourExcel() throws Exception {
    String[] codes = {
        "hello;\r\n\r\nworld;\r\n",
        "hello;\r\n\r\nworld;",
        "hello;\r\n\r\nworld;\"\"\r\n",
        "hello;\r\n\r\nworld;\"\"",
        "hello;\r\n\r\nworld;\n",
        "hello;\r\n\r\nworld;",
        "hello;\r\n\r\nworld;\"\"\n",
        "hello;\r\n\r\nworld;\"\""
    };
    String[][] res = {
        {"hello", ""},
        {""},  // ExcelStrategy does not ignore empty lines
        {"world", ""}
    };
    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
        String code = codes[codeIndex];
        CSVParser parser = new CSVParser(new StringReader(code));
        parser.setExcelStrategy();
        System.out.println("---------\n" + code + "\n-------------");
        String[][] tmp = parser.getAllValues();
        // Several inputs share these assertions, so name the failing one.
        String where = "input #" + codeIndex;
        // res is non-empty, so this also guarantees tmp is non-empty.
        assertEquals(where + ": record count", res.length, tmp.length);
        for (int i = 0; i < res.length; i++) {
            for (int j = 0; j < tmp[i].length; j++) {
                System.out.println("'" + tmp[i][j] + "'");
            }
            assertTrue(where + ": record " + i + " differs",
                    Arrays.equals(res[i], tmp[i]));
        }
    }
}
/**
 * Checks that, with the CSV strategy, every listed variant of the input
 * (last line ended by CRLF, by LF or by end of input; trailing field empty
 * or an empty quoted string) is parsed into the same two records, the
 * empty line in between being dropped.
 *
 * @throws Exception if the parser fails while reading the input
 */
public void testEndOfFileBehaviorCSV() throws Exception {
    String[] codes = {
        "hello,\r\n\r\nworld,\r\n",
        "hello,\r\n\r\nworld,",
        "hello,\r\n\r\nworld,\"\"\r\n",
        "hello,\r\n\r\nworld,\"\"",
        "hello,\r\n\r\nworld,\n",
        "hello,\r\n\r\nworld,",
        "hello,\r\n\r\nworld,\"\"\n",
        "hello,\r\n\r\nworld,\"\""
    };
    String[][] res = {
        {"hello", ""},  // CSV Strategy ignores empty lines
        {"world", ""}
    };
    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
        String code = codes[codeIndex];
        CSVParser parser = new CSVParser(new StringReader(code));
        parser.setCSVStrategy();
        System.out.println("---------\n" + code + "\n-------------");
        String[][] tmp = parser.getAllValues();
        // Several inputs share these assertions, so name the failing one.
        String where = "input #" + codeIndex;
        // res is non-empty, so this also guarantees tmp is non-empty.
        assertEquals(where + ": record count", res.length, tmp.length);
        for (int i = 0; i < res.length; i++) {
            for (int j = 0; j < tmp[i].length; j++) {
                System.out.println("'" + tmp[i][j] + "'");
            }
            assertTrue(where + ": record " + i + " differs",
                    Arrays.equals(res[i], tmp[i]));
        }
    }
}
/**
 * Checks that, with the Excel strategy, trailing empty lines are returned
 * as records of their own, for both CRLF and LF line endings and for an
 * empty or empty-quoted last field.
 *
 * @throws Exception if the parser fails while reading the input
 */
public void testEmptyLineBehaviourExcel() throws Exception {
    String[] codes = {
        "hello;\r\n\r\n\r\n",
        "hello;\n\n\n",
        "hello;\"\"\r\n\r\n\r\n",
        "hello;\"\"\n\n\n"
    };
    String[][] res = {
        {"hello", ""},
        {""},  // ExcelStrategy does not ignore empty lines
        {""}
    };
    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
        String code = codes[codeIndex];
        CSVParser parser = new CSVParser(new StringReader(code));
        parser.setExcelStrategy();
        System.out.println("---------\n" + code + "\n-------------");
        String[][] tmp = parser.getAllValues();
        // Several inputs share these assertions, so name the failing one.
        String where = "input #" + codeIndex;
        // res is non-empty, so this also guarantees tmp is non-empty.
        assertEquals(where + ": record count", res.length, tmp.length);
        for (int i = 0; i < res.length; i++) {
            for (int j = 0; j < tmp[i].length; j++) {
                System.out.println("'" + tmp[i][j] + "'");
            }
            assertTrue(where + ": record " + i + " differs",
                    Arrays.equals(res[i], tmp[i]));
        }
    }
}
/**
 * Checks that, with the CSV strategy, trailing empty lines are dropped so
 * that only the single non-empty record is returned, for both CRLF and LF
 * line endings and for an empty or empty-quoted last field.
 *
 * @throws Exception if the parser fails while reading the input
 */
public void testEmptyLineBehaviourCSV() throws Exception {
    String[] codes = {
        "hello,\r\n\r\n\r\n",
        "hello,\n\n\n",
        "hello,\"\"\r\n\r\n\r\n",
        "hello,\"\"\n\n\n"
    };
    String[][] res = {
        {"hello", ""}  // CSV Strategy ignores empty lines
    };
    for (int codeIndex = 0; codeIndex < codes.length; codeIndex++) {
        String code = codes[codeIndex];
        CSVParser parser = new CSVParser(new StringReader(code));
        parser.setCSVStrategy();
        System.out.println("---------\n" + code + "\n-------------");
        String[][] tmp = parser.getAllValues();
        // Several inputs share these assertions, so name the failing one.
        String where = "input #" + codeIndex;
        // res is non-empty, so this also guarantees tmp is non-empty.
        assertEquals(where + ": record count", res.length, tmp.length);
        for (int i = 0; i < res.length; i++) {
            for (int j = 0; j < tmp[i].length; j++) {
                System.out.println("'" + tmp[i][j] + "'");
            }
            assertTrue(where + ": record " + i + " differs",
                    Arrays.equals(res[i], tmp[i]));
        }
    }
}
/**
 * Parses nine lines that mix backslashes, quotes and delimiters, using a
 * parser on which no strategy is set explicitly, and compares every
 * returned record with the expected field values listed in {@code res}.
 *
 * @throws IOException if the parser fails while reading the input
 */
public void testBackslashEscaping() throws IOException {
// one input line per expected record in res below
String code =
"one,two,three\n"
+ "on\\\"e,two\n"
+ "on\"e,two\n"
+ "one,\"tw\\\"o\"\n"
+ "one,\"t\\,wo\"\n"
+ "one,two,\"th,ree\"\n"
+ "\"a\\\\\"\n"
+ "a\\,b\n"
+ "\"a\\\\,b\"";
String[][] res = {
{ "one", "two", "three" },
{ "on\\\"e", "two" },
{ "on\"e", "two" },
{ "one", "tw\"o" },
{ "one", "t\\,wo" }, // backslash in quotes only escapes a delimiter (",")
{ "one", "two", "th,ree" },
{ "a\\\\" }, // backslash in quotes only escapes a delimiter (",")
{ "a\\", "b" }, // a backslash must be returned
{ "a\\\\,b" } // backslash in quotes only escapes a delimiter (",")
};
CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------");
String[][] tmp = parser.getAllValues();
// the record count must match before records are compared by index
assertEquals(res.length, tmp.length);
assertTrue(tmp.length > 0);
for (int i = 0; i < res.length; i++) {
// print the parsed fields of this record to stdout
for (int j = 0; j < tmp[i].length; j++) {
System.out.println("'" + tmp[i][j] + "'");
}
assertTrue(Arrays.equals(res[i], tmp[i]));
}
} }
// ====================================================== // ======================================================
@ -386,7 +576,8 @@ public class CSVParserTest extends TestCase {
assertEquals(2, data[0].length); assertEquals(2, data[0].length);
assertEquals(1, data[1].length); assertEquals(1, data[1].length);
assertEquals("abc", data[0][0]); assertEquals("abc", data[0][0]);
assertEquals("def\\nghi", data[0][1]); // an escape char in quotes only escapes a delimiter, not itself
assertEquals("def\\\\nghi", data[0][1]);
assertEquals("jkl", data[1][0]); assertEquals("jkl", data[1][0]);
} }
@ -402,9 +593,8 @@ public class CSVParserTest extends TestCase {
public void testParse6() throws IOException { public void testParse6() throws IOException {
String[][] data = CSVParser.parse(""); String[][] data = CSVParser.parse("");
assertEquals(1, data.length); // default strategy is CSV, which ignores empty lines
assertEquals(1, data[0].length); assertEquals(0, data.length);
assertEquals("", data[0][0]);
} }
public void testParse7() throws IOException { public void testParse7() throws IOException {
@ -471,7 +661,7 @@ public class CSVParserTest extends TestCase {
public void testUnicodeEscape() throws IOException { public void testUnicodeEscape() throws IOException {
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063"; String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
parser.setUnicodeEscapeInterpretation(true); parser.setUnicodeEscapeInterpretation(true);
String[] data = parser.getLine(); String[] data = parser.getLine();
@ -482,7 +672,7 @@ public class CSVParserTest extends TestCase {
public void testCarriageReturnLineFeedEndings() throws IOException { public void testCarriageReturnLineFeedEndings() throws IOException {
String code = "foo\r\nbaar,\r\nhello,world\r\n,kanu"; String code = "foo\r\nbaar,\r\nhello,world\r\n,kanu";
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String[][] data = parser.getAllValues(); String[][] data = parser.getAllValues();
assertEquals(4, data.length); assertEquals(4, data.length);
@ -492,7 +682,7 @@ public class CSVParserTest extends TestCase {
String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n"; String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n";
//String code = "world\r\n\n"; //String code = "world\r\n\n";
//String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n"; //String code = "foo;baar\r\n\r\nhello;\r\n\r\nworld;\r\n";
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String[][] data = parser.getAllValues(); String[][] data = parser.getAllValues();
// for (int i = 0; i < data.length; i++) { // for (int i = 0; i < data.length; i++) {
@ -509,11 +699,11 @@ public class CSVParserTest extends TestCase {
public void testLineTokenConsistency() throws IOException { public void testLineTokenConsistency() throws IOException {
String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n"; String code = "\nfoo,baar\n\r\n,\n\n,world\r\n\n";
TestCSVParser parser = new TestCSVParser(new StringReader(code)); CSVParser parser = new CSVParser(new StringReader(code));
System.out.println("---------\n" + code + "\n-------------"); System.out.println("---------\n" + code + "\n-------------");
String[][] data = parser.getAllValues(); String[][] data = parser.getAllValues();
parser = new TestCSVParser(new StringReader(code)); parser = new CSVParser(new StringReader(code));
TestCSVParser parser1 = new TestCSVParser(new StringReader(code)); CSVParser parser1 = new CSVParser(new StringReader(code));
for (int i = 0; i < data.length; i++) { for (int i = 0; i < data.length; i++) {
assertTrue(Arrays.equals(parser1.getLine(), data[i])); assertTrue(Arrays.equals(parser1.getLine(), data[i]));
for (int j = 0; j < data[i].length; j++) { for (int j = 0; j < data[i].length; j++) {