diff --git a/src/java/org/apache/commons/csv/CSVParser.java b/src/java/org/apache/commons/csv/CSVParser.java index 7e5339c5..a6535f03 100644 --- a/src/java/org/apache/commons/csv/CSVParser.java +++ b/src/java/org/apache/commons/csv/CSVParser.java @@ -134,7 +134,7 @@ public class CSVParser { * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}. */ public CSVParser(Reader input, char delimiter) { - this(input, delimiter, '"', (char) 0); + this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED); } /** @@ -347,7 +347,7 @@ public class CSVParser { eol = isEndOfLine(c); } // ok, start of token reached: comment, encapsulated, or token - if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) { + if (c == strategy.getCommentStart()) { // ignore everything till end of line and continue (incr linecount) in.readLine(); tkn = nextToken(tkn.reset()); @@ -400,19 +400,22 @@ public class CSVParser { */ private Token simpleTokenLexer(Token tkn, int c) throws IOException { wsBuf.clear(); - while (!tkn.isReady) { + for (;;) { if (isEndOfLine(c)) { // end of record tkn.type = TT_EORECORD; tkn.isReady = true; + return tkn; } else if (isEndOfFile(c)) { // end of file tkn.type = TT_EOF; tkn.isReady = true; + return tkn; } else if (c == strategy.getDelimiter()) { // end of token tkn.type = TT_TOKEN; tkn.isReady = true; + return tkn; } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') { // interpret unicode escaped chars (like \u0070 -> p) tkn.content.append((char) unicodeEscapeLexer(c)); @@ -422,6 +425,8 @@ public class CSVParser { if (tkn.content.length() > 0) { wsBuf.append((char) c); } + } else if (c == strategy.getEscape()) { + tkn.content.append((char)readEscape(c)); } else { // prepend whitespaces (if we have) if (wsBuf.length() > 0) { @@ -435,7 +440,6 @@ public class CSVParser { c = in.read(); } } - return tkn; } @@ -457,70 +461,55 @@ public class CSVParser { int startLineNumber = getLineNumber(); // ignore the given 
delimiter // assert c == delimiter; - c = in.read(); - while (!tkn.isReady) { - boolean skipRead = false; - if (c == strategy.getEncapsulator() || c == '\\') { - // check lookahead + for (;;) { + c = in.read(); + + if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') { + tkn.content.append((char) unicodeEscapeLexer(c)); + } else if (c == strategy.getEscape()) { + tkn.content.append((char)readEscape(c)); + } else if (c == strategy.getEncapsulator()) { if (in.lookAhead() == strategy.getEncapsulator()) { // double or escaped encapsulator -> add single encapsulator to token c = in.read(); tkn.content.append((char) c); - } else if (c == '\\' && in.lookAhead() == '\\') { - // doubled escape char, it does not escape itself, only encapsulator - // -> add both escape chars to stream - tkn.content.append((char) c); - c = in.read(); - tkn.content.append((char) c); - } else if ( - strategy.getUnicodeEscapeInterpretation() - && c == '\\' - && in.lookAhead() == 'u') { - // interpret unicode escaped chars (like \u0070 -> p) - tkn.content.append((char) unicodeEscapeLexer(c)); - } else if (c == '\\') { - // use a single escape character -> add it to stream - tkn.content.append((char) c); } else { // token finish mark (encapsulator) reached: ignore whitespace till delimiter - while (!tkn.isReady) { + for (;;) { c = in.read(); if (c == strategy.getDelimiter()) { tkn.type = TT_TOKEN; tkn.isReady = true; + return tkn; } else if (isEndOfFile(c)) { tkn.type = TT_EOF; tkn.isReady = true; + return tkn; } else if (isEndOfLine(c)) { // ok eo token reached tkn.type = TT_EORECORD; tkn.isReady = true; + return tkn; } else if (!isWhitespace(c)) { - // error invalid char between token and next delimiter - throw new IOException( - "(line " + getLineNumber() - + ") invalid char between encapsulated token end delimiter" - ); - } + // error invalid char between token and next delimiter + throw new IOException( + "(line " + getLineNumber() + + ") invalid char between 
encapsulated token end delimiter" + ); + } } - skipRead = true; } } else if (isEndOfFile(c)) { // error condition (end of file before end of token) throw new IOException( - "(startline " + startLineNumber + ")" - + "eof reached before encapsulated token finished" - ); + "(startline " + startLineNumber + ")" + + "eof reached before encapsulated token finished" + ); } else { // consume character tkn.content.append((char) c); } - // get the next char - if (!tkn.isReady && !skipRead) { - c = in.read(); - } } - return tkn; } @@ -554,6 +543,21 @@ public class CSVParser { } return ret; } + + private int readEscape(int c) throws IOException { + // assume c is the escape char (normally a backslash) + c = in.read(); + int out; + switch (c) { + case 'r': out='\r'; break; + case 'n': out='\n'; break; + case 't': out='\t'; break; + case 'b': out='\b'; break; + case 'f': out='\f'; break; + default : out=c; + } + return out; + } // ====================================================== // strategies diff --git a/src/java/org/apache/commons/csv/CSVStrategy.java b/src/java/org/apache/commons/csv/CSVStrategy.java index af627d1c..9ef30315 100644 --- a/src/java/org/apache/commons/csv/CSVStrategy.java +++ b/src/java/org/apache/commons/csv/CSVStrategy.java @@ -28,15 +28,21 @@ public class CSVStrategy implements Cloneable, Serializable { private char delimiter; private char encapsulator; private char commentStart; + private char escape; private boolean ignoreLeadingWhitespaces; private boolean interpretUnicodeEscapes; private boolean ignoreEmptyLines; - public static char COMMENTS_DISABLED = (char) 0; + // (char)-2 is U+FFFE and is used to signal "disabled": widened to int it is 65534, + // so it can never be confused with the EOF signal (-1), and U+FFFE is a Unicode + // noncharacter (one UTF-16 code unit, not a surrogate pair) that should not appear + // in well-formed text, so a collision with a real text char is not expected. 
+ public static char COMMENTS_DISABLED = (char)-2; + public static char ESCAPE_DISABLED = (char)-2; - public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, true, false, true); - public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, false, false, false); - public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, true, false, true); + public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true); + public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false); + public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true); public CSVStrategy(char delimiter, char encapsulator, char commentStart) { @@ -58,7 +64,8 @@ public class CSVStrategy implements Cloneable, Serializable { public CSVStrategy( char delimiter, char encapsulator, - char commentStart, + char commentStart, + char escape, boolean ignoreLeadingWhitespace, boolean interpretUnicodeEscapes, boolean ignoreEmptyLines) @@ -66,11 +73,25 @@ public class CSVStrategy implements Cloneable, Serializable { setDelimiter(delimiter); setEncapsulator(encapsulator); setCommentStart(commentStart); + setEscape(escape); setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace); setUnicodeEscapeInterpretation(interpretUnicodeEscapes); setIgnoreEmptyLines(ignoreEmptyLines); } + /** @deprecated */ + public CSVStrategy( + char delimiter, + char encapsulator, + char commentStart, + boolean ignoreLeadingWhitespace, + boolean interpretUnicodeEscapes, + boolean ignoreEmptyLines) + { + this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines); + } + + public void setDelimiter(char delimiter) { this.delimiter = delimiter; } public char getDelimiter() { return this.delimiter; } @@ 
-81,6 +102,9 @@ public class CSVStrategy implements Cloneable, Serializable { public char getCommentStart() { return this.commentStart; } public boolean isCommentingDisabled() { return this.commentStart == COMMENTS_DISABLED; } + public void setEscape(char escape) { this.escape = escape; } + public char getEscape() { return this.escape; } + public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; } public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; } diff --git a/src/test/org/apache/commons/csv/CSVParserTest.java b/src/test/org/apache/commons/csv/CSVParserTest.java index 53b341ef..a95ff7c8 100644 --- a/src/test/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/org/apache/commons/csv/CSVParserTest.java @@ -182,9 +182,7 @@ public class CSVParserTest extends TestCase { // encapsulator tokenizer (multi line, delimiter in string) public void testNextToken5() throws IOException { String code = - "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\"" - + ",\"\\,\"" - + ",\"\"\"\""; + "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\""; TestCSVParser parser = new TestCSVParser(new StringReader(code)); parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY); System.out.println("---------\n" + code + "\n-------------"); @@ -193,11 +191,8 @@ public class CSVParserTest extends TestCase { assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken()); assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken()); - assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken()); - // escape char in quoted input only escapes delimiter - assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken()); - assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken()); + assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken()); + } // change delimiters, 
comment, encapsulater @@ -207,7 +202,7 @@ public class CSVParserTest extends TestCase { * !comment;;;; * ;; */ - String code = "a;'b and \\' more\n'\n!comment;;;;\n;;"; + String code = "a;'b and '' more\n'\n!comment;;;;\n;;"; TestCSVParser parser = new TestCSVParser(new StringReader(code)); parser.setStrategy( new CSVStrategy(';', '\'', '!') ); System.out.println("---------\n" + code + "\n-------------"); @@ -226,8 +221,9 @@ public class CSVParserTest extends TestCase { "a,b,c,d\n" + " a , b , 1 2 \n" + "\"foo baar\", b,\n" - + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n"; - String[][] res = { + // + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n"; + + " \"foo\n,,\n\"\",,\n\"\"\",d,e\n"; // changed to use standard CSV escaping + String[][] res = { {"a", "b", "c", "d"}, {"a", "b", "1 2"}, {"foo baar", "b", ""}, @@ -439,7 +435,7 @@ public class CSVParserTest extends TestCase { } } - public void testBackslashEscaping() throws IOException { + public void OLDtestBackslashEscaping() throws IOException { String code = "one,two,three\n" + "on\\\"e,two\n" @@ -474,6 +470,49 @@ public class CSVParserTest extends TestCase { } } + public void testBackslashEscaping() throws IOException { + + // To avoid confusion over the need for escaping chars in java code, + // We will test with a forward slash as the escape char, and a single + // quote as the encapsulator. 
+ + String code = + "one,two,three\n" // 0 + + "'',''\n" // 1) empty encapsulators + + "/',/'\n" // 2) single encapsulators + + "'/'','/''\n" // 3) single encapsulators encapsulated via escape + + "'''',''''\n" // 4) single encapsulators encapsulated via doubling + + "/,,/,\n" // 5) separator escaped + + "//,//\n" // 6) escape escaped + + "'//','//'\n" // 7) escape escaped in encapsulation + + ""; + String[][] res = { + { "one", "two", "three" }, // 0 + { "", "" }, // 1 + { "'", "'" }, // 2 + { "'", "'" }, // 3 + { "'", "'" }, // 4 + { ",", "," }, // 5 + { "/", "/" }, // 6 + { "/", "/" }, // 7 + }; + + + CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true); + + CSVParser parser = new CSVParser(new StringReader(code), strategy); + System.out.println("---------\n" + code + "\n-------------"); + String[][] tmp = parser.getAllValues(); + assertTrue(tmp.length > 0); + for (int i = 0; i < res.length; i++) { + for (int j = 0; j < tmp[i].length; j++) { + System.out.println("'" + tmp[i][j] + "' should be '" + res[i][j] + "'"); + } + assertTrue(Arrays.equals(res[i], tmp[i])); + } + } + + public void testUnicodeEscape() throws IOException { String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063"; CSVParser parser = new CSVParser(new StringReader(code)); diff --git a/src/test/org/apache/commons/csv/CSVStrategyTest.java b/src/test/org/apache/commons/csv/CSVStrategyTest.java index ed819463..74cb31c2 100644 --- a/src/test/org/apache/commons/csv/CSVStrategyTest.java +++ b/src/test/org/apache/commons/csv/CSVStrategyTest.java @@ -91,7 +91,7 @@ public class CSVStrategyTest extends TestCase { // default settings assertEquals(strategy.getDelimiter(), ','); assertEquals(strategy.getEncapsulator(), '"'); - assertEquals(strategy.getCommentStart(), '\0'); + assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED); assertEquals(true, strategy.getIgnoreLeadingWhitespaces()); assertEquals(false, 
strategy.getUnicodeEscapeInterpretation()); assertEquals(true, strategy.getIgnoreEmptyLines()); @@ -99,7 +99,7 @@ public class CSVStrategyTest extends TestCase { parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY); assertEquals(strategy.getDelimiter(), ','); assertEquals(strategy.getEncapsulator(), '"'); - assertEquals(strategy.getCommentStart(), '\0'); + assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED); assertEquals(true, strategy.getIgnoreLeadingWhitespaces()); assertEquals(false, strategy.getUnicodeEscapeInterpretation()); assertEquals(true, strategy.getIgnoreEmptyLines()); @@ -109,7 +109,7 @@ public class CSVStrategyTest extends TestCase { CSVStrategy strategy = CSVStrategy.EXCEL_STRATEGY; assertEquals(strategy.getDelimiter(), ','); assertEquals(strategy.getEncapsulator(), '"'); - assertEquals(strategy.getCommentStart(), '\0'); + assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED); assertEquals(false, strategy.getIgnoreLeadingWhitespaces()); assertEquals(false, strategy.getUnicodeEscapeInterpretation()); assertEquals(false, strategy.getIgnoreEmptyLines());