SANDBOX-206: add escape to strategy, turn off backslash-style escaping by default

git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609155 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2008-01-05 15:37:26 +00:00
parent f34ce7d093
commit b55fb21d78
4 changed files with 127 additions and 60 deletions

View File

@ -134,7 +134,7 @@ public class CSVParser {
* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
*/
public CSVParser(Reader input, char delimiter) {
this(input, delimiter, '"', (char) 0);
this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
}
/**
@ -347,7 +347,7 @@ public class CSVParser {
eol = isEndOfLine(c);
}
// ok, start of token reached: comment, encapsulated, or token
if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
if (c == strategy.getCommentStart()) {
// ignore everything till end of line and continue (incr linecount)
in.readLine();
tkn = nextToken(tkn.reset());
@ -400,19 +400,22 @@ public class CSVParser {
*/
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
wsBuf.clear();
while (!tkn.isReady) {
for (;;) {
if (isEndOfLine(c)) {
// end of record
tkn.type = TT_EORECORD;
tkn.isReady = true;
return tkn;
} else if (isEndOfFile(c)) {
// end of file
tkn.type = TT_EOF;
tkn.isReady = true;
return tkn;
} else if (c == strategy.getDelimiter()) {
// end of token
tkn.type = TT_TOKEN;
tkn.isReady = true;
return tkn;
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
// interpret unicode escaped chars (like \u0070 -> p)
tkn.content.append((char) unicodeEscapeLexer(c));
@ -422,6 +425,8 @@ public class CSVParser {
if (tkn.content.length() > 0) {
wsBuf.append((char) c);
}
} else if (c == strategy.getEscape()) {
tkn.content.append((char)readEscape(c));
} else {
// prepend whitespaces (if we have)
if (wsBuf.length() > 0) {
@ -435,7 +440,6 @@ public class CSVParser {
c = in.read();
}
}
return tkn;
}
@ -457,70 +461,55 @@ public class CSVParser {
int startLineNumber = getLineNumber();
// ignore the given delimiter
// assert c == delimiter;
c = in.read();
while (!tkn.isReady) {
boolean skipRead = false;
if (c == strategy.getEncapsulator() || c == '\\') {
// check lookahead
for (;;) {
c = in.read();
if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
tkn.content.append((char) unicodeEscapeLexer(c));
} else if (c == strategy.getEscape()) {
tkn.content.append((char)readEscape(c));
} else if (c == strategy.getEncapsulator()) {
if (in.lookAhead() == strategy.getEncapsulator()) {
// double or escaped encapsulator -> add single encapsulator to token
c = in.read();
tkn.content.append((char) c);
} else if (c == '\\' && in.lookAhead() == '\\') {
// doubled escape char, it does not escape itself, only encapsulator
// -> add both escape chars to stream
tkn.content.append((char) c);
c = in.read();
tkn.content.append((char) c);
} else if (
strategy.getUnicodeEscapeInterpretation()
&& c == '\\'
&& in.lookAhead() == 'u') {
// interpret unicode escaped chars (like \u0070 -> p)
tkn.content.append((char) unicodeEscapeLexer(c));
} else if (c == '\\') {
// use a single escape character -> add it to stream
tkn.content.append((char) c);
} else {
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
while (!tkn.isReady) {
for (;;) {
c = in.read();
if (c == strategy.getDelimiter()) {
tkn.type = TT_TOKEN;
tkn.isReady = true;
return tkn;
} else if (isEndOfFile(c)) {
tkn.type = TT_EOF;
tkn.isReady = true;
return tkn;
} else if (isEndOfLine(c)) {
// ok eo token reached
tkn.type = TT_EORECORD;
tkn.isReady = true;
return tkn;
} else if (!isWhitespace(c)) {
// error invalid char between token and next delimiter
throw new IOException(
"(line " + getLineNumber()
+ ") invalid char between encapsulated token end delimiter"
);
}
// error invalid char between token and next delimiter
throw new IOException(
"(line " + getLineNumber()
+ ") invalid char between encapsulated token end delimiter"
);
}
}
skipRead = true;
}
} else if (isEndOfFile(c)) {
// error condition (end of file before end of token)
throw new IOException(
"(startline " + startLineNumber + ")"
+ "eof reached before encapsulated token finished"
);
"(startline " + startLineNumber + ")"
+ "eof reached before encapsulated token finished"
);
} else {
// consume character
tkn.content.append((char) c);
}
// get the next char
if (!tkn.isReady && !skipRead) {
c = in.read();
}
}
return tkn;
}
@ -554,6 +543,21 @@ public class CSVParser {
}
return ret;
}
/**
 * Reads the character that follows an escape character and returns the
 * character it stands for.
 *
 * The letters r, n, t, b and f are translated to carriage return, line feed,
 * tab, backspace and form feed respectively; any other character is returned
 * unchanged (so an escaped delimiter, encapsulator or escape char yields
 * itself).
 *
 * @param c the escape character that was just read (normally a backslash);
 *          its value is not inspected
 * @return the unescaped character
 * @throws IOException if reading fails, or if the input ends directly after
 *                     the escape character
 */
private int readEscape(int c) throws IOException {
    // assume c is the escape char (normally a backslash)
    int escaped = in.read();
    if (isEndOfFile(escaped)) {
        // Without this check the EOF marker would be returned to the caller,
        // which casts it to char and appends a bogus character to the token.
        throw new IOException(
            "(line " + getLineNumber()
            + ") eof reached before escape sequence finished"
        );
    }
    switch (escaped) {
        case 'r': return '\r';
        case 'n': return '\n';
        case 't': return '\t';
        case 'b': return '\b';
        case 'f': return '\f';
        default : return escaped;
    }
}
// ======================================================
// strategies

View File

@ -28,15 +28,21 @@ public class CSVStrategy implements Cloneable, Serializable {
private char delimiter;
private char encapsulator;
private char commentStart;
private char escape;
private boolean ignoreLeadingWhitespaces;
private boolean interpretUnicodeEscapes;
private boolean ignoreEmptyLines;
public static char COMMENTS_DISABLED = (char) 0;
// -2 is used to signal disabled, because it won't be confused with
// an EOF signal (-1), and because (char)-2 is U+FFFE, which Unicode
// reserves as a noncharacter (the byte-swapped form of the BOM, U+FEFF),
// so there should never be a collision with a real text char.
public static char COMMENTS_DISABLED = (char)-2;
public static char ESCAPE_DISABLED = (char)-2;
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, true, false, true);
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, false, false, false);
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, true, false, true);
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false);
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
@ -58,7 +64,8 @@ public class CSVStrategy implements Cloneable, Serializable {
public CSVStrategy(
char delimiter,
char encapsulator,
char commentStart,
char commentStart,
char escape,
boolean ignoreLeadingWhitespace,
boolean interpretUnicodeEscapes,
boolean ignoreEmptyLines)
@ -66,11 +73,25 @@ public class CSVStrategy implements Cloneable, Serializable {
setDelimiter(delimiter);
setEncapsulator(encapsulator);
setCommentStart(commentStart);
setEscape(escape);
setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
setIgnoreEmptyLines(ignoreEmptyLines);
}
/**
 * Creates a strategy without an explicit escape character; escaping is
 * set to {@link #ESCAPE_DISABLED}.
 *
 * @param delimiter               the value delimiter
 * @param encapsulator            the encapsulator (quote) character
 * @param commentStart            the character that starts a comment line
 * @param ignoreLeadingWhitespace value for the ignore-leading-whitespaces flag
 * @param interpretUnicodeEscapes value for the unicode-escape-interpretation flag
 * @param ignoreEmptyLines        value for the ignore-empty-lines flag
 * @deprecated use the constructor that additionally takes the escape character
 */
public CSVStrategy(
    char delimiter,
    char encapsulator,
    char commentStart,
    boolean ignoreLeadingWhitespace,
    boolean interpretUnicodeEscapes,
    boolean ignoreEmptyLines)
{
    this(
        delimiter,
        encapsulator,
        commentStart,
        CSVStrategy.ESCAPE_DISABLED,
        ignoreLeadingWhitespace,
        interpretUnicodeEscapes,
        ignoreEmptyLines);
}
/**
 * Sets the delimiter character of this strategy.
 *
 * @param delimiter the new delimiter
 */
public void setDelimiter(char delimiter) {
    this.delimiter = delimiter;
}
/**
 * Returns the delimiter character of this strategy.
 *
 * @return the current delimiter
 */
public char getDelimiter() {
    return this.delimiter;
}
@ -81,6 +102,9 @@ public class CSVStrategy implements Cloneable, Serializable {
/**
 * Returns the character that starts a comment.
 *
 * @return the comment start character; equal to {@link #COMMENTS_DISABLED}
 *         when {@link #isCommentingDisabled()} is true
 */
public char getCommentStart() {
    return this.commentStart;
}
/**
 * Tells whether comments are switched off for this strategy.
 *
 * @return true if the comment start character equals {@link #COMMENTS_DISABLED}
 */
public boolean isCommentingDisabled() {
    return COMMENTS_DISABLED == this.commentStart;
}
/**
 * Sets the escape character of this strategy.
 *
 * @param escape the new escape character; the predefined strategies pass
 *               {@link #ESCAPE_DISABLED} here
 */
public void setEscape(char escape) {
    this.escape = escape;
}
/**
 * Returns the escape character of this strategy.
 *
 * @return the current escape character
 */
public char getEscape() {
    return this.escape;
}
/**
 * Sets the ignore-leading-whitespaces flag of this strategy.
 *
 * @param ignoreLeadingWhitespaces the new flag value
 */
public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) {
    this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces;
}
/**
 * Returns the ignore-leading-whitespaces flag of this strategy.
 *
 * @return the current flag value
 */
public boolean getIgnoreLeadingWhitespaces() {
    return this.ignoreLeadingWhitespaces;
}

View File

@ -182,9 +182,7 @@ public class CSVParserTest extends TestCase {
// encapsulator tokenizer (multi line, delimiter in string)
public void testNextToken5() throws IOException {
String code =
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\""
+ ",\"\\,\""
+ ",\"\"\"\"";
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
TestCSVParser parser = new TestCSVParser(new StringReader(code));
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
System.out.println("---------\n" + code + "\n-------------");
@ -193,11 +191,8 @@ public class CSVParserTest extends TestCase {
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;",
parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
// escape char in quoted input only escapes delimiter
assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken());
}
// change delimiters, comment, encapsulator
@ -207,7 +202,7 @@ public class CSVParserTest extends TestCase {
* !comment;;;;
* ;;
*/
String code = "a;'b and \\' more\n'\n!comment;;;;\n;;";
String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
TestCSVParser parser = new TestCSVParser(new StringReader(code));
parser.setStrategy( new CSVStrategy(';', '\'', '!') );
System.out.println("---------\n" + code + "\n-------------");
@ -226,8 +221,9 @@ public class CSVParserTest extends TestCase {
"a,b,c,d\n"
+ " a , b , 1 2 \n"
+ "\"foo baar\", b,\n"
+ " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
String[][] res = {
// + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
+ " \"foo\n,,\n\"\",,\n\"\"\",d,e\n"; // changed to use standard CSV escaping
String[][] res = {
{"a", "b", "c", "d"},
{"a", "b", "1 2"},
{"foo baar", "b", ""},
@ -439,7 +435,7 @@ public class CSVParserTest extends TestCase {
}
}
public void testBackslashEscaping() throws IOException {
public void OLDtestBackslashEscaping() throws IOException {
String code =
"one,two,three\n"
+ "on\\\"e,two\n"
@ -474,6 +470,49 @@ public class CSVParserTest extends TestCase {
}
}
/**
 * Parses input that uses an explicit escape character, both inside and
 * outside encapsulated values, and compares every record with the expected
 * values.
 */
public void testBackslashEscaping() throws IOException {

    // To avoid confusion over the need for escaping chars in java code,
    // We will test with a forward slash as the escape char, and a single
    // quote as the encapsulator.

    String code =
        "one,two,three\n" // 0
        + "'',''\n"       // 1) empty encapsulators
        + "/',/'\n"       // 2) single encapsulators
        + "'/'','/''\n"   // 3) single encapsulators encapsulated via escape
        + "'''',''''\n"   // 4) single encapsulators encapsulated via doubling
        + "/,,/,\n"       // 5) separator escaped
        + "//,//\n"       // 6) escape escaped
        + "'//','//'\n"   // 7) escape escaped in encapsulation
        + "";
    String[][] res = {
        { "one", "two", "three" }, // 0
        { "", "" },                // 1
        { "'", "'" },              // 2
        { "'", "'" },              // 3
        { "'", "'" },              // 4
        { ",", "," },              // 5
        { "/", "/" },              // 6
        { "/", "/" },              // 7
    };

    CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);

    CSVParser parser = new CSVParser(new StringReader(code), strategy);
    System.out.println("---------\n" + code + "\n-------------");
    String[][] tmp = parser.getAllValues();
    // Check the record count first so that a short result fails with a
    // readable assertion instead of an ArrayIndexOutOfBoundsException below.
    assertTrue(
        "expected at least " + res.length + " records but got " + tmp.length,
        tmp.length >= res.length);
    for (int i = 0; i < res.length; i++) {
        for (int j = 0; j < tmp[i].length; j++) {
            // A record with surplus columns must not break the diagnostics;
            // the mismatch is reported by the assertion after this loop.
            String expected = j < res[i].length ? "'" + res[i][j] + "'" : "<no such column>";
            System.out.println("'" + tmp[i][j] + "' should be " + expected);
        }
        assertTrue("record " + i + " differs", Arrays.equals(res[i], tmp[i]));
    }
}
public void testUnicodeEscape() throws IOException {
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
CSVParser parser = new CSVParser(new StringReader(code));

View File

@ -91,7 +91,7 @@ public class CSVStrategyTest extends TestCase {
// default settings
assertEquals(strategy.getDelimiter(), ',');
assertEquals(strategy.getEncapsulator(), '"');
assertEquals(strategy.getCommentStart(), '\0');
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
assertEquals(true, strategy.getIgnoreEmptyLines());
@ -99,7 +99,7 @@ public class CSVStrategyTest extends TestCase {
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
assertEquals(strategy.getDelimiter(), ',');
assertEquals(strategy.getEncapsulator(), '"');
assertEquals(strategy.getCommentStart(), '\0');
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
assertEquals(true, strategy.getIgnoreEmptyLines());
@ -109,7 +109,7 @@ public class CSVStrategyTest extends TestCase {
CSVStrategy strategy = CSVStrategy.EXCEL_STRATEGY;
assertEquals(strategy.getDelimiter(), ',');
assertEquals(strategy.getEncapsulator(), '"');
assertEquals(strategy.getCommentStart(), '\0');
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
assertEquals(false, strategy.getIgnoreLeadingWhitespaces());
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
assertEquals(false, strategy.getIgnoreEmptyLines());