mirror of
https://github.com/apache/commons-csv.git
synced 2025-02-28 05:49:04 +00:00
SANDBOX-206: add escape to strategy, turn off backslash-style escaping by default
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609155 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f34ce7d093
commit
b55fb21d78
@ -134,7 +134,7 @@ public class CSVParser {
|
||||
* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
|
||||
*/
|
||||
public CSVParser(Reader input, char delimiter) {
|
||||
this(input, delimiter, '"', (char) 0);
|
||||
this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -347,7 +347,7 @@ public class CSVParser {
|
||||
eol = isEndOfLine(c);
|
||||
}
|
||||
// ok, start of token reached: comment, encapsulated, or token
|
||||
if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
|
||||
if (c == strategy.getCommentStart()) {
|
||||
// ignore everything till end of line and continue (incr linecount)
|
||||
in.readLine();
|
||||
tkn = nextToken(tkn.reset());
|
||||
@ -400,19 +400,22 @@ public class CSVParser {
|
||||
*/
|
||||
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
|
||||
wsBuf.clear();
|
||||
while (!tkn.isReady) {
|
||||
for (;;) {
|
||||
if (isEndOfLine(c)) {
|
||||
// end of record
|
||||
tkn.type = TT_EORECORD;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
} else if (isEndOfFile(c)) {
|
||||
// end of file
|
||||
tkn.type = TT_EOF;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
} else if (c == strategy.getDelimiter()) {
|
||||
// end of token
|
||||
tkn.type = TT_TOKEN;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
|
||||
// interpret unicode escaped chars (like \u0070 -> p)
|
||||
tkn.content.append((char) unicodeEscapeLexer(c));
|
||||
@ -422,6 +425,8 @@ public class CSVParser {
|
||||
if (tkn.content.length() > 0) {
|
||||
wsBuf.append((char) c);
|
||||
}
|
||||
} else if (c == strategy.getEscape()) {
|
||||
tkn.content.append((char)readEscape(c));
|
||||
} else {
|
||||
// prepend whitespaces (if we have)
|
||||
if (wsBuf.length() > 0) {
|
||||
@ -435,7 +440,6 @@ public class CSVParser {
|
||||
c = in.read();
|
||||
}
|
||||
}
|
||||
return tkn;
|
||||
}
|
||||
|
||||
|
||||
@ -457,70 +461,55 @@ public class CSVParser {
|
||||
int startLineNumber = getLineNumber();
|
||||
// ignore the given delimiter
|
||||
// assert c == delimiter;
|
||||
c = in.read();
|
||||
while (!tkn.isReady) {
|
||||
boolean skipRead = false;
|
||||
if (c == strategy.getEncapsulator() || c == '\\') {
|
||||
// check lookahead
|
||||
for (;;) {
|
||||
c = in.read();
|
||||
|
||||
if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
|
||||
tkn.content.append((char) unicodeEscapeLexer(c));
|
||||
} else if (c == strategy.getEscape()) {
|
||||
tkn.content.append((char)readEscape(c));
|
||||
} else if (c == strategy.getEncapsulator()) {
|
||||
if (in.lookAhead() == strategy.getEncapsulator()) {
|
||||
// double or escaped encapsulator -> add single encapsulator to token
|
||||
c = in.read();
|
||||
tkn.content.append((char) c);
|
||||
} else if (c == '\\' && in.lookAhead() == '\\') {
|
||||
// doubled escape char, it does not escape itself, only encapsulator
|
||||
// -> add both escape chars to stream
|
||||
tkn.content.append((char) c);
|
||||
c = in.read();
|
||||
tkn.content.append((char) c);
|
||||
} else if (
|
||||
strategy.getUnicodeEscapeInterpretation()
|
||||
&& c == '\\'
|
||||
&& in.lookAhead() == 'u') {
|
||||
// interpret unicode escaped chars (like \u0070 -> p)
|
||||
tkn.content.append((char) unicodeEscapeLexer(c));
|
||||
} else if (c == '\\') {
|
||||
// use a single escape character -> add it to stream
|
||||
tkn.content.append((char) c);
|
||||
} else {
|
||||
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
|
||||
while (!tkn.isReady) {
|
||||
for (;;) {
|
||||
c = in.read();
|
||||
if (c == strategy.getDelimiter()) {
|
||||
tkn.type = TT_TOKEN;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
} else if (isEndOfFile(c)) {
|
||||
tkn.type = TT_EOF;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
} else if (isEndOfLine(c)) {
|
||||
// ok eo token reached
|
||||
tkn.type = TT_EORECORD;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
} else if (!isWhitespace(c)) {
|
||||
// error invalid char between token and next delimiter
|
||||
throw new IOException(
|
||||
"(line " + getLineNumber()
|
||||
+ ") invalid char between encapsulated token end delimiter"
|
||||
);
|
||||
}
|
||||
// error invalid char between token and next delimiter
|
||||
throw new IOException(
|
||||
"(line " + getLineNumber()
|
||||
+ ") invalid char between encapsulated token end delimiter"
|
||||
);
|
||||
}
|
||||
}
|
||||
skipRead = true;
|
||||
}
|
||||
} else if (isEndOfFile(c)) {
|
||||
// error condition (end of file before end of token)
|
||||
throw new IOException(
|
||||
"(startline " + startLineNumber + ")"
|
||||
+ "eof reached before encapsulated token finished"
|
||||
);
|
||||
"(startline " + startLineNumber + ")"
|
||||
+ "eof reached before encapsulated token finished"
|
||||
);
|
||||
} else {
|
||||
// consume character
|
||||
tkn.content.append((char) c);
|
||||
}
|
||||
// get the next char
|
||||
if (!tkn.isReady && !skipRead) {
|
||||
c = in.read();
|
||||
}
|
||||
}
|
||||
return tkn;
|
||||
}
|
||||
|
||||
|
||||
@ -554,6 +543,21 @@ public class CSVParser {
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
private int readEscape(int c) throws IOException {
|
||||
// assume c is the escape char (normally a backslash)
|
||||
c = in.read();
|
||||
int out;
|
||||
switch (c) {
|
||||
case 'r': out='\r'; break;
|
||||
case 'n': out='\n'; break;
|
||||
case 't': out='\t'; break;
|
||||
case 'b': out='\b'; break;
|
||||
case 'f': out='\f'; break;
|
||||
default : out=c;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
// strategies
|
||||
|
@ -28,15 +28,21 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
private char delimiter;
|
||||
private char encapsulator;
|
||||
private char commentStart;
|
||||
private char escape;
|
||||
private boolean ignoreLeadingWhitespaces;
|
||||
private boolean interpretUnicodeEscapes;
|
||||
private boolean ignoreEmptyLines;
|
||||
|
||||
public static char COMMENTS_DISABLED = (char) 0;
|
||||
// -2 is used to signal disabled, because it won't be confused with
|
||||
// an EOF signal (-1), and because (char)-2 is U+FFFE, a Unicode
|
||||
// noncharacter: it fits in a single UTF-16 code unit but is never assigned
|
||||
// to real text, so there should never be a collision with a real text char.
|
||||
public static char COMMENTS_DISABLED = (char)-2;
|
||||
public static char ESCAPE_DISABLED = (char)-2;
|
||||
|
||||
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, true, false, true);
|
||||
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, false, false, false);
|
||||
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, true, false, true);
|
||||
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
|
||||
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false);
|
||||
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
|
||||
|
||||
|
||||
public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
|
||||
@ -58,7 +64,8 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
public CSVStrategy(
|
||||
char delimiter,
|
||||
char encapsulator,
|
||||
char commentStart,
|
||||
char commentStart,
|
||||
char escape,
|
||||
boolean ignoreLeadingWhitespace,
|
||||
boolean interpretUnicodeEscapes,
|
||||
boolean ignoreEmptyLines)
|
||||
@ -66,11 +73,25 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
setDelimiter(delimiter);
|
||||
setEncapsulator(encapsulator);
|
||||
setCommentStart(commentStart);
|
||||
setEscape(escape);
|
||||
setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
|
||||
setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
|
||||
setIgnoreEmptyLines(ignoreEmptyLines);
|
||||
}
|
||||
|
||||
/**
 * Creates a strategy with escaping turned off.
 * Delegates to the seven-argument constructor, passing
 * {@link #ESCAPE_DISABLED} as the escape character.
 *
 * @deprecated use the constructor that also takes an escape character
 */
public CSVStrategy(
    char delimiter,
    char encapsulator,
    char commentStart,
    boolean ignoreLeadingWhitespace,
    boolean interpretUnicodeEscapes,
    boolean ignoreEmptyLines)
{
    this(
        delimiter,
        encapsulator,
        commentStart,
        CSVStrategy.ESCAPE_DISABLED,
        ignoreLeadingWhitespace,
        interpretUnicodeEscapes,
        ignoreEmptyLines);
}
|
||||
|
||||
|
||||
/**
 * Sets the character that separates values within a record.
 *
 * @param delimiter the new delimiter character
 */
public void setDelimiter(char delimiter) {
    this.delimiter = delimiter;
}
|
||||
/**
 * Returns the character that separates values within a record.
 *
 * @return the current delimiter character
 */
public char getDelimiter() {
    return delimiter;
}
|
||||
|
||||
@ -81,6 +102,9 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
/**
 * Returns the character that starts a comment line.
 *
 * @return the comment start character; equals {@link #COMMENTS_DISABLED}
 *         when commenting is turned off
 */
public char getCommentStart() {
    return commentStart;
}
|
||||
/**
 * Tells whether comment handling is turned off.
 *
 * @return {@code true} if the comment start character is
 *         {@link #COMMENTS_DISABLED}
 */
public boolean isCommentingDisabled() {
    return COMMENTS_DISABLED == commentStart;
}
|
||||
|
||||
/**
 * Sets the escape character.
 *
 * @param escape the new escape character; pass {@link #ESCAPE_DISABLED}
 *               to turn escaping off
 */
public void setEscape(char escape) {
    this.escape = escape;
}
|
||||
/**
 * Returns the escape character.
 *
 * @return the current escape character; equals {@link #ESCAPE_DISABLED}
 *         when escaping is turned off
 */
public char getEscape() {
    return escape;
}
|
||||
|
||||
/**
 * Sets the flag that controls handling of leading whitespace.
 *
 * @param ignoreLeadingWhitespaces the new flag value
 */
public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) {
    this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces;
}
|
||||
/**
 * Returns the flag that controls handling of leading whitespace.
 *
 * @return the current flag value
 */
public boolean getIgnoreLeadingWhitespaces() {
    return ignoreLeadingWhitespaces;
}
|
||||
|
||||
|
@ -182,9 +182,7 @@ public class CSVParserTest extends TestCase {
|
||||
// encapsulator tokenizer (multi line, delimiter in string)
|
||||
public void testNextToken5() throws IOException {
|
||||
String code =
|
||||
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\""
|
||||
+ ",\"\\,\""
|
||||
+ ",\"\"\"\"";
|
||||
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
|
||||
TestCSVParser parser = new TestCSVParser(new StringReader(code));
|
||||
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
|
||||
System.out.println("---------\n" + code + "\n-------------");
|
||||
@ -193,11 +191,8 @@ public class CSVParserTest extends TestCase {
|
||||
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
|
||||
assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;",
|
||||
parser.testNextToken());
|
||||
assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
|
||||
assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
|
||||
// escape char in quoted input only escapes delimiter
|
||||
assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
|
||||
assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
|
||||
assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken());
|
||||
|
||||
}
|
||||
|
||||
// change delimiters, comment, encapsulater
|
||||
@ -207,7 +202,7 @@ public class CSVParserTest extends TestCase {
|
||||
* !comment;;;;
|
||||
* ;;
|
||||
*/
|
||||
String code = "a;'b and \\' more\n'\n!comment;;;;\n;;";
|
||||
String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
|
||||
TestCSVParser parser = new TestCSVParser(new StringReader(code));
|
||||
parser.setStrategy( new CSVStrategy(';', '\'', '!') );
|
||||
System.out.println("---------\n" + code + "\n-------------");
|
||||
@ -226,8 +221,9 @@ public class CSVParserTest extends TestCase {
|
||||
"a,b,c,d\n"
|
||||
+ " a , b , 1 2 \n"
|
||||
+ "\"foo baar\", b,\n"
|
||||
+ " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
|
||||
String[][] res = {
|
||||
// + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
|
||||
+ " \"foo\n,,\n\"\",,\n\"\"\",d,e\n"; // changed to use standard CSV escaping
|
||||
String[][] res = {
|
||||
{"a", "b", "c", "d"},
|
||||
{"a", "b", "1 2"},
|
||||
{"foo baar", "b", ""},
|
||||
@ -439,7 +435,7 @@ public class CSVParserTest extends TestCase {
|
||||
}
|
||||
}
|
||||
|
||||
public void testBackslashEscaping() throws IOException {
|
||||
public void OLDtestBackslashEscaping() throws IOException {
|
||||
String code =
|
||||
"one,two,three\n"
|
||||
+ "on\\\"e,two\n"
|
||||
@ -474,6 +470,49 @@ public class CSVParserTest extends TestCase {
|
||||
}
|
||||
}
|
||||
|
||||
public void testBackslashEscaping() throws IOException {
|
||||
|
||||
// To avoid confusion over the need for escaping chars in java code,
|
||||
// We will test with a forward slash as the escape char, and a single
|
||||
// quote as the encapsulator.
|
||||
|
||||
String code =
|
||||
"one,two,three\n" // 0
|
||||
+ "'',''\n" // 1) empty encapsulators
|
||||
+ "/',/'\n" // 2) single encapsulators
|
||||
+ "'/'','/''\n" // 3) single encapsulators encapsulated via escape
|
||||
+ "'''',''''\n" // 4) single encapsulators encapsulated via doubling
|
||||
+ "/,,/,\n" // 5) separator escaped
|
||||
+ "//,//\n" // 6) escape escaped
|
||||
+ "'//','//'\n" // 7) escape escaped in encapsulation
|
||||
+ "";
|
||||
String[][] res = {
|
||||
{ "one", "two", "three" }, // 0
|
||||
{ "", "" }, // 1
|
||||
{ "'", "'" }, // 2
|
||||
{ "'", "'" }, // 3
|
||||
{ "'", "'" }, // 4
|
||||
{ ",", "," }, // 5
|
||||
{ "/", "/" }, // 6
|
||||
{ "/", "/" }, // 7
|
||||
};
|
||||
|
||||
|
||||
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
|
||||
|
||||
CSVParser parser = new CSVParser(new StringReader(code), strategy);
|
||||
System.out.println("---------\n" + code + "\n-------------");
|
||||
String[][] tmp = parser.getAllValues();
|
||||
assertTrue(tmp.length > 0);
|
||||
for (int i = 0; i < res.length; i++) {
|
||||
for (int j = 0; j < tmp[i].length; j++) {
|
||||
System.out.println("'" + tmp[i][j] + "' should be '" + res[i][j] + "'");
|
||||
}
|
||||
assertTrue(Arrays.equals(res[i], tmp[i]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testUnicodeEscape() throws IOException {
|
||||
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
|
||||
CSVParser parser = new CSVParser(new StringReader(code));
|
||||
|
@ -91,7 +91,7 @@ public class CSVStrategyTest extends TestCase {
|
||||
// default settings
|
||||
assertEquals(strategy.getDelimiter(), ',');
|
||||
assertEquals(strategy.getEncapsulator(), '"');
|
||||
assertEquals(strategy.getCommentStart(), '\0');
|
||||
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
|
||||
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
|
||||
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
|
||||
assertEquals(true, strategy.getIgnoreEmptyLines());
|
||||
@ -99,7 +99,7 @@ public class CSVStrategyTest extends TestCase {
|
||||
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
|
||||
assertEquals(strategy.getDelimiter(), ',');
|
||||
assertEquals(strategy.getEncapsulator(), '"');
|
||||
assertEquals(strategy.getCommentStart(), '\0');
|
||||
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
|
||||
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
|
||||
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
|
||||
assertEquals(true, strategy.getIgnoreEmptyLines());
|
||||
@ -109,7 +109,7 @@ public class CSVStrategyTest extends TestCase {
|
||||
CSVStrategy strategy = CSVStrategy.EXCEL_STRATEGY;
|
||||
assertEquals(strategy.getDelimiter(), ',');
|
||||
assertEquals(strategy.getEncapsulator(), '"');
|
||||
assertEquals(strategy.getCommentStart(), '\0');
|
||||
assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
|
||||
assertEquals(false, strategy.getIgnoreLeadingWhitespaces());
|
||||
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
|
||||
assertEquals(false, strategy.getIgnoreEmptyLines());
|
||||
|
Loading…
x
Reference in New Issue
Block a user