CSV-58 Unescape handling needs rethinking

Fixed up most issues.
TODO should TAB, FF and BACKSPACE be un/escaped?

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1478621 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sebastian Bazley 2013-05-03 01:10:34 +00:00
parent e31980892c
commit 2c61208262
3 changed files with 40 additions and 16 deletions

View File

@ -160,7 +160,12 @@ final class CSVLexer extends Lexer {
tkn.type = TOKEN; tkn.type = TOKEN;
break; break;
} else if (isEscape(c)) { } else if (isEscape(c)) {
tkn.content.append((char) readEscape()); final int unescaped = readEscape();
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
tkn.content.append((char) c).append((char) in.getLastChar());
} else {
tkn.content.append((char) unescaped);
}
c = in.read(); // continue c = in.read(); // continue
} else { } else {
tkn.content.append((char) c); tkn.content.append((char) c);
@ -203,7 +208,12 @@ final class CSVLexer extends Lexer {
c = in.read(); c = in.read();
if (isEscape(c)) { if (isEscape(c)) {
tkn.content.append((char) readEscape()); final int unescaped = readEscape();
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
tkn.content.append((char) c).append((char) in.getLastChar());
} else {
tkn.content.append((char) unescaped);
}
} else if (isQuoteChar(c)) { } else if (isQuoteChar(c)) {
if (isQuoteChar(in.lookAhead())) { if (isQuoteChar(in.lookAhead())) {
// double or escaped encapsulator -> add single encapsulator to token // double or escaped encapsulator -> add single encapsulator to token

View File

@ -74,8 +74,18 @@ abstract class Lexer {
} }
// TODO escape handling needs more work // TODO escape handling needs more work
/**
* Handle an escape sequence.
* The current character must be the escape character.
* On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
* on the input stream.
*
* @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid.
* @throws IOException if there is a problem reading the stream or the end of stream is detected:
* the escape character is not allowed at end of strem
*/
int readEscape() throws IOException { int readEscape() throws IOException {
// assume c is the escape char (normally a backslash) // the escape char has just been read (normally a backslash)
final int c = in.read(); final int c = in.read();
switch (c) { switch (c) {
case 'r': case 'r':
@ -88,10 +98,21 @@ abstract class Lexer {
return BACKSPACE; return BACKSPACE;
case 'f': case 'f':
return FF; return FF;
case CR:
case LF:
case FF: // TODO is this correct?
case TAB: // TODO is this correct? Do tabs need to be escaped?
case BACKSPACE: // TODO is this correct?
return c;
case END_OF_STREAM: case END_OF_STREAM:
throw new IOException("EOF whilst processing escape sequence"); throw new IOException("EOF whilst processing escape sequence");
default: default:
return c; // Now check for meta-characters
if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || isCommentStart(c)) {
return c;
}
// indicate unexpected char - available from in.getLastChar()
return END_OF_STREAM;
} }
} }

View File

@ -36,7 +36,6 @@ import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import org.junit.Before; import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
/** /**
@ -311,48 +310,42 @@ public class CSVLexerTest {
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped")); assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
} }
@Test @Test // TODO is this correct? Do we expect TAB to be un/escaped?
public void testEscapedTab() throws Exception { public void testEscapedTab() throws Exception {
final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping); final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped")); assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
} }
@Test @Test // TODO is this correct? Do we expect BACKSPACE to be un/escaped?
public void testEscapeBackspace() throws Exception { public void testEscapeBackspace() throws Exception {
final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping); final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped")); assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
} }
@Test @Test // TODO is this correct? Do we expect FF to be un/escaped?
public void testEscapeFF() throws Exception { public void testEscapeFF() throws Exception {
final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping); final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped")); assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
} }
// FIXME this should work after CSV-58 is resolved. Currently the result will be "charactera\NEscaped"
@Test @Test
@Ignore
public void testEscapedMySqlNullValue() throws Exception { public void testEscapedMySqlNullValue() throws Exception {
// MySQL uses \N to symbolize null values. We have to restore this // MySQL uses \N to symbolize null values. We have to restore this
final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping); final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped")); assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
} }
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characteraEscaped"
@Test @Test
@Ignore
public void testEscapedCharacter() throws Exception { public void testEscapedCharacter() throws Exception {
final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping); final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped")); assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
} }
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characterCREscaped"
@Test @Test
@Ignore
public void testEscapedControlCharacter() throws Exception { public void testEscapedControlCharacter() throws Exception {
// we are explicitly using an escape different from \ here, because \r is the character sequence for CR // we are explicitly using an escape different from \ here
final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build()); final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build());
assertThat(lexer.nextToken(new Token()), hasContent("character!rEscaped")); assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
} }
@Test @Test