CSV-58 Unescape handling needs rethinking
Fixed up most issues. TODO should TAB, FF and BACKSPACE be un/escaped? git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1478621 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e31980892c
commit
2c61208262
|
@ -160,7 +160,12 @@ final class CSVLexer extends Lexer {
|
|||
tkn.type = TOKEN;
|
||||
break;
|
||||
} else if (isEscape(c)) {
|
||||
tkn.content.append((char) readEscape());
|
||||
final int unescaped = readEscape();
|
||||
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
|
||||
tkn.content.append((char) c).append((char) in.getLastChar());
|
||||
} else {
|
||||
tkn.content.append((char) unescaped);
|
||||
}
|
||||
c = in.read(); // continue
|
||||
} else {
|
||||
tkn.content.append((char) c);
|
||||
|
@ -203,7 +208,12 @@ final class CSVLexer extends Lexer {
|
|||
c = in.read();
|
||||
|
||||
if (isEscape(c)) {
|
||||
tkn.content.append((char) readEscape());
|
||||
final int unescaped = readEscape();
|
||||
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
|
||||
tkn.content.append((char) c).append((char) in.getLastChar());
|
||||
} else {
|
||||
tkn.content.append((char) unescaped);
|
||||
}
|
||||
} else if (isQuoteChar(c)) {
|
||||
if (isQuoteChar(in.lookAhead())) {
|
||||
// double or escaped encapsulator -> add single encapsulator to token
|
||||
|
|
|
@ -74,8 +74,18 @@ abstract class Lexer {
|
|||
}
|
||||
|
||||
// TODO escape handling needs more work
|
||||
/**
|
||||
* Handle an escape sequence.
|
||||
* The current character must be the escape character.
|
||||
* On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
|
||||
* on the input stream.
|
||||
*
|
||||
* @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid.
|
||||
* @throws IOException if there is a problem reading the stream or the end of stream is detected:
|
||||
* the escape character is not allowed at end of strem
|
||||
*/
|
||||
int readEscape() throws IOException {
|
||||
// assume c is the escape char (normally a backslash)
|
||||
// the escape char has just been read (normally a backslash)
|
||||
final int c = in.read();
|
||||
switch (c) {
|
||||
case 'r':
|
||||
|
@ -88,10 +98,21 @@ abstract class Lexer {
|
|||
return BACKSPACE;
|
||||
case 'f':
|
||||
return FF;
|
||||
case CR:
|
||||
case LF:
|
||||
case FF: // TODO is this correct?
|
||||
case TAB: // TODO is this correct? Do tabs need to be escaped?
|
||||
case BACKSPACE: // TODO is this correct?
|
||||
return c;
|
||||
case END_OF_STREAM:
|
||||
throw new IOException("EOF whilst processing escape sequence");
|
||||
default:
|
||||
return c;
|
||||
// Now check for meta-characters
|
||||
if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || isCommentStart(c)) {
|
||||
return c;
|
||||
}
|
||||
// indicate unexpected char - available from in.getLastChar()
|
||||
return END_OF_STREAM;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -36,7 +36,6 @@ import java.io.IOException;
|
|||
import java.io.StringReader;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
|
@ -311,48 +310,42 @@ public class CSVLexerTest {
|
|||
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test // TODO is this correct? Do we expect TAB to be un/escaped?
|
||||
public void testEscapedTab() throws Exception {
|
||||
final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
|
||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test // TODO is this correct? Do we expect BACKSPACE to be un/escaped?
|
||||
public void testEscapeBackspace() throws Exception {
|
||||
final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
|
||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Test // TODO is this correct? Do we expect FF to be un/escaped?
|
||||
public void testEscapeFF() throws Exception {
|
||||
final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
|
||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
|
||||
}
|
||||
|
||||
// FIXME this should work after CSV-58 is resolved. Currently the result will be "charactera\NEscaped"
|
||||
@Test
|
||||
@Ignore
|
||||
public void testEscapedMySqlNullValue() throws Exception {
|
||||
// MySQL uses \N to symbolize null values. We have to restore this
|
||||
final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
|
||||
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
|
||||
}
|
||||
|
||||
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characteraEscaped"
|
||||
@Test
|
||||
@Ignore
|
||||
public void testEscapedCharacter() throws Exception {
|
||||
final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
|
||||
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
|
||||
}
|
||||
|
||||
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characterCREscaped"
|
||||
@Test
|
||||
@Ignore
|
||||
public void testEscapedControlCharacter() throws Exception {
|
||||
// we are explicitly using an escape different from \ here, because \r is the character sequence for CR
|
||||
// we are explicitly using an escape different from \ here
|
||||
final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build());
|
||||
assertThat(lexer.nextToken(new Token()), hasContent("character!rEscaped"));
|
||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in New Issue