CSV-58 Unescape handling needs rethinking
Fixed up most issues. TODO should TAB, FF and BACKSPACE be un/escaped? git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1478621 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e31980892c
commit
2c61208262
|
@ -160,7 +160,12 @@ final class CSVLexer extends Lexer {
|
||||||
tkn.type = TOKEN;
|
tkn.type = TOKEN;
|
||||||
break;
|
break;
|
||||||
} else if (isEscape(c)) {
|
} else if (isEscape(c)) {
|
||||||
tkn.content.append((char) readEscape());
|
final int unescaped = readEscape();
|
||||||
|
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
|
||||||
|
tkn.content.append((char) c).append((char) in.getLastChar());
|
||||||
|
} else {
|
||||||
|
tkn.content.append((char) unescaped);
|
||||||
|
}
|
||||||
c = in.read(); // continue
|
c = in.read(); // continue
|
||||||
} else {
|
} else {
|
||||||
tkn.content.append((char) c);
|
tkn.content.append((char) c);
|
||||||
|
@ -203,7 +208,12 @@ final class CSVLexer extends Lexer {
|
||||||
c = in.read();
|
c = in.read();
|
||||||
|
|
||||||
if (isEscape(c)) {
|
if (isEscape(c)) {
|
||||||
tkn.content.append((char) readEscape());
|
final int unescaped = readEscape();
|
||||||
|
if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
|
||||||
|
tkn.content.append((char) c).append((char) in.getLastChar());
|
||||||
|
} else {
|
||||||
|
tkn.content.append((char) unescaped);
|
||||||
|
}
|
||||||
} else if (isQuoteChar(c)) {
|
} else if (isQuoteChar(c)) {
|
||||||
if (isQuoteChar(in.lookAhead())) {
|
if (isQuoteChar(in.lookAhead())) {
|
||||||
// double or escaped encapsulator -> add single encapsulator to token
|
// double or escaped encapsulator -> add single encapsulator to token
|
||||||
|
|
|
@ -74,8 +74,18 @@ abstract class Lexer {
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO escape handling needs more work
|
// TODO escape handling needs more work
|
||||||
|
/**
|
||||||
|
* Handle an escape sequence.
|
||||||
|
* The current character must be the escape character.
|
||||||
|
* On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
|
||||||
|
* on the input stream.
|
||||||
|
*
|
||||||
|
* @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid.
|
||||||
|
* @throws IOException if there is a problem reading the stream or the end of stream is detected:
|
||||||
|
* the escape character is not allowed at end of strem
|
||||||
|
*/
|
||||||
int readEscape() throws IOException {
|
int readEscape() throws IOException {
|
||||||
// assume c is the escape char (normally a backslash)
|
// the escape char has just been read (normally a backslash)
|
||||||
final int c = in.read();
|
final int c = in.read();
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'r':
|
case 'r':
|
||||||
|
@ -88,10 +98,21 @@ abstract class Lexer {
|
||||||
return BACKSPACE;
|
return BACKSPACE;
|
||||||
case 'f':
|
case 'f':
|
||||||
return FF;
|
return FF;
|
||||||
|
case CR:
|
||||||
|
case LF:
|
||||||
|
case FF: // TODO is this correct?
|
||||||
|
case TAB: // TODO is this correct? Do tabs need to be escaped?
|
||||||
|
case BACKSPACE: // TODO is this correct?
|
||||||
|
return c;
|
||||||
case END_OF_STREAM:
|
case END_OF_STREAM:
|
||||||
throw new IOException("EOF whilst processing escape sequence");
|
throw new IOException("EOF whilst processing escape sequence");
|
||||||
default:
|
default:
|
||||||
return c;
|
// Now check for meta-characters
|
||||||
|
if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || isCommentStart(c)) {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
// indicate unexpected char - available from in.getLastChar()
|
||||||
|
return END_OF_STREAM;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,6 @@ import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -311,48 +310,42 @@ public class CSVLexerTest {
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test // TODO is this correct? Do we expect TAB to be un/escaped?
|
||||||
public void testEscapedTab() throws Exception {
|
public void testEscapedTab() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
|
final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test // TODO is this correct? Do we expect BACKSPACE to be un/escaped?
|
||||||
public void testEscapeBackspace() throws Exception {
|
public void testEscapeBackspace() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
|
final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test // TODO is this correct? Do we expect FF to be un/escaped?
|
||||||
public void testEscapeFF() throws Exception {
|
public void testEscapeFF() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
|
final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME this should work after CSV-58 is resolved. Currently the result will be "charactera\NEscaped"
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
|
||||||
public void testEscapedMySqlNullValue() throws Exception {
|
public void testEscapedMySqlNullValue() throws Exception {
|
||||||
// MySQL uses \N to symbolize null values. We have to restore this
|
// MySQL uses \N to symbolize null values. We have to restore this
|
||||||
final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
|
final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characteraEscaped"
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
|
||||||
public void testEscapedCharacter() throws Exception {
|
public void testEscapedCharacter() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
|
final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME this should work after CSV-58 is resolved. Currently the result will be "characterCREscaped"
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
|
||||||
public void testEscapedControlCharacter() throws Exception {
|
public void testEscapedControlCharacter() throws Exception {
|
||||||
// we are explicitly using an escape different from \ here, because \r is the character sequence for CR
|
// we are explicitly using an escape different from \ here
|
||||||
final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build());
|
final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build());
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character!rEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue