From 2c6120826245f89fedf2f936ab4a0c3edd8717f3 Mon Sep 17 00:00:00 2001 From: Sebastian Bazley Date: Fri, 3 May 2013 01:10:34 +0000 Subject: [PATCH] CSV-58 Unescape handling needs rethinking Fixed up most issues. TODO should TAB, FF and BACKSPACE be un/escaped? git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1478621 13f79535-47bb-0310-9956-ffa450edef68 --- .../java/org/apache/commons/csv/CSVLexer.java | 14 +++++++++-- .../java/org/apache/commons/csv/Lexer.java | 25 +++++++++++++++++-- .../org/apache/commons/csv/CSVLexerTest.java | 17 ++++--------- 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVLexer.java b/src/main/java/org/apache/commons/csv/CSVLexer.java index 6baae593..90b9c589 100644 --- a/src/main/java/org/apache/commons/csv/CSVLexer.java +++ b/src/main/java/org/apache/commons/csv/CSVLexer.java @@ -160,7 +160,12 @@ final class CSVLexer extends Lexer { tkn.type = TOKEN; break; } else if (isEscape(c)) { - tkn.content.append((char) readEscape()); + final int unescaped = readEscape(); + if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape + tkn.content.append((char) c).append((char) in.getLastChar()); + } else { + tkn.content.append((char) unescaped); + } c = in.read(); // continue } else { tkn.content.append((char) c); @@ -203,7 +208,12 @@ final class CSVLexer extends Lexer { c = in.read(); if (isEscape(c)) { - tkn.content.append((char) readEscape()); + final int unescaped = readEscape(); + if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape + tkn.content.append((char) c).append((char) in.getLastChar()); + } else { + tkn.content.append((char) unescaped); + } } else if (isQuoteChar(c)) { if (isQuoteChar(in.lookAhead())) { // double or escaped encapsulator -> add single encapsulator to token diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index d0fc7717..af997456 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -74,8 +74,18 @@ abstract class Lexer { } // TODO escape handling needs more work + /** + * Handle an escape sequence. + * The current character must be the escape character. + * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()} + * on the input stream. + * + * @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid. + * @throws IOException if there is a problem reading the stream or the end of stream is detected: + * the escape character is not allowed at end of strem + */ int readEscape() throws IOException { - // assume c is the escape char (normally a backslash) + // the escape char has just been read (normally a backslash) final int c = in.read(); switch (c) { case 'r': @@ -88,10 +98,21 @@ abstract class Lexer { return BACKSPACE; case 'f': return FF; + case CR: + case LF: + case FF: // TODO is this correct? + case TAB: // TODO is this correct? Do tabs need to be escaped? + case BACKSPACE: // TODO is this correct? + return c; case END_OF_STREAM: throw new IOException("EOF whilst processing escape sequence"); default: - return c; + // Now check for meta-characters + if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || isCommentStart(c)) { + return c; + } + // indicate unexpected char - available from in.getLastChar() + return END_OF_STREAM; } } diff --git a/src/test/java/org/apache/commons/csv/CSVLexerTest.java b/src/test/java/org/apache/commons/csv/CSVLexerTest.java index c7c58cae..a4ac74ff 100644 --- a/src/test/java/org/apache/commons/csv/CSVLexerTest.java +++ b/src/test/java/org/apache/commons/csv/CSVLexerTest.java @@ -36,7 +36,6 @@ import java.io.IOException; import java.io.StringReader; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; /** @@ -311,48 +310,42 @@ public class CSVLexerTest { assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped")); } - @Test + @Test // TODO is this correct? Do we expect TAB to be un/escaped? public void testEscapedTab() throws Exception { final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping); assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped")); } - @Test + @Test // TODO is this correct? Do we expect BACKSPACE to be un/escaped? public void testEscapeBackspace() throws Exception { final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping); assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped")); } - @Test + @Test // TODO is this correct? Do we expect FF to be un/escaped? public void testEscapeFF() throws Exception { final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping); assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped")); } - // FIXME this should work after CSV-58 is resolved. Currently the result will be "charactera\NEscaped" @Test - @Ignore public void testEscapedMySqlNullValue() throws Exception { // MySQL uses \N to symbolize null values. We have to restore this final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping); assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped")); } - // FIXME this should work after CSV-58 is resolved. Currently the result will be "characteraEscaped" @Test - @Ignore public void testEscapedCharacter() throws Exception { final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping); assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped")); } - // FIXME this should work after CSV-58 is resolved. Currently the result will be "characterCREscaped" @Test - @Ignore public void testEscapedControlCharacter() throws Exception { - // we are explicitly using an escape different from \ here, because \r is the character sequence for CR + // we are explicitly using an escape different from \ here final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build()); - assertThat(lexer.nextToken(new Token()), hasContent("character!rEscaped")); + assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped")); } @Test