CSV-58 Unescape handling needs rethinking

Fixed up most issues. TODO should TAB, FF and BACKSPACE be un/escaped? git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1478621 13f79535-47bb-0310-9956-ffa450edef68
2013-05-03 01:10:34 +00:00 · 2013-05-03 01:10:34 +00:00 · 2c61208262
parent e31980892c
commit 2c61208262
3 changed files with 40 additions and 16 deletions
--- a/src/main/java/org/apache/commons/csv/CSVLexer.java
+++ b/src/main/java/org/apache/commons/csv/CSVLexer.java
@ -160,7 +160,12 @@ final class CSVLexer extends Lexer {
                tkn.type = TOKEN;
                break;
            } else if (isEscape(c)) {
-                tkn.content.append((char) readEscape());
+                final int unescaped = readEscape();
                if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
                    tkn.content.append((char) c).append((char) in.getLastChar());
                } else {
                    tkn.content.append((char) unescaped);
                }
                c = in.read(); // continue
            } else {
                tkn.content.append((char) c);
@ -203,7 +208,12 @@ final class CSVLexer extends Lexer {
            c = in.read();
            if (isEscape(c)) {
-                tkn.content.append((char) readEscape());
+                final int unescaped = readEscape();
                if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
                    tkn.content.append((char) c).append((char) in.getLastChar());
                } else {
                    tkn.content.append((char) unescaped);
                }
            } else if (isQuoteChar(c)) {
                if (isQuoteChar(in.lookAhead())) {
                    // double or escaped encapsulator -> add single encapsulator to token
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@ -74,8 +74,18 @@ abstract class Lexer {
    }
    // TODO escape handling needs more work
    /**
     * Handle an escape sequence.
     * The current character must be the escape character.
     * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
     * on the input stream.
     * 
     * @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid. 
     * @throws IOException if there is a problem reading the stream or the end of stream is detected: 
     * the escape character is not allowed at end of strem
     */
    int readEscape() throws IOException {
-        // assume c is the escape char (normally a backslash)
+        // the escape char has just been read (normally a backslash)
        final int c = in.read();
        switch (c) {
        case 'r':
@ -88,11 +98,22 @@ abstract class Lexer {
            return BACKSPACE;
        case 'f':
            return FF;
        case CR:
        case LF:
        case FF: // TODO is this correct?
        case TAB: // TODO is this correct? Do tabs need to be escaped?
        case BACKSPACE: // TODO is this correct?
            return c;
        case END_OF_STREAM:
            throw new IOException("EOF whilst processing escape sequence");
        default:
            // Now check for meta-characters
            if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || isCommentStart(c)) {
                return c;
            }
            // indicate unexpected char - available from in.getLastChar()
            return END_OF_STREAM;
        }
    }
    void trimTrailingSpaces(final StringBuilder buffer) {
--- a/src/test/java/org/apache/commons/csv/CSVLexerTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVLexerTest.java
@ -36,7 +36,6 @@ import java.io.IOException;
 import java.io.StringReader;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
 /**
@ -311,48 +310,42 @@ public class CSVLexerTest {
        assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
    }
-    @Test
+    @Test // TODO is this correct? Do we expect TAB to be un/escaped?
    public void testEscapedTab() throws Exception {
        final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
        assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
    }
-    @Test
+    @Test // TODO is this correct? Do we expect BACKSPACE to be un/escaped?
    public void testEscapeBackspace() throws Exception {
        final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
        assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
    }
-    @Test
+    @Test // TODO is this correct? Do we expect FF to be un/escaped?
    public void testEscapeFF() throws Exception {
        final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
        assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
    }
    // FIXME this should work after CSV-58 is resolved. Currently the result will be "charactera\NEscaped"
    @Test
    @Ignore
    public void testEscapedMySqlNullValue() throws Exception {
        // MySQL uses \N to symbolize null values. We have to restore this
        final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
        assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
    }
    // FIXME this should work after CSV-58 is resolved. Currently the result will be "characteraEscaped"
    @Test
    @Ignore
    public void testEscapedCharacter() throws Exception {
        final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
        assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
    }
    // FIXME this should work after CSV-58 is resolved. Currently the result will be "characterCREscaped"
    @Test
    @Ignore
    public void testEscapedControlCharacter() throws Exception {
-        // we are explicitly using an escape different from \ here, because \r is the character sequence for CR
+        // we are explicitly using an escape different from \ here
        final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build());
-        assertThat(lexer.nextToken(new Token()), hasContent("character!rEscaped"));
+        assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
    }
    @Test