From 2c6120826245f89fedf2f936ab4a0c3edd8717f3 Mon Sep 17 00:00:00 2001
From: Sebastian Bazley <sebb@apache.org>
Date: Fri, 3 May 2013 01:10:34 +0000
Subject: [PATCH] CSV-58 Unescape handling needs rethinking Fixed up most
 issues. TODO should TAB, FF and BACKSPACE be un/escaped?

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1478621 13f79535-47bb-0310-9956-ffa450edef68
---
 .../java/org/apache/commons/csv/CSVLexer.java | 14 +++++++++--
 .../java/org/apache/commons/csv/Lexer.java    | 25 +++++++++++++++++--
 .../org/apache/commons/csv/CSVLexerTest.java  | 17 ++++---------
 3 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/src/main/java/org/apache/commons/csv/CSVLexer.java b/src/main/java/org/apache/commons/csv/CSVLexer.java
index 6baae593..90b9c589 100644
--- a/src/main/java/org/apache/commons/csv/CSVLexer.java
+++ b/src/main/java/org/apache/commons/csv/CSVLexer.java
@@ -160,7 +160,12 @@ final class CSVLexer extends Lexer {
                 tkn.type = TOKEN;
                 break;
             } else if (isEscape(c)) {
-                tkn.content.append((char) readEscape());
+                final int unescaped = readEscape();
+                if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
+                    tkn.content.append((char) c).append((char) in.getLastChar());
+                } else {
+                    tkn.content.append((char) unescaped);
+                }
                 c = in.read(); // continue
             } else {
                 tkn.content.append((char) c);
@@ -203,7 +208,12 @@ final class CSVLexer extends Lexer {
             c = in.read();
 
             if (isEscape(c)) {
-                tkn.content.append((char) readEscape());
+                final int unescaped = readEscape();
+                if (unescaped == Constants.END_OF_STREAM) { // unexpected char after escape
+                    tkn.content.append((char) c).append((char) in.getLastChar());
+                } else {
+                    tkn.content.append((char) unescaped);
+                }
             } else if (isQuoteChar(c)) {
                 if (isQuoteChar(in.lookAhead())) {
                     // double or escaped encapsulator -> add single encapsulator to token
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
index d0fc7717..af997456 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -74,8 +74,18 @@ abstract class Lexer {
     }
 
     // TODO escape handling needs more work
+    /**
+     * Handle an escape sequence.
+     * The current character must be the escape character.
+     * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
+     * on the input stream.
+     * 
+     * @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid. 
+     * @throws IOException if there is a problem reading the stream or the end of stream is detected: 
+     * the escape character is not allowed at end of strem
+     */
     int readEscape() throws IOException {
-        // assume c is the escape char (normally a backslash)
+        // the escape char has just been read (normally a backslash)
         final int c = in.read();
         switch (c) {
         case 'r':
@@ -88,10 +98,21 @@ abstract class Lexer {
             return BACKSPACE;
         case 'f':
             return FF;
+        case CR:
+        case LF:
+        case FF: // TODO is this correct?
+        case TAB: // TODO is this correct? Do tabs need to be escaped?
+        case BACKSPACE: // TODO is this correct?
+            return c;
         case END_OF_STREAM:
             throw new IOException("EOF whilst processing escape sequence");
         default:
-            return c;
+            // Now check for meta-characters
+            if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) || isCommentStart(c)) {
+                return c;
+            }
+            // indicate unexpected char - available from in.getLastChar()
+            return END_OF_STREAM;
         }
     }
 
diff --git a/src/test/java/org/apache/commons/csv/CSVLexerTest.java b/src/test/java/org/apache/commons/csv/CSVLexerTest.java
index c7c58cae..a4ac74ff 100644
--- a/src/test/java/org/apache/commons/csv/CSVLexerTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVLexerTest.java
@@ -36,7 +36,6 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -311,48 +310,42 @@ public class CSVLexerTest {
         assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
     }
 
-    @Test
+    @Test // TODO is this correct? Do we expect TAB to be un/escaped?
     public void testEscapedTab() throws Exception {
         final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
         assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
     }
 
-    @Test
+    @Test // TODO is this correct? Do we expect BACKSPACE to be un/escaped?
     public void testEscapeBackspace() throws Exception {
         final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
         assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
     }
 
-    @Test
+    @Test // TODO is this correct? Do we expect FF to be un/escaped?
     public void testEscapeFF() throws Exception {
         final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
         assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
     }
 
-    // FIXME this should work after CSV-58 is resolved. Currently the result will be "charactera\NEscaped"
     @Test
-    @Ignore
     public void testEscapedMySqlNullValue() throws Exception {
         // MySQL uses \N to symbolize null values. We have to restore this
         final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
         assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
     }
 
-    // FIXME this should work after CSV-58 is resolved. Currently the result will be "characteraEscaped"
     @Test
-    @Ignore
     public void testEscapedCharacter() throws Exception {
         final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
         assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
     }
 
-    // FIXME this should work after CSV-58 is resolved. Currently the result will be "characterCREscaped"
     @Test
-    @Ignore
     public void testEscapedControlCharacter() throws Exception {
-        // we are explicitly using an escape different from \ here, because \r is the character sequence for CR
+        // we are explicitly using an escape different from \ here
         final Lexer lexer = getLexer("character!rEscaped", CSVFormat.newBuilder().withEscape('!').build());
-        assertThat(lexer.nextToken(new Token()), hasContent("character!rEscaped"));
+        assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
     }
 
     @Test