SANDBOX-206: add escape to strategy, turn off backslash-style escaping by default

git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609155 13f79535-47bb-0310-9956-ffa450edef68
2025-02-28 05:49:04 +00:00 · 2008-01-05 15:37:26 +00:00 · 2008-01-05 15:37:26 +00:00 · b55fb21d78
commit b55fb21d78
parent f34ce7d093
4 changed files with 127 additions and 60 deletions
--- a/src/java/org/apache/commons/csv/CSVParser.java
+++ b/src/java/org/apache/commons/csv/CSVParser.java
@ -134,7 +134,7 @@ public class CSVParser {
   * @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
   */
  public CSVParser(Reader input, char delimiter) {
-    this(input, delimiter, '"', (char) 0);
+    this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
  }
  
  /**
@ -347,7 +347,7 @@ public class CSVParser {
        eol = isEndOfLine(c);
      }
      // ok, start of token reached: comment, encapsulated, or token
-      if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
+      if (c == strategy.getCommentStart()) {
        // ignore everything till end of line and continue (incr linecount)
        in.readLine();
        tkn = nextToken(tkn.reset());
@ -400,19 +400,22 @@ public class CSVParser {
   */
  private Token simpleTokenLexer(Token tkn, int c) throws IOException {
    wsBuf.clear();
-    while (!tkn.isReady) {
+    for (;;) {
      if (isEndOfLine(c)) {
        // end of record
        tkn.type = TT_EORECORD;
        tkn.isReady = true;
+        return tkn;
      } else if (isEndOfFile(c)) {
        // end of file
        tkn.type = TT_EOF;
        tkn.isReady = true;
+        return tkn;
      } else if (c == strategy.getDelimiter()) {
        // end of token
        tkn.type = TT_TOKEN;
        tkn.isReady = true;
+        return tkn;
      } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
        // interpret unicode escaped chars (like \u0070 -> p)
        tkn.content.append((char) unicodeEscapeLexer(c));
@ -422,6 +425,8 @@ public class CSVParser {
        if (tkn.content.length() > 0) {
          wsBuf.append((char) c);
        }
+      } else if (c == strategy.getEscape()) {
+        tkn.content.append((char)readEscape(c));
      } else {
        // prepend whitespaces (if we have)
        if (wsBuf.length() > 0) {
@ -435,7 +440,6 @@ public class CSVParser {
        c = in.read();
      }
    }
-    return tkn;
  }
  
  
@ -457,70 +461,55 @@ public class CSVParser {
    int startLineNumber = getLineNumber();
    // ignore the given delimiter
    // assert c == delimiter;
-    c = in.read();
-    while (!tkn.isReady) {
-      boolean skipRead = false;
-      if (c == strategy.getEncapsulator() || c == '\\') {
-        // check lookahead
+    for (;;) {
+      c = in.read();
+
+      if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
+        tkn.content.append((char) unicodeEscapeLexer(c));
+      } else if (c == strategy.getEscape()) {
+        tkn.content.append((char)readEscape(c));
+      } else if (c == strategy.getEncapsulator()) {
        if (in.lookAhead() == strategy.getEncapsulator()) {
          // double or escaped encapsulator -> add single encapsulator to token
          c = in.read();
          tkn.content.append((char) c);
-        } else if (c == '\\' && in.lookAhead() == '\\') {
-          // doubled escape char, it does not escape itself, only encapsulator 
-          // -> add both escape chars to stream
-          tkn.content.append((char) c);
-          c = in.read();
-          tkn.content.append((char) c);
-        } else if (
-          strategy.getUnicodeEscapeInterpretation()
-          && c == '\\' 
-          && in.lookAhead() == 'u') {
-          // interpret unicode escaped chars (like \u0070 -> p)
-          tkn.content.append((char) unicodeEscapeLexer(c));
-        } else if (c == '\\') {
-          // use a single escape character -> add it to stream
-          tkn.content.append((char) c);
        } else {
          // token finish mark (encapsulator) reached: ignore whitespace till delimiter
-          while (!tkn.isReady) {
+          for (;;) {
            c = in.read();
            if (c == strategy.getDelimiter()) {
              tkn.type = TT_TOKEN;
              tkn.isReady = true;
+              return tkn;
            } else if (isEndOfFile(c)) {
              tkn.type = TT_EOF;
              tkn.isReady = true;
+              return tkn;
            } else if (isEndOfLine(c)) {
              // ok eo token reached
              tkn.type = TT_EORECORD;
              tkn.isReady = true;
+              return tkn;
            } else if (!isWhitespace(c)) {
-                // error invalid char between token and next delimiter
-                throw new IOException(
-                  "(line " + getLineNumber() 
-                  + ") invalid char between encapsulated token end delimiter"
-                );
-              }
+              // error invalid char between token and next delimiter
+              throw new IOException(
+                      "(line " + getLineNumber()
+                              + ") invalid char between encapsulated token end delimiter"
+              );
+            }
          }
-          skipRead = true;
        }
      } else if (isEndOfFile(c)) {
        // error condition (end of file before end of token)
        throw new IOException(
-          "(startline " + startLineNumber + ")"
-          + "eof reached before encapsulated token finished"
-          );
+                "(startline " + startLineNumber + ")"
+                        + "eof reached before encapsulated token finished"
+        );
      } else {
        // consume character
        tkn.content.append((char) c);
      }
-      // get the next char
-      if (!tkn.isReady && !skipRead) {
-        c = in.read();
-      }
    }
-    return tkn;
  }
  
  
@ -554,6 +543,21 @@ public class CSVParser {
    }
    return ret;
  }
+
+  private int readEscape(int c) throws IOException {
+    // assume c is the escape char (normally a backslash)
+    c = in.read();
+    int out;
+    switch (c) {
+      case 'r': out='\r'; break;
+      case 'n': out='\n'; break;
+      case 't': out='\t'; break;
+      case 'b': out='\b'; break;
+      case 'f': out='\f'; break;
+      default : out=c;
+    }
+    return out;
+  }
  
  // ======================================================
  //  strategies
--- a/src/java/org/apache/commons/csv/CSVStrategy.java
+++ b/src/java/org/apache/commons/csv/CSVStrategy.java
@ -28,15 +28,21 @@ public class CSVStrategy implements Cloneable, Serializable {
    private char delimiter;
    private char encapsulator;
    private char commentStart;
+    private char escape;
    private boolean ignoreLeadingWhitespaces;
    private boolean interpretUnicodeEscapes;
    private boolean ignoreEmptyLines;

-    public static char COMMENTS_DISABLED       = (char) 0;
+    // -2 is used to signal disabled, because it won't be confused with
+    // an EOF signal (-1), and because (char)-2 is \ufffe, which Unicode
+    // reserves as a noncharacter (a single UTF-16 code unit, not a surrogate
+    // pair), so there should never be a collision with a real text char.
+    public static char COMMENTS_DISABLED       = (char)-2;
+    public static char ESCAPE_DISABLED         = (char)-2;

-    public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, true,  false, true);
-    public static CSVStrategy EXCEL_STRATEGY   = new CSVStrategy(',', '"', COMMENTS_DISABLED, false, false, false);
-    public static CSVStrategy TDF_STRATEGY     = new CSVStrategy('	', '"', COMMENTS_DISABLED, true,  false, true);
+    public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true,  false, true);
+    public static CSVStrategy EXCEL_STRATEGY   = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false);
+    public static CSVStrategy TDF_STRATEGY     = new CSVStrategy('	', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true,  false, true);


    public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
@ -58,7 +64,8 @@ public class CSVStrategy implements Cloneable, Serializable {
    public CSVStrategy(
        char delimiter, 
        char encapsulator, 
-        char commentStart, 
+        char commentStart,
+        char escape,
        boolean ignoreLeadingWhitespace, 
        boolean interpretUnicodeEscapes,
        boolean ignoreEmptyLines) 
@ -66,11 +73,25 @@ public class CSVStrategy implements Cloneable, Serializable {
        setDelimiter(delimiter);
        setEncapsulator(encapsulator);
        setCommentStart(commentStart);
+        setEscape(escape);
        setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
        setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
        setIgnoreEmptyLines(ignoreEmptyLines);
    }

+    /** @deprecated use the constructor that also takes an escape char; this one passes {@link #ESCAPE_DISABLED}. */
+    public CSVStrategy(
+        char delimiter,
+        char encapsulator,
+        char commentStart,
+        boolean ignoreLeadingWhitespace,
+        boolean interpretUnicodeEscapes,
+        boolean ignoreEmptyLines)
+    {
+        this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines);
+    }
+
+
    public void setDelimiter(char delimiter) { this.delimiter = delimiter; }
    public char getDelimiter() { return this.delimiter; }

@ -81,6 +102,9 @@ public class CSVStrategy implements Cloneable, Serializable {
    public char getCommentStart() { return this.commentStart; }
    public boolean isCommentingDisabled() { return this.commentStart == COMMENTS_DISABLED; }

+    public void setEscape(char escape) { this.escape = escape; }
+    public char getEscape() { return this.escape; }
+
    public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; }
    public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; }

--- a/src/test/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/org/apache/commons/csv/CSVParserTest.java
@ -182,9 +182,7 @@ public class CSVParserTest extends TestCase {
  // encapsulator tokenizer (multi line, delimiter in string)
  public void testNextToken5() throws IOException {   
    String code = 
-      "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\",\"\\\"\""
-      + ",\"\\,\"" 
-      + ",\"\"\"\"";
+      "a,\"foo\n\",b\n\"foo\n  baar ,,,\"\n\"\n\t \n\"";
    TestCSVParser parser = new TestCSVParser(new StringReader(code));
    parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
    System.out.println("---------\n" + code + "\n-------------");
@ -193,11 +191,8 @@ public class CSVParserTest extends TestCase {
    assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
    assertEquals(CSVParser.TT_EORECORD + ";foo\n  baar ,,,;",
        parser.testNextToken());
-    assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
-    assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
-    // escape char in quoted input only escapes delimiter
-    assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
-    assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
+    assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken());
+
  }
  
  // change delimiters, comment, encapsulater
@ -207,7 +202,7 @@ public class CSVParserTest extends TestCase {
     *       !comment;;;;
     *       ;;
     */
-    String code = "a;'b and \\' more\n'\n!comment;;;;\n;;";
+    String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
    TestCSVParser parser = new TestCSVParser(new StringReader(code));
    parser.setStrategy( new CSVStrategy(';', '\'', '!') );
    System.out.println("---------\n" + code + "\n-------------");
@ -226,8 +221,9 @@ public class CSVParserTest extends TestCase {
    "a,b,c,d\n"
    + " a , b , 1 2 \n"
    + "\"foo baar\", b,\n"
-    + "   \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
-  String[][] res = { 
+   // + "   \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
+      + "   \"foo\n,,\n\"\",,\n\"\"\",d,e\n";   // changed to use standard CSV escaping
+  String[][] res = {
    {"a", "b", "c", "d"},
    {"a", "b", "1 2"}, 
    {"foo baar", "b", ""}, 
@ -439,7 +435,7 @@ public class CSVParserTest extends TestCase {
    }
  }
  
-  public void testBackslashEscaping() throws IOException {
+  public void OLDtestBackslashEscaping() throws IOException {
    String code =
      "one,two,three\n"
      + "on\\\"e,two\n"
@ -474,6 +470,49 @@ public class CSVParserTest extends TestCase {
    }
  }
  
+  public void testBackslashEscaping() throws IOException {
+
+    // To avoid confusion over the need for escaping chars in java code,
+    // we will test with a forward slash as the escape char, and a single
+    // quote as the encapsulator.
+
+    String code =
+      "one,two,three\n" // 0
+      + "'',''\n"       // 1) empty encapsulators
+      + "/',/'\n"       // 2) single encapsulators
+      + "'/'','/''\n"   // 3) single encapsulators encapsulated via escape
+      + "'''',''''\n"   // 4) single encapsulators encapsulated via doubling
+      + "/,,/,\n"       // 5) separator escaped
+      + "//,//\n"       // 6) escape escaped
+      + "'//','//'\n"   // 7) escape escaped in encapsulation
+      + "";
+    String[][] res = {
+        { "one", "two", "three" }, // 0
+        { "", "" },                // 1
+        { "'", "'" },              // 2
+        { "'", "'" },              // 3
+        { "'", "'" },              // 4
+        { ",", "," },              // 5
+        { "/", "/" },              // 6
+        { "/", "/" },              // 7
+      };
+
+
+    CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
+
+    CSVParser parser = new CSVParser(new StringReader(code), strategy);
+    System.out.println("---------\n" + code + "\n-------------");
+    String[][] tmp = parser.getAllValues();
+    assertTrue(tmp.length > 0);
+    for (int i = 0; i < res.length; i++) {
+      for (int j = 0; j < tmp[i].length; j++) {
+        System.out.println("'" + tmp[i][j] + "'  should be '" + res[i][j] + "'");
+      }
+      assertTrue(Arrays.equals(res[i], tmp[i]));
+    }
+  }
+
+
    public void testUnicodeEscape() throws IOException {
      String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
      CSVParser parser = new CSVParser(new StringReader(code));
--- a/src/test/org/apache/commons/csv/CSVStrategyTest.java
+++ b/src/test/org/apache/commons/csv/CSVStrategyTest.java
@ -91,7 +91,7 @@ public class CSVStrategyTest extends TestCase {
    // default settings
    assertEquals(strategy.getDelimiter(), ',');
    assertEquals(strategy.getEncapsulator(), '"');
-    assertEquals(strategy.getCommentStart(), '\0');
+    assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
    assertEquals(true,  strategy.getIgnoreLeadingWhitespaces());
    assertEquals(false, strategy.getUnicodeEscapeInterpretation());
    assertEquals(true,  strategy.getIgnoreEmptyLines());
@ -99,7 +99,7 @@ public class CSVStrategyTest extends TestCase {
    parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
    assertEquals(strategy.getDelimiter(), ',');
    assertEquals(strategy.getEncapsulator(), '"');
-    assertEquals(strategy.getCommentStart(), '\0');
+    assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
    assertEquals(true,  strategy.getIgnoreLeadingWhitespaces());
    assertEquals(false, strategy.getUnicodeEscapeInterpretation());
    assertEquals(true,  strategy.getIgnoreEmptyLines());
@ -109,7 +109,7 @@ public class CSVStrategyTest extends TestCase {
    CSVStrategy strategy = CSVStrategy.EXCEL_STRATEGY;
    assertEquals(strategy.getDelimiter(), ',');
    assertEquals(strategy.getEncapsulator(), '"');
-    assertEquals(strategy.getCommentStart(), '\0');
+    assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
    assertEquals(false,  strategy.getIgnoreLeadingWhitespaces());
    assertEquals(false, strategy.getUnicodeEscapeInterpretation());
    assertEquals(false, strategy.getIgnoreEmptyLines());