A different take on PR #303

Add support for trailing text after the closing quote, and EOF without a final closing quote, for Excel compatibility. Fix a unit test and add a RAT exclude for the sample CSV file.
2024-03-12 19:41:43 -04:00 · 2024-03-12 19:41:43 -04:00 · c137e80407
parent b069c2dcf9
commit c137e80407
6 changed files with 170 additions and 63 deletions
--- a/pom.xml
+++ b/pom.xml
@ -23,7 +23,7 @@
    <version>67</version>
  </parent>
  <artifactId>commons-csv</artifactId>
-  <version>1.10.1-SNAPSHOT</version>
+  <version>1.11.0-SNAPSHOT</version>
  <name>Apache Commons CSV</name>
  <url>https://commons.apache.org/proper/commons-csv/</url>
  <inceptionYear>2005</inceptionYear>
@ -161,7 +161,7 @@
  </distributionManagement>

  <properties>
-    <commons.release.version>1.10.1</commons.release.version>
+    <commons.release.version>1.11.0</commons.release.version>
    <commons.release.desc>(Java 8 or above)</commons.release.desc>
    <!-- The RC version used in the staging repository URL. -->
    <commons.rc.version>RC1</commons.rc.version>
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@ -42,7 +42,9 @@
  <body>
    <release version="1.10.1" date="YYYY-MM-DD" description="Feature and bug fix release (Java 8 or above)">
      <!-- ADD -->
-      <action issue="CSV-308" type="fix" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
+      <action issue="CSV-308" type="add" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
+      <action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add and use CSVFormat#setTrailingData(boolean) in CSVFormat.EXCEL for Excel compatibility #303.</action>
+      <action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add and use CSVFormat#setLenientEof(boolean) in CSVFormat.EXCEL for Excel compatibility #303.</action>
      <!-- FIX -->
      <action type="fix" issue="CSV-306" dev="ggregory" due-to="Sam Ng, Bruno P. Kinoshita">Replace deprecated method in user guide, update external link #324, #325.</action>
      <action type="fix" dev="ggregory" due-to="Seth Falco, Bruno P. Kinoshita">Document duplicate header behavior #309.</action>
@ -53,6 +55,7 @@
      <action type="fix" issue="CSV-311" dev="ggregory" due-to="Christian Feuersaenger, Gary Gregory">OutOfMemory for very long rows despite using column value of type Reader.</action>
      <action type="fix" dev="ggregory" due-to="Gary Gregory">Use try-with-resources to manage JDBC Clob in CSVPrinter.printRecords(ResultSet).</action>
      <action type="fix" dev="ggregory" due-to="Gary Gregory">JDBC Blob columns are now output as Base64 instead of Object#toString(), which usually is InputStream#toString().</action>
+      <action type="fix" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Support unusual Excel use cases: Add support for trailing data after the closing quote, and EOF without a final closing quote #303.</action>
      <!-- UPDATE -->
      <action type="update" dev="ggregory" due-to="Gary Gregory">Bump commons-io:commons-io: from 2.11.0 to 2.15.1.</action>
      <action type="update" dev="ggregory" due-to="Gary Gregory, Dependabot">Bump commons-parent from 57 to 67.</action>
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@ -248,6 +248,10 @@ public final class CSVFormat implements Serializable {

        private boolean skipHeaderRecord;

+        private boolean lenientEof;
+
+        private boolean trailingData;
+
        private boolean trailingDelimiter;

        private boolean trim;
@ -267,6 +271,8 @@ public final class CSVFormat implements Serializable {
            this.headers = csvFormat.headers;
            this.skipHeaderRecord = csvFormat.skipHeaderRecord;
            this.ignoreHeaderCase = csvFormat.ignoreHeaderCase;
+            this.lenientEof = csvFormat.lenientEof;
+            this.trailingData = csvFormat.trailingData;
            this.trailingDelimiter = csvFormat.trailingDelimiter;
            this.trim = csvFormat.trim;
            this.autoFlush = csvFormat.autoFlush;
@ -689,6 +695,18 @@ public final class CSVFormat implements Serializable {
            return this;
        }

+        /**
+         * Sets whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
+         *
+         * @param lenientEof whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
+         * @return This instance.
+         * @since 1.11.0
+         */
+        public Builder setLenientEof(final boolean lenientEof) {
+            this.lenientEof = lenientEof;
+            return this;
+        }
+
        /**
         * Sets the String to convert to and from {@code null}. No substitution occurs if {@code null}.
         *
@ -785,6 +803,18 @@ public final class CSVFormat implements Serializable {
            return this;
        }

+        /**
+         * Sets whether reading trailing data is allowed in records, helps Excel compatibility.
+         *
+         * @param trailingData whether reading trailing data is allowed in records, helps Excel compatibility.
+         * @return This instance.
+         * @since 1.11.0
+         */
+        public Builder setTrailingData(final boolean trailingData) {
+            this.trailingData = trailingData;
+            return this;
+        }
+
        /**
         * Sets whether to add a trailing delimiter.
         *
@ -914,7 +944,7 @@ public final class CSVFormat implements Serializable {
     * @see Predefined#Default
     */
    public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
-            false, false, false, DuplicateHeaderMode.ALLOW_ALL);
+            false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false);

    /**
     * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale-dependent, it might be necessary
@ -935,9 +965,11 @@ public final class CSVFormat implements Serializable {
     * <li>{@code setDelimiter(',')}</li>
     * <li>{@code setQuote('"')}</li>
     * <li>{@code setRecordSeparator("\r\n")}</li>
+     * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
     * <li>{@code setIgnoreEmptyLines(false)}</li>
     * <li>{@code setAllowMissingColumnNames(true)}</li>
-     * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
+     * <li>{@code setTrailingData(true)}</li>
+     * <li>{@code setLenientEof(true)}</li>
     * </ul>
     * <p>
     * Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
@ -950,6 +982,8 @@ public final class CSVFormat implements Serializable {
    public static final CSVFormat EXCEL = DEFAULT.builder()
            .setIgnoreEmptyLines(false)
            .setAllowMissingColumnNames(true)
+            .setTrailingData(true)
+            .setLenientEof(true)
            .build();
    // @formatter:on

@ -1372,7 +1406,7 @@ public final class CSVFormat implements Serializable {
     */
    public static CSVFormat newFormat(final char delimiter) {
        return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
-                DuplicateHeaderMode.ALLOW_ALL);
+                DuplicateHeaderMode.ALLOW_ALL, false, false);
    }

    static String[] toStringArray(final Object[] values) {
@ -1455,6 +1489,10 @@ public final class CSVFormat implements Serializable {

    private final boolean skipHeaderRecord;

+    private final boolean lenientEof;
+
+    private final boolean trailingData;
+
    private final boolean trailingDelimiter;

    private final boolean trim;
@ -1474,6 +1512,8 @@ public final class CSVFormat implements Serializable {
        this.headers = builder.headers;
        this.skipHeaderRecord = builder.skipHeaderRecord;
        this.ignoreHeaderCase = builder.ignoreHeaderCase;
+        this.lenientEof = builder.lenientEof;
+        this.trailingData = builder.trailingData;
        this.trailingDelimiter = builder.trailingDelimiter;
        this.trim = builder.trim;
        this.autoFlush = builder.autoFlush;
@ -1494,22 +1534,24 @@ public final class CSVFormat implements Serializable {
     * @param ignoreEmptyLines        {@code true} when the parser should skip empty lines.
     * @param recordSeparator         the line separator to use for output.
     * @param nullString              the line separator to use for output.
-     * @param headerComments          the comments to be printed by the Printer before the actual CSV data.
-     * @param header                  the header
-     * @param skipHeaderRecord        if {@code true} the header row will be skipped
-     * @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line
-     * @param ignoreHeaderCase        if {@code true} header names will be accessed ignoring case when parsing input
-     * @param trim                    if {@code true} next record value will be trimmed
-     * @param trailingDelimiter       if {@code true} the trailing delimiter wil be added before record separator (if set)
-     * @param autoFlush               if {@code true} the underlying stream will be flushed before closing
-     * @param duplicateHeaderMode     the behavior when handling duplicate headers
+     * @param headerComments          the comments to be printed by the Printer before the actual CSV data..
+     * @param header                  the header.
+     * @param skipHeaderRecord        if {@code true} the header row will be skipped.
+     * @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line.
+     * @param ignoreHeaderCase        if {@code true} header names will be accessed ignoring case when parsing input.
+     * @param trim                    if {@code true} next record value will be trimmed.
+     * @param trailingDelimiter       if {@code true} the trailing delimiter wil be added before record separator (if set)..
+     * @param autoFlush               if {@code true} the underlying stream will be flushed before closing.
+     * @param duplicateHeaderMode     the behavior when handling duplicate headers.
+     * @param trailingData            whether reading trailing data is allowed in records, helps Excel compatibility.
+     * @param lenientEof              whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
     * @throws IllegalArgumentException if the delimiter is a line break character.
     */
    private CSVFormat(final String delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape,
            final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
            final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
            final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
-            final DuplicateHeaderMode duplicateHeaderMode) {
+            final DuplicateHeaderMode duplicateHeaderMode, final boolean trailingData, final boolean lenientEof) {
        this.delimiter = delimiter;
        this.quoteCharacter = quoteChar;
        this.quoteMode = quoteMode;
@ -1524,6 +1566,8 @@ public final class CSVFormat implements Serializable {
        this.headers = clone(header);
        this.skipHeaderRecord = skipHeaderRecord;
        this.ignoreHeaderCase = ignoreHeaderCase;
+        this.lenientEof = lenientEof;
+        this.trailingData = trailingData;
        this.trailingDelimiter = trailingDelimiter;
        this.trim = trim;
        this.autoFlush = autoFlush;
@ -1571,18 +1615,23 @@ public final class CSVFormat implements Serializable {
        if (this == obj) {
            return true;
        }
-        if (obj == null || getClass() != obj.getClass()) {
+        if (obj == null) {
+            return false;
+        }
+        if (getClass() != obj.getClass()) {
            return false;
        }
        final CSVFormat other = (CSVFormat) obj;
-        return duplicateHeaderMode == other.duplicateHeaderMode && allowMissingColumnNames == other.allowMissingColumnNames &&
-                autoFlush == other.autoFlush && Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
-                Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(headers, other.headers) &&
-                Arrays.equals(headerComments, other.headerComments) && ignoreEmptyLines == other.ignoreEmptyLines &&
-                ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces &&
-                Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
-                Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
-                skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
+        return allowMissingColumnNames == other.allowMissingColumnNames && autoFlush == other.autoFlush &&
+                Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
+                duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) &&
+                Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) &&
+                ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase &&
+                ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof &&
+                Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) &&
+                quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) &&
+                Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord &&
+                trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
    }

    private void escape(final char c, final Appendable appendable) throws IOException {
@ -1808,6 +1857,16 @@ public final class CSVFormat implements Serializable {
        return ignoreSurroundingSpaces;
    }

+    /**
+     * Gets whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
+     *
+     * @return whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
+     * @since 1.11.0
+     */
+    public boolean getLenientEof() {
+        return lenientEof;
+    }
+
    /**
     * Gets the String to convert to and from {@code null}.
     * <ul>
@ -1857,6 +1916,16 @@ public final class CSVFormat implements Serializable {
        return skipHeaderRecord;
    }

+    /**
+     * Gets whether reading trailing data is allowed in records, helps Excel compatibility.
+     *
+     * @return whether reading trailing data is allowed in records, helps Excel compatibility.
+     * @since 1.11.0
+     */
+    public boolean getTrailingData() {
+        return trailingData;
+    }
+
    /**
     * Gets whether to add a trailing delimiter.
     *
@ -1881,11 +1950,12 @@ public final class CSVFormat implements Serializable {
    public int hashCode() {
        final int prime = 31;
        int result = 1;
-        result = prime * result + Arrays.hashCode(headers);
        result = prime * result + Arrays.hashCode(headerComments);
-        return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter,
-                ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator,
-                skipHeaderRecord, trailingDelimiter, trim);
+        result = prime * result + Arrays.hashCode(headers);
+        result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter,
+                ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString,
+                recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim);
+        return result;
    }

    /**
@ -2006,6 +2076,26 @@ public final class CSVFormat implements Serializable {
        return new CSVPrinter(new OutputStreamWriter(new FileOutputStream(out), charset), this);
    }

+    private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
+        // InputStream is never null here
+        // There is nothing to escape when quoting is used which is the default.
+        if (!newRecord) {
+            append(getDelimiterString(), out);
+        }
+        final boolean quoteCharacterSet = isQuoteCharacterSet();
+        if (quoteCharacterSet) {
+            append(getQuoteCharacter().charValue(), out);
+        }
+        // Stream the input to the output without reading or holding the whole value in memory.
+        // AppendableOutputStream cannot "close" an Appendable.
+        try (OutputStream outputStream = new Base64OutputStream(new AppendableOutputStream<>(out))) {
+            IOUtils.copy(inputStream, outputStream);
+        }
+        if (quoteCharacterSet) {
+            append(getQuoteCharacter().charValue(), out);
+        }
+    }
+
    /**
     * Prints the {@code value} as the next value on the line to {@code out}. The value will be escaped or encapsulated as needed. Useful when one wants to
     * avoid creating CSVPrinters. Trims the value if {@link #getTrim()} is true.
@ -2081,26 +2171,6 @@ public final class CSVFormat implements Serializable {
        return print(Files.newBufferedWriter(out, charset));
    }

-    private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
-        // InputStream is never null here
-        // There is nothing to escape when quoting is used which is the default.
-        if (!newRecord) {
-            append(getDelimiterString(), out);
-        }
-        final boolean quoteCharacterSet = isQuoteCharacterSet();
-        if (quoteCharacterSet) {
-            append(getQuoteCharacter().charValue(), out);
-        }
-        // Stream the input to the output without reading or holding the whole value in memory.
-        // AppendableOutputStream cannot "close" an Appendable.
-        try (OutputStream outputStream = new Base64OutputStream(new AppendableOutputStream<>(out))) {
-            IOUtils.copy(inputStream, outputStream);
-        }
-        if (quoteCharacterSet) {
-            append(getQuoteCharacter().charValue(), out);
-        }
-    }
-
    private void print(final Reader reader, final Appendable out, final boolean newRecord) throws IOException {
        // Reader is never null here
        if (!newRecord) {
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@ -53,9 +53,10 @@ final class Lexer implements Closeable {
    private final char escape;
    private final char quoteChar;
    private final char commentStart;
-
    private final boolean ignoreSurroundingSpaces;
    private final boolean ignoreEmptyLines;
+    private final boolean lenientEof;
+    private final boolean trailingData;

    /** The input stream */
    private final ExtendedBufferedReader reader;
@ -71,6 +72,8 @@ final class Lexer implements Closeable {
        this.commentStart = mapNullToDisabled(format.getCommentMarker());
        this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
        this.ignoreEmptyLines = format.getIgnoreEmptyLines();
+        this.lenientEof = format.getLenientEof();
+        this.trailingData = format.getTrailingData();
        this.delimiterBuf = new char[delimiter.length - 1];
        this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
    }
@ -364,14 +367,21 @@ final class Lexer implements Closeable {
                            token.type = EORECORD;
                            return token;
                        }
-                        if (!Character.isWhitespace((char)c)) {
+                        if (trailingData) {
+                            token.content.append((char) c);
+                        } else if (!Character.isWhitespace((char) c)) {
                            // error invalid char between token and next delimiter
-                            throw new IOException("Invalid char between encapsulated token and delimiter at line: " +
-                                    getCurrentLineNumber() + ", position: " + getCharacterPosition());
+                            throw new IOException(String.format("Invalid char between encapsulated token and delimiter at line: %,d, position: %,d",
+                                    getCurrentLineNumber(), getCharacterPosition()));
                        }
                    }
                }
            } else if (isEndOfFile(c)) {
+                if (lenientEof) {
+                    token.type = Token.Type.EOF;
+                    token.isReady = true; // There is data at EOF
+                    return token;
+                }
                // error condition (end of file before end of token)
                throw new IOException("(startline " + startLineNumber +
                        ") EOF reached before encapsulated token finished");
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@ -299,7 +299,6 @@ public class CSVParserTest {
    }

    @Test
-    @Disabled("PR 295 does not work")
    public void testCSV141Excel() throws Exception {
        testCSV141Ok(CSVFormat.EXCEL);
    }
@ -357,16 +356,15 @@ public class CSVParserTest {
            record = parser.nextRecord();
            assertEquals("1414770318327", record.get(0));
            assertEquals("android.widget.EditText", record.get(1));
-            assertEquals("pass sem1", record.get(2));
-            assertEquals(3, record.size());
-            // row 4
+            assertEquals("pass sem1\n1414770318628\"", record.get(2));
+            assertEquals("android.widget.EditText", record.get(3));
+            assertEquals("pass sem1 _84*|*", record.get(4));
+            assertEquals("0", record.get(5));
+            assertEquals("pass sem1\n", record.get(6));
+            assertEquals(7, record.size());
+            // EOF
            record = parser.nextRecord();
-            assertEquals("1414770318628", record.get(0));
-            assertEquals("android.widget.EditText", record.get(1));
-            assertEquals("pass sem1 _84*|*", record.get(2));
-            assertEquals("0", record.get(3));
-            assertEquals("pass sem1", record.get(4));
-            assertEquals(5, record.size());
+            assertNull(record);
        }
    }

--- a/src/test/java/org/apache/commons/csv/LexerTest.java
+++ b/src/test/java/org/apache/commons/csv/LexerTest.java
@ -196,6 +196,19 @@ public class LexerTest {
        }
    }

+    @Test
+    public void testEOFWithoutClosingQuote() throws Exception {
+        final String code = "a,\"b";
+        try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).build())) {
+            assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
+            assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
+        }
+        try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).build())) {
+            assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
+            assertThrows(IOException.class, () -> parser.nextToken(new Token()));
+        }
+    }
+
    @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
    public void testEscapedBackspace() throws Exception {
        try (final Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
@ -423,6 +436,19 @@ public class LexerTest {
        }
    }

+    @Test
+    public void testTrailingTextAfterQuote() throws Exception {
+        final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
+        try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).build())) {
+            assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
+            assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
+            assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
+        }
+        try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).build())) {
+            assertThrows(IOException.class, () -> parser.nextToken(new Token()));
+        }
+    }
+
    @Test
    public void testTrimTrailingSpacesZeroLength() throws Exception {
        final StringBuilder buffer = new StringBuilder("");