From ed0ca2232105f6df1f3fc68762f03c7595dc20c2 Mon Sep 17 00:00:00 2001 From: Damjan Jovanovic Date: Tue, 27 Dec 2022 04:38:29 +0200 Subject: [PATCH 1/2] Add support for trailing text after the closing quote, for Excel compatibility. --- .../org/apache/commons/csv/CSVFormat.java | 48 ++++++++++++++++--- .../java/org/apache/commons/csv/Lexer.java | 14 ++++-- .../org/apache/commons/csv/LexerTest.java | 13 +++++ 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 81b6f193..1cdd73c4 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -208,6 +208,8 @@ public final class CSVFormat implements Serializable { private boolean allowMissingColumnNames; + private boolean allowTrailingText; + private boolean autoFlush; private Character commentMarker; @@ -264,6 +266,7 @@ public final class CSVFormat implements Serializable { this.autoFlush = csvFormat.autoFlush; this.quotedNullString = csvFormat.quotedNullString; this.duplicateHeaderMode = csvFormat.duplicateHeaderMode; + this.allowTrailingText = csvFormat.allowTrailingText; } /** @@ -301,6 +304,20 @@ public final class CSVFormat implements Serializable { return this; } + /** + * Sets whether to allow trailing text in a quoted field, after the closing quote. + * + * @param allowTrailingText the trailing text behavior, {@code true} to append that text to the field contents, {@code false} to throw + * an {@link IOException}. + * + * @return This instance. + * @since 1.10.0 + */ + public Builder setAllowTrailingText(final boolean allowTrailingText) { + this.allowTrailingText = allowTrailingText; + return this; + } + /** * Sets whether to flush on close. 
* @@ -810,7 +827,7 @@ public final class CSVFormat implements Serializable { * @see Predefined#Default */ public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false, - false, false, false, DuplicateHeaderMode.ALLOW_ALL); + false, false, false, DuplicateHeaderMode.ALLOW_ALL, false); /** * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary @@ -834,6 +851,7 @@ public final class CSVFormat implements Serializable { *
<li>{@code setIgnoreEmptyLines(false)}</li>
      * <li>{@code setAllowMissingColumnNames(true)}</li>
      * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
+     * <li>{@code setAllowTrailingText(true)}</li>
      * </ul>
      * <p>

    * Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and @@ -846,6 +864,7 @@ public final class CSVFormat implements Serializable { public static final CSVFormat EXCEL = DEFAULT.builder() .setIgnoreEmptyLines(false) .setAllowMissingColumnNames(true) + .setAllowTrailingText(true) .build(); // @formatter:on @@ -1268,7 +1287,7 @@ public final class CSVFormat implements Serializable { */ public static CSVFormat newFormat(final char delimiter) { return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, - DuplicateHeaderMode.ALLOW_ALL); + DuplicateHeaderMode.ALLOW_ALL, false); } static String[] toStringArray(final Object[] values) { @@ -1312,6 +1331,8 @@ public final class CSVFormat implements Serializable { private final boolean allowMissingColumnNames; + private final boolean allowTrailingText; + private final boolean autoFlush; private final Character commentMarker; // null if commenting is disabled @@ -1366,6 +1387,7 @@ public final class CSVFormat implements Serializable { this.autoFlush = builder.autoFlush; this.quotedNullString = builder.quotedNullString; this.duplicateHeaderMode = builder.duplicateHeaderMode; + this.allowTrailingText = builder.allowTrailingText; validate(); } @@ -1396,7 +1418,7 @@ public final class CSVFormat implements Serializable { final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush, - final DuplicateHeaderMode duplicateHeaderMode) { + final DuplicateHeaderMode duplicateHeaderMode, final boolean allowTrailingText) { this.delimiter = delimiter; 
this.quoteCharacter = quoteChar; this.quoteMode = quoteMode; @@ -1416,6 +1438,7 @@ public final class CSVFormat implements Serializable { this.autoFlush = autoFlush; this.quotedNullString = quoteCharacter + nullString + quoteCharacter; this.duplicateHeaderMode = duplicateHeaderMode; + this.allowTrailingText = allowTrailingText; validate(); } @@ -1469,7 +1492,8 @@ public final class CSVFormat implements Serializable { ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && - skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim; + skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim && + allowTrailingText == other.allowTrailingText; } /** @@ -1512,6 +1536,16 @@ public final class CSVFormat implements Serializable { return allowMissingColumnNames; } + /** + * Gets whether quoted fields allow trailing text after the closing quote. + * + * @return {@code true} if allowed, {@code false} to throw an {@link IOException}. + * @since 1.10.0 + */ + public boolean getAllowTrailingText() { + return allowTrailingText; + } + /** * Gets whether to flush on close. 
* @@ -1692,9 +1726,9 @@ public final class CSVFormat implements Serializable { int result = 1; result = prime * result + Arrays.hashCode(headers); result = prime * result + Arrays.hashCode(headerComments); - return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter, - ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, - skipHeaderRecord, trailingDelimiter, trim); + return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, allowTrailingText, autoFlush, commentMarker, delimiter, + escapeCharacter, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, + recordSeparator, skipHeaderRecord, trailingDelimiter, trim); } /** diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 06b2c9c2..fd60b5ac 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -57,6 +57,7 @@ final class Lexer implements Closeable { private final boolean ignoreSurroundingSpaces; private final boolean ignoreEmptyLines; + private final boolean allowTrailingText; /** The input stream */ private final ExtendedBufferedReader reader; @@ -72,6 +73,7 @@ final class Lexer implements Closeable { this.commentStart = mapNullToDisabled(format.getCommentMarker()); this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); this.ignoreEmptyLines = format.getIgnoreEmptyLines(); + this.allowTrailingText = format.getAllowTrailingText(); this.delimiterBuf = new char[delimiter.length - 1]; this.escapeDelimiterBuf = new char[2 * delimiter.length - 1]; } @@ -364,10 +366,14 @@ final class Lexer implements Closeable { token.type = EORECORD; return token; } - if (!Character.isWhitespace((char)c)) { - // error invalid char between token and next 
delimiter - throw new IOException("(line " + getCurrentLineNumber() + - ") invalid char between encapsulated token and delimiter"); + if (allowTrailingText) { + token.content.append((char) c); + } else { + if (!Character.isWhitespace((char)c)) { + // error invalid char between token and next delimiter + throw new IOException("(line " + getCurrentLineNumber() + + ") invalid char between encapsulated token and delimiter"); + } } } } diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index cc8d728a..7edc7d86 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -431,4 +431,17 @@ public class LexerTest { lexer.trimTrailingSpaces(buffer); assertThat(lexer.nextToken(new Token()), matches(EOF, "")); } + + @Test + public void testTrailingTextAfterQuote() throws Exception { + final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\""; + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(true).build())) { + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b")); + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b")); + assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\"")); + } + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(false).build())) { + assertThrows(IOException.class, () -> lexer.nextToken(new Token())); + } + } } From d0ea9e3a000aa358a4960df6cfc8abd735a3d165 Mon Sep 17 00:00:00 2001 From: Damjan Jovanovic Date: Wed, 4 Jan 2023 20:23:36 +0200 Subject: [PATCH 2/2] Add a setting that controls whether the last field on the last line, if quoted, has to have a closing quote before the file ends. 
--- .../org/apache/commons/csv/CSVFormat.java | 46 ++++++++++++++++--- .../java/org/apache/commons/csv/Lexer.java | 14 ++++-- .../org/apache/commons/csv/LexerTest.java | 15 +++++- 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 1cdd73c4..280f99f2 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -206,6 +206,8 @@ public final class CSVFormat implements Serializable { return new Builder(csvFormat); } + private boolean allowEofWithoutClosingQuote; + private boolean allowMissingColumnNames; private boolean allowTrailingText; @@ -267,6 +269,7 @@ public final class CSVFormat implements Serializable { this.quotedNullString = csvFormat.quotedNullString; this.duplicateHeaderMode = csvFormat.duplicateHeaderMode; this.allowTrailingText = csvFormat.allowTrailingText; + this.allowEofWithoutClosingQuote = csvFormat.allowEofWithoutClosingQuote; } /** @@ -291,6 +294,19 @@ public final class CSVFormat implements Serializable { return this; } + /** + * Sets whether the last field on the last line, if quoted, can have no closing quote when the file ends, {@code true} if this is ok, + * {@code false} if {@link IOException} should be thrown. + * + * @param allowEofWithoutClosingQuote whether to allow the last field on the last line to have a missing closing quote when the file ends, + * {@code true} if so, or {@code false} to cause an {@link IOException} to be thrown. + * @since 1.10.0 + */ + public Builder setAllowEofWithoutClosingQuote(final boolean allowEofWithoutClosingQuote) { + this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote; + return this; + } + /** * Sets the parser missing column names behavior, {@code true} to allow missing column names in the header line, {@code false} to cause an * {@link IllegalArgumentException} to be thrown. 
@@ -827,7 +843,7 @@ public final class CSVFormat implements Serializable { * @see Predefined#Default */ public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false, - false, false, false, DuplicateHeaderMode.ALLOW_ALL, false); + false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false); /** * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary @@ -852,6 +868,7 @@ public final class CSVFormat implements Serializable { *

<li>{@code setAllowMissingColumnNames(true)}</li>
      * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
      * <li>{@code setAllowTrailingText(true)}</li>
+     * <li>{@code setAllowEofWithoutClosingQuote(true)}</li>
      * </ul>
      * <p>

    * Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and @@ -865,6 +882,7 @@ public final class CSVFormat implements Serializable { .setIgnoreEmptyLines(false) .setAllowMissingColumnNames(true) .setAllowTrailingText(true) + .setAllowEofWithoutClosingQuote(true) .build(); // @formatter:on @@ -1287,7 +1305,7 @@ public final class CSVFormat implements Serializable { */ public static CSVFormat newFormat(final char delimiter) { return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, - DuplicateHeaderMode.ALLOW_ALL, false); + DuplicateHeaderMode.ALLOW_ALL, false, false); } static String[] toStringArray(final Object[] values) { @@ -1329,6 +1347,8 @@ public final class CSVFormat implements Serializable { private final DuplicateHeaderMode duplicateHeaderMode; + private final boolean allowEofWithoutClosingQuote; + private final boolean allowMissingColumnNames; private final boolean allowTrailingText; @@ -1388,6 +1408,7 @@ public final class CSVFormat implements Serializable { this.quotedNullString = builder.quotedNullString; this.duplicateHeaderMode = builder.duplicateHeaderMode; this.allowTrailingText = builder.allowTrailingText; + this.allowEofWithoutClosingQuote = builder.allowEofWithoutClosingQuote; validate(); } @@ -1418,7 +1439,7 @@ public final class CSVFormat implements Serializable { final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush, - final DuplicateHeaderMode duplicateHeaderMode, final boolean allowTrailingText) { + final DuplicateHeaderMode duplicateHeaderMode, final 
boolean allowTrailingText, final boolean allowEofWithoutClosingQuote) { this.delimiter = delimiter; this.quoteCharacter = quoteChar; this.quoteMode = quoteMode; @@ -1439,6 +1460,7 @@ public final class CSVFormat implements Serializable { this.quotedNullString = quoteCharacter + nullString + quoteCharacter; this.duplicateHeaderMode = duplicateHeaderMode; this.allowTrailingText = allowTrailingText; + this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote; validate(); } @@ -1493,7 +1515,7 @@ public final class CSVFormat implements Serializable { Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim && - allowTrailingText == other.allowTrailingText; + allowTrailingText == other.allowTrailingText && allowEofWithoutClosingQuote == other.allowEofWithoutClosingQuote; } /** @@ -1527,6 +1549,16 @@ public final class CSVFormat implements Serializable { return duplicateHeaderMode == DuplicateHeaderMode.ALLOW_ALL; } + /** + * Gets whether the file can end before the last field on the last line, if quoted, has a closing quote. + * + * @return {@code true} if so, {@code false} to throw an {@link IOException}. + * @since 1.10.0 + */ + public boolean getAllowEofWithoutClosingQuote() { + return allowEofWithoutClosingQuote; + } + /** * Gets whether missing column names are allowed when parsing the header line. 
* @@ -1726,9 +1758,9 @@ public final class CSVFormat implements Serializable { int result = 1; result = prime * result + Arrays.hashCode(headers); result = prime * result + Arrays.hashCode(headerComments); - return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, allowTrailingText, autoFlush, commentMarker, delimiter, - escapeCharacter, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, - recordSeparator, skipHeaderRecord, trailingDelimiter, trim); + return prime * result + Objects.hash(duplicateHeaderMode, allowEofWithoutClosingQuote, allowMissingColumnNames, allowTrailingText, + autoFlush, commentMarker, delimiter, escapeCharacter, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, + nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, skipHeaderRecord, trailingDelimiter, trim); } /** diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index fd60b5ac..c43c52ed 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -58,6 +58,7 @@ final class Lexer implements Closeable { private final boolean ignoreSurroundingSpaces; private final boolean ignoreEmptyLines; private final boolean allowTrailingText; + private final boolean allowEofWithoutClosingQuote; /** The input stream */ private final ExtendedBufferedReader reader; @@ -74,6 +75,7 @@ final class Lexer implements Closeable { this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); this.ignoreEmptyLines = format.getIgnoreEmptyLines(); this.allowTrailingText = format.getAllowTrailingText(); + this.allowEofWithoutClosingQuote = format.getAllowEofWithoutClosingQuote(); this.delimiterBuf = new char[delimiter.length - 1]; this.escapeDelimiterBuf = new char[2 * delimiter.length - 1]; } @@ -378,9 +380,15 @@ final class Lexer implements Closeable { } } } else if 
(isEndOfFile(c)) { - // error condition (end of file before end of token) - throw new IOException("(startline " + startLineNumber + - ") EOF reached before encapsulated token finished"); + if (allowEofWithoutClosingQuote) { + token.type = EOF; + token.isReady = true; // There is data at EOF + return token; + } else { + // error condition (end of file before end of token) + throw new IOException("(startline " + startLineNumber + + ") EOF reached before encapsulated token finished"); + } } else { // consume character token.content.append((char) c); diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index 7edc7d86..85199072 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -441,7 +441,20 @@ public class LexerTest { assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\"")); } try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(false).build())) { - assertThrows(IOException.class, () -> lexer.nextToken(new Token())); + assertThrows(IOException.class, () -> parser.nextToken(new Token())); + } + } + + @Test + public void testEOFWithoutClosingQuote() throws Exception { + final String code = "a,\"b"; + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(true).build())) { + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); + assertThat(parser.nextToken(new Token()), matches(EOF, "b")); + } + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(false).build())) { + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); + assertThrows(IOException.class, () -> parser.nextToken(new Token())); } } }