diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 81b6f193..280f99f2 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -206,8 +206,12 @@ public final class CSVFormat implements Serializable { return new Builder(csvFormat); } + private boolean allowEofWithoutClosingQuote; + private boolean allowMissingColumnNames; + private boolean allowTrailingText; + private boolean autoFlush; private Character commentMarker; @@ -264,6 +268,8 @@ public final class CSVFormat implements Serializable { this.autoFlush = csvFormat.autoFlush; this.quotedNullString = csvFormat.quotedNullString; this.duplicateHeaderMode = csvFormat.duplicateHeaderMode; + this.allowTrailingText = csvFormat.allowTrailingText; + this.allowEofWithoutClosingQuote = csvFormat.allowEofWithoutClosingQuote; } /** @@ -288,6 +294,19 @@ public final class CSVFormat implements Serializable { return this; } + /** + * Sets whether the last field on the last line, if quoted, can have no closing quote when the file ends, {@code true} if this is ok, + * {@code false} if {@link IOException} should be thrown. + * + * @param allowEofWithoutClosingQuote whether to allow the last field on the last line to have a missing closing quote when the file ends, + * {@code true} if so, or {@code false} to cause an {@link IOException} to be thrown. + * @since 1.10.0 + */ + public Builder setAllowEofWithoutClosingQuote(final boolean allowEofWithoutClosingQuote) { + this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote; + return this; + } + /** * Sets the parser missing column names behavior, {@code true} to allow missing column names in the header line, {@code false} to cause an * {@link IllegalArgumentException} to be thrown. @@ -301,6 +320,20 @@ public final class CSVFormat implements Serializable { return this; } + /** + * Sets whether to allow trailing text in a quoted field, after the closing quote. + * + * @param allowTrailingText the trailing text behavior, {@code true} to append that text to the field contents, {@code false} to throw + * an {@link IOException}. + * + * @return This instance. + * @since 1.10.0 + */ + public Builder setAllowTrailingText(final boolean allowTrailingText) { + this.allowTrailingText = allowTrailingText; + return this; + } + /** * Sets whether to flush on close. * @@ -810,7 +843,7 @@ public final class CSVFormat implements Serializable { * @see Predefined#Default */ public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false, - false, false, false, DuplicateHeaderMode.ALLOW_ALL); + false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false); /** * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary @@ -834,6 +867,8 @@ public final class CSVFormat implements Serializable { *
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and @@ -846,6 +881,8 @@ public final class CSVFormat implements Serializable { public static final CSVFormat EXCEL = DEFAULT.builder() .setIgnoreEmptyLines(false) .setAllowMissingColumnNames(true) + .setAllowTrailingText(true) + .setAllowEofWithoutClosingQuote(true) .build(); // @formatter:on @@ -1268,7 +1305,7 @@ public final class CSVFormat implements Serializable { */ public static CSVFormat newFormat(final char delimiter) { return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, - DuplicateHeaderMode.ALLOW_ALL); + DuplicateHeaderMode.ALLOW_ALL, false, false); } static String[] toStringArray(final Object[] values) { @@ -1310,8 +1347,12 @@ public final class CSVFormat implements Serializable { private final DuplicateHeaderMode duplicateHeaderMode; + private final boolean allowEofWithoutClosingQuote; + private final boolean allowMissingColumnNames; + private final boolean allowTrailingText; + private final boolean autoFlush; private final Character commentMarker; // null if commenting is disabled @@ -1366,6 +1407,8 @@ public final class CSVFormat implements Serializable { this.autoFlush = builder.autoFlush; this.quotedNullString = builder.quotedNullString; this.duplicateHeaderMode = builder.duplicateHeaderMode; + this.allowTrailingText = builder.allowTrailingText; + this.allowEofWithoutClosingQuote = builder.allowEofWithoutClosingQuote; validate(); } @@ -1396,7 +1439,7 @@ public final class CSVFormat implements Serializable { final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush, - final DuplicateHeaderMode duplicateHeaderMode) { + final DuplicateHeaderMode duplicateHeaderMode, final boolean allowTrailingText, final boolean allowEofWithoutClosingQuote) { this.delimiter = delimiter; this.quoteCharacter = quoteChar; this.quoteMode = quoteMode; @@ -1416,6 +1459,8 @@ public final class CSVFormat implements Serializable { this.autoFlush = autoFlush; this.quotedNullString = quoteCharacter + nullString + quoteCharacter; this.duplicateHeaderMode = duplicateHeaderMode; + this.allowTrailingText = allowTrailingText; + this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote; validate(); } @@ -1469,7 +1514,8 @@ public final class CSVFormat implements Serializable { ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && - skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim; + skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim && + allowTrailingText == other.allowTrailingText && allowEofWithoutClosingQuote == other.allowEofWithoutClosingQuote; } /** @@ -1503,6 +1549,16 @@ public final class CSVFormat implements Serializable { return duplicateHeaderMode == DuplicateHeaderMode.ALLOW_ALL; } + /** + * Gets whether the file can end before the last field on the last line, if quoted, has a closing quote. + * + * @return {@code true} if so, {@code false} to throw an {@link IOException}. + * @since 1.10.0 + */ + public boolean getAllowEofWithoutClosingQuote() { + return allowEofWithoutClosingQuote; + } + /** * Gets whether missing column names are allowed when parsing the header line. * @@ -1512,6 +1568,16 @@ public final class CSVFormat implements Serializable { return allowMissingColumnNames; } + /** + * Gets whether quoted fields allow trailing text after the closing quote. + * + * @return {@code true} if allowed, {@code false} to throw an {@link IOException}. + * @since 1.10.0 + */ + public boolean getAllowTrailingText() { + return allowTrailingText; + } + /** * Gets whether to flush on close. * @@ -1692,9 +1758,9 @@ public final class CSVFormat implements Serializable { int result = 1; result = prime * result + Arrays.hashCode(headers); result = prime * result + Arrays.hashCode(headerComments); - return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter, - ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, - skipHeaderRecord, trailingDelimiter, trim); + return prime * result + Objects.hash(duplicateHeaderMode, allowEofWithoutClosingQuote, allowMissingColumnNames, allowTrailingText, + autoFlush, commentMarker, delimiter, escapeCharacter, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, + nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, skipHeaderRecord, trailingDelimiter, trim); } /** diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 06b2c9c2..c43c52ed 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -57,6 +57,8 @@ final class Lexer implements Closeable { private final boolean ignoreSurroundingSpaces; private final boolean ignoreEmptyLines; + private final boolean allowTrailingText; + private final boolean allowEofWithoutClosingQuote; /** The input stream */ private final ExtendedBufferedReader reader; @@ -72,6 +74,8 @@ final class Lexer implements Closeable { this.commentStart = mapNullToDisabled(format.getCommentMarker()); this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); this.ignoreEmptyLines = format.getIgnoreEmptyLines(); + this.allowTrailingText = format.getAllowTrailingText(); + this.allowEofWithoutClosingQuote = format.getAllowEofWithoutClosingQuote(); this.delimiterBuf = new char[delimiter.length - 1]; this.escapeDelimiterBuf = new char[2 * delimiter.length - 1]; } @@ -364,17 +368,27 @@ final class Lexer implements Closeable { token.type = EORECORD; return token; } - if (!Character.isWhitespace((char)c)) { - // error invalid char between token and next delimiter - throw new IOException("(line " + getCurrentLineNumber() + - ") invalid char between encapsulated token and delimiter"); + if (allowTrailingText) { + token.content.append((char) c); + } else { + if (!Character.isWhitespace((char)c)) { + // error invalid char between token and next delimiter + throw new IOException("(line " + getCurrentLineNumber() + + ") invalid char between encapsulated token and delimiter"); + } } } } } else if (isEndOfFile(c)) { - // error condition (end of file before end of token) - throw new IOException("(startline " + startLineNumber + - ") EOF reached before encapsulated token finished"); + if (allowEofWithoutClosingQuote) { + token.type = EOF; + token.isReady = true; // There is data at EOF + return token; + } else { + // error condition (end of file before end of token) + throw new IOException("(startline " + startLineNumber + + ") EOF reached before encapsulated token finished"); + } } else { // consume character token.content.append((char) c); diff --git a/src/test/java/org/apache/commons/csv/LexerTest.java b/src/test/java/org/apache/commons/csv/LexerTest.java index cc8d728a..85199072 100644 --- a/src/test/java/org/apache/commons/csv/LexerTest.java +++ b/src/test/java/org/apache/commons/csv/LexerTest.java @@ -431,4 +431,30 @@ public class LexerTest { lexer.trimTrailingSpaces(buffer); assertThat(lexer.nextToken(new Token()), matches(EOF, "")); } + + @Test + public void testTrailingTextAfterQuote() throws Exception { + final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\""; + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(true).build())) { + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b")); + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b")); + assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\"")); + } + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(false).build())) { + assertThrows(IOException.class, () -> parser.nextToken(new Token())); + } + } + + @Test + public void testEOFWithoutClosingQuote() throws Exception { + final String code = "a,\"b"; + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(true).build())) { + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); + assertThat(parser.nextToken(new Token()), matches(EOF, "b")); + } + try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(false).build())) { + assertThat(parser.nextToken(new Token()), matches(TOKEN, "a")); + assertThrows(IOException.class, () -> parser.nextToken(new Token())); + } + } }