Add support for trailing text after the closing quote, for Excel compatibility.

This commit is contained in:
Damjan Jovanovic 2022-12-27 04:38:29 +02:00
parent 86b2bfa9c1
commit ed0ca22321
3 changed files with 64 additions and 11 deletions

View File

@ -208,6 +208,8 @@ public final class CSVFormat implements Serializable {
private boolean allowMissingColumnNames; private boolean allowMissingColumnNames;
private boolean allowTrailingText;
private boolean autoFlush; private boolean autoFlush;
private Character commentMarker; private Character commentMarker;
@ -264,6 +266,7 @@ public final class CSVFormat implements Serializable {
this.autoFlush = csvFormat.autoFlush; this.autoFlush = csvFormat.autoFlush;
this.quotedNullString = csvFormat.quotedNullString; this.quotedNullString = csvFormat.quotedNullString;
this.duplicateHeaderMode = csvFormat.duplicateHeaderMode; this.duplicateHeaderMode = csvFormat.duplicateHeaderMode;
this.allowTrailingText = csvFormat.allowTrailingText;
} }
/** /**
@ -301,6 +304,20 @@ public final class CSVFormat implements Serializable {
return this; return this;
} }
/**
 * Controls how the parser treats characters that follow the closing quote of a quoted field.
 * <p>
 * When enabled, such characters are appended to the field value. When disabled, a non-whitespace
 * character found between the closing quote and the next delimiter makes parsing fail with an
 * {@link IOException}.
 * </p>
 *
 * @param allowTrailingText {@code true} to keep text after the closing quote as part of the field, {@code false} to reject it.
 *
 * @return This instance.
 * @since 1.10.0
 */
public Builder setAllowTrailingText(final boolean allowTrailingText) {
    this.allowTrailingText = allowTrailingText;
    return this;
}
/** /**
* Sets whether to flush on close. * Sets whether to flush on close.
* *
@ -810,7 +827,7 @@ public final class CSVFormat implements Serializable {
* @see Predefined#Default * @see Predefined#Default
*/ */
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false, public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
false, false, false, DuplicateHeaderMode.ALLOW_ALL); false, false, false, DuplicateHeaderMode.ALLOW_ALL, false);
/** /**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary
@ -834,6 +851,7 @@ public final class CSVFormat implements Serializable {
* <li>{@code setIgnoreEmptyLines(false)}</li> * <li>{@code setIgnoreEmptyLines(false)}</li>
* <li>{@code setAllowMissingColumnNames(true)}</li> * <li>{@code setAllowMissingColumnNames(true)}</li>
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li> * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* <li>{@code setAllowTrailingText(true)}</li>
* </ul> * </ul>
* <p> * <p>
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and * Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
@ -846,6 +864,7 @@ public final class CSVFormat implements Serializable {
public static final CSVFormat EXCEL = DEFAULT.builder() public static final CSVFormat EXCEL = DEFAULT.builder()
.setIgnoreEmptyLines(false) .setIgnoreEmptyLines(false)
.setAllowMissingColumnNames(true) .setAllowMissingColumnNames(true)
.setAllowTrailingText(true)
.build(); .build();
// @formatter:on // @formatter:on
@ -1268,7 +1287,7 @@ public final class CSVFormat implements Serializable {
*/ */
public static CSVFormat newFormat(final char delimiter) { public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
DuplicateHeaderMode.ALLOW_ALL); DuplicateHeaderMode.ALLOW_ALL, false);
} }
static String[] toStringArray(final Object[] values) { static String[] toStringArray(final Object[] values) {
@ -1312,6 +1331,8 @@ public final class CSVFormat implements Serializable {
private final boolean allowMissingColumnNames; private final boolean allowMissingColumnNames;
private final boolean allowTrailingText;
private final boolean autoFlush; private final boolean autoFlush;
private final Character commentMarker; // null if commenting is disabled private final Character commentMarker; // null if commenting is disabled
@ -1366,6 +1387,7 @@ public final class CSVFormat implements Serializable {
this.autoFlush = builder.autoFlush; this.autoFlush = builder.autoFlush;
this.quotedNullString = builder.quotedNullString; this.quotedNullString = builder.quotedNullString;
this.duplicateHeaderMode = builder.duplicateHeaderMode; this.duplicateHeaderMode = builder.duplicateHeaderMode;
this.allowTrailingText = builder.allowTrailingText;
validate(); validate();
} }
@ -1396,7 +1418,7 @@ public final class CSVFormat implements Serializable {
final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush, final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
final DuplicateHeaderMode duplicateHeaderMode) { final DuplicateHeaderMode duplicateHeaderMode, final boolean allowTrailingText) {
this.delimiter = delimiter; this.delimiter = delimiter;
this.quoteCharacter = quoteChar; this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode; this.quoteMode = quoteMode;
@ -1416,6 +1438,7 @@ public final class CSVFormat implements Serializable {
this.autoFlush = autoFlush; this.autoFlush = autoFlush;
this.quotedNullString = quoteCharacter + nullString + quoteCharacter; this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
this.duplicateHeaderMode = duplicateHeaderMode; this.duplicateHeaderMode = duplicateHeaderMode;
this.allowTrailingText = allowTrailingText;
validate(); validate();
} }
@ -1469,7 +1492,8 @@ public final class CSVFormat implements Serializable {
ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces &&
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim; skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim &&
allowTrailingText == other.allowTrailingText;
} }
/** /**
@ -1512,6 +1536,16 @@ public final class CSVFormat implements Serializable {
return allowMissingColumnNames; return allowMissingColumnNames;
} }
/**
 * Tells whether text following the closing quote of a quoted field is accepted.
 *
 * @return {@code true} if trailing text is appended to the field, {@code false} if it is rejected with an {@link IOException}.
 * @since 1.10.0
 */
public boolean getAllowTrailingText() {
    return this.allowTrailingText;
}
/** /**
* Gets whether to flush on close. * Gets whether to flush on close.
* *
@ -1692,9 +1726,9 @@ public final class CSVFormat implements Serializable {
int result = 1; int result = 1;
result = prime * result + Arrays.hashCode(headers); result = prime * result + Arrays.hashCode(headers);
result = prime * result + Arrays.hashCode(headerComments); result = prime * result + Arrays.hashCode(headerComments);
return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter, return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, allowTrailingText, autoFlush, commentMarker, delimiter,
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, escapeCharacter, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString,
skipHeaderRecord, trailingDelimiter, trim); recordSeparator, skipHeaderRecord, trailingDelimiter, trim);
} }
/** /**

View File

@ -57,6 +57,7 @@ final class Lexer implements Closeable {
private final boolean ignoreSurroundingSpaces; private final boolean ignoreSurroundingSpaces;
private final boolean ignoreEmptyLines; private final boolean ignoreEmptyLines;
private final boolean allowTrailingText;
/** The input stream */ /** The input stream */
private final ExtendedBufferedReader reader; private final ExtendedBufferedReader reader;
@ -72,6 +73,7 @@ final class Lexer implements Closeable {
this.commentStart = mapNullToDisabled(format.getCommentMarker()); this.commentStart = mapNullToDisabled(format.getCommentMarker());
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
this.ignoreEmptyLines = format.getIgnoreEmptyLines(); this.ignoreEmptyLines = format.getIgnoreEmptyLines();
this.allowTrailingText = format.getAllowTrailingText();
this.delimiterBuf = new char[delimiter.length - 1]; this.delimiterBuf = new char[delimiter.length - 1];
this.escapeDelimiterBuf = new char[2 * delimiter.length - 1]; this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
} }
@ -364,6 +366,9 @@ final class Lexer implements Closeable {
token.type = EORECORD; token.type = EORECORD;
return token; return token;
} }
if (allowTrailingText) {
token.content.append((char) c);
} else {
if (!Character.isWhitespace((char)c)) { if (!Character.isWhitespace((char)c)) {
// error invalid char between token and next delimiter // error invalid char between token and next delimiter
throw new IOException("(line " + getCurrentLineNumber() + throw new IOException("(line " + getCurrentLineNumber() +
@ -371,6 +376,7 @@ final class Lexer implements Closeable {
} }
} }
} }
}
} else if (isEndOfFile(c)) { } else if (isEndOfFile(c)) {
// error condition (end of file before end of token) // error condition (end of file before end of token)
throw new IOException("(startline " + startLineNumber + throw new IOException("(startline " + startLineNumber +

View File

@ -431,4 +431,17 @@ public class LexerTest {
lexer.trimTrailingSpaces(buffer); lexer.trimTrailingSpaces(buffer);
assertThat(lexer.nextToken(new Token()), matches(EOF, "")); assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
} }
/**
 * Verifies lexer behavior for text that follows the closing quote of a quoted field, with
 * {@code allowTrailingText} both enabled and disabled.
 */
@Test
public void testTrailingTextAfterQuote() throws Exception {
    final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
    // Enabled: everything after the closing quote is appended to the token content.
    try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(true).build())) {
        assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
        assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
        assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
    }
    // Disabled: the first non-whitespace character after the closing quote must be rejected.
    // The assertion has to run against the lexer created here (parser), not some other instance.
    try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setAllowTrailingText(false).build())) {
        assertThrows(IOException.class, () -> parser.nextToken(new Token()));
    }
}
} }