Merge pull request #295 from DamjanJovanovic/master

Add support for trailing text after the closing quote, and EOF without a final closing quote, for Excel compatibility
This commit is contained in:
Gary Gregory 2023-01-21 13:42:37 -05:00 committed by GitHub
commit b1bdb99c42
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 120 additions and 14 deletions

View File

@ -206,8 +206,12 @@ public final class CSVFormat implements Serializable {
return new Builder(csvFormat); return new Builder(csvFormat);
} }
private boolean allowEofWithoutClosingQuote;
private boolean allowMissingColumnNames; private boolean allowMissingColumnNames;
private boolean allowTrailingText;
private boolean autoFlush; private boolean autoFlush;
private Character commentMarker; private Character commentMarker;
@ -264,6 +268,8 @@ public final class CSVFormat implements Serializable {
this.autoFlush = csvFormat.autoFlush; this.autoFlush = csvFormat.autoFlush;
this.quotedNullString = csvFormat.quotedNullString; this.quotedNullString = csvFormat.quotedNullString;
this.duplicateHeaderMode = csvFormat.duplicateHeaderMode; this.duplicateHeaderMode = csvFormat.duplicateHeaderMode;
this.allowTrailingText = csvFormat.allowTrailingText;
this.allowEofWithoutClosingQuote = csvFormat.allowEofWithoutClosingQuote;
} }
/** /**
@ -288,6 +294,19 @@ public final class CSVFormat implements Serializable {
return this; return this;
} }
/**
* Sets whether a quoted last field may be left unterminated when the input ends.
* <p>
* This only concerns the last field on the last line: when it starts with a quote and the end of the file is reached before the closing
* quote, the input is either accepted or rejected depending on this setting.
* </p>
*
* @param allowEofWithoutClosingQuote {@code true} to accept a missing closing quote on the last field when the file ends,
* {@code false} to cause an {@link IOException} to be thrown.
* @return This instance.
* @since 1.10.0
*/
public Builder setAllowEofWithoutClosingQuote(final boolean allowEofWithoutClosingQuote) {
this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote;
return this;
}
/** /**
* Sets the parser missing column names behavior, {@code true} to allow missing column names in the header line, {@code false} to cause an * Sets the parser missing column names behavior, {@code true} to allow missing column names in the header line, {@code false} to cause an
* {@link IllegalArgumentException} to be thrown. * {@link IllegalArgumentException} to be thrown.
@ -301,6 +320,20 @@ public final class CSVFormat implements Serializable {
return this; return this;
} }
/**
* Sets whether to allow trailing text in a quoted field, after the closing quote.
* <p>
* When allowed, every character found between the closing quote and the next delimiter or end of line, whitespace included, is appended
* to the field contents. When not allowed, whitespace in that position is tolerated but any other character is an error.
* </p>
*
* @param allowTrailingText the trailing text behavior, {@code true} to append that text to the field contents, {@code false} to throw
* an {@link IOException} when a non-whitespace character follows the closing quote.
*
* @return This instance.
* @since 1.10.0
*/
public Builder setAllowTrailingText(final boolean allowTrailingText) {
this.allowTrailingText = allowTrailingText;
return this;
}
/** /**
* Sets whether to flush on close. * Sets whether to flush on close.
* *
@ -810,7 +843,7 @@ public final class CSVFormat implements Serializable {
* @see Predefined#Default * @see Predefined#Default
*/ */
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false, public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
false, false, false, DuplicateHeaderMode.ALLOW_ALL); false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false);
/** /**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary
@ -834,6 +867,8 @@ public final class CSVFormat implements Serializable {
* <li>{@code setIgnoreEmptyLines(false)}</li> * <li>{@code setIgnoreEmptyLines(false)}</li>
* <li>{@code setAllowMissingColumnNames(true)}</li> * <li>{@code setAllowMissingColumnNames(true)}</li>
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li> * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* <li>{@code setAllowTrailingText(true)}</li>
* <li>{@code setAllowEofWithoutClosingQuote(true)}</li>
* </ul> * </ul>
* <p> * <p>
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and * Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
@ -846,6 +881,8 @@ public final class CSVFormat implements Serializable {
public static final CSVFormat EXCEL = DEFAULT.builder() public static final CSVFormat EXCEL = DEFAULT.builder()
.setIgnoreEmptyLines(false) .setIgnoreEmptyLines(false)
.setAllowMissingColumnNames(true) .setAllowMissingColumnNames(true)
.setAllowTrailingText(true)
.setAllowEofWithoutClosingQuote(true)
.build(); .build();
// @formatter:on // @formatter:on
@ -1268,7 +1305,7 @@ public final class CSVFormat implements Serializable {
*/ */
public static CSVFormat newFormat(final char delimiter) { public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
DuplicateHeaderMode.ALLOW_ALL); DuplicateHeaderMode.ALLOW_ALL, false, false);
} }
static String[] toStringArray(final Object[] values) { static String[] toStringArray(final Object[] values) {
@ -1310,8 +1347,12 @@ public final class CSVFormat implements Serializable {
private final DuplicateHeaderMode duplicateHeaderMode; private final DuplicateHeaderMode duplicateHeaderMode;
private final boolean allowEofWithoutClosingQuote;
private final boolean allowMissingColumnNames; private final boolean allowMissingColumnNames;
private final boolean allowTrailingText;
private final boolean autoFlush; private final boolean autoFlush;
private final Character commentMarker; // null if commenting is disabled private final Character commentMarker; // null if commenting is disabled
@ -1366,6 +1407,8 @@ public final class CSVFormat implements Serializable {
this.autoFlush = builder.autoFlush; this.autoFlush = builder.autoFlush;
this.quotedNullString = builder.quotedNullString; this.quotedNullString = builder.quotedNullString;
this.duplicateHeaderMode = builder.duplicateHeaderMode; this.duplicateHeaderMode = builder.duplicateHeaderMode;
this.allowTrailingText = builder.allowTrailingText;
this.allowEofWithoutClosingQuote = builder.allowEofWithoutClosingQuote;
validate(); validate();
} }
@ -1396,7 +1439,7 @@ public final class CSVFormat implements Serializable {
final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush, final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
final DuplicateHeaderMode duplicateHeaderMode) { final DuplicateHeaderMode duplicateHeaderMode, final boolean allowTrailingText, final boolean allowEofWithoutClosingQuote) {
this.delimiter = delimiter; this.delimiter = delimiter;
this.quoteCharacter = quoteChar; this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode; this.quoteMode = quoteMode;
@ -1416,6 +1459,8 @@ public final class CSVFormat implements Serializable {
this.autoFlush = autoFlush; this.autoFlush = autoFlush;
this.quotedNullString = quoteCharacter + nullString + quoteCharacter; this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
this.duplicateHeaderMode = duplicateHeaderMode; this.duplicateHeaderMode = duplicateHeaderMode;
this.allowTrailingText = allowTrailingText;
this.allowEofWithoutClosingQuote = allowEofWithoutClosingQuote;
validate(); validate();
} }
@ -1469,7 +1514,8 @@ public final class CSVFormat implements Serializable {
ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces &&
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode && Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) && Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim; skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim &&
allowTrailingText == other.allowTrailingText && allowEofWithoutClosingQuote == other.allowEofWithoutClosingQuote;
} }
/** /**
@ -1503,6 +1549,16 @@ public final class CSVFormat implements Serializable {
return duplicateHeaderMode == DuplicateHeaderMode.ALLOW_ALL; return duplicateHeaderMode == DuplicateHeaderMode.ALLOW_ALL;
} }
/**
* Gets whether a quoted last field may be left unterminated when the input ends, that is, whether the file can end before the last field
* on the last line has its closing quote.
*
* @return {@code true} if a missing closing quote at the end of the file is accepted, {@code false} if parsing such input throws an
* {@link IOException}.
* @since 1.10.0
*/
public boolean getAllowEofWithoutClosingQuote() {
return allowEofWithoutClosingQuote;
}
/** /**
* Gets whether missing column names are allowed when parsing the header line. * Gets whether missing column names are allowed when parsing the header line.
* *
@ -1512,6 +1568,16 @@ public final class CSVFormat implements Serializable {
return allowMissingColumnNames; return allowMissingColumnNames;
} }
/**
* Gets whether quoted fields allow trailing text after the closing quote.
*
* @return {@code true} if text after the closing quote is allowed and appended to the field contents, {@code false} if a non-whitespace
* character after the closing quote causes an {@link IOException} to be thrown.
* @since 1.10.0
*/
public boolean getAllowTrailingText() {
return allowTrailingText;
}
/** /**
* Gets whether to flush on close. * Gets whether to flush on close.
* *
@ -1692,9 +1758,9 @@ public final class CSVFormat implements Serializable {
int result = 1; int result = 1;
result = prime * result + Arrays.hashCode(headers); result = prime * result + Arrays.hashCode(headers);
result = prime * result + Arrays.hashCode(headerComments); result = prime * result + Arrays.hashCode(headerComments);
return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter, return prime * result + Objects.hash(duplicateHeaderMode, allowEofWithoutClosingQuote, allowMissingColumnNames, allowTrailingText,
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, autoFlush, commentMarker, delimiter, escapeCharacter, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces,
skipHeaderRecord, trailingDelimiter, trim); nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, skipHeaderRecord, trailingDelimiter, trim);
} }
/** /**

View File

@ -57,6 +57,8 @@ final class Lexer implements Closeable {
private final boolean ignoreSurroundingSpaces; private final boolean ignoreSurroundingSpaces;
private final boolean ignoreEmptyLines; private final boolean ignoreEmptyLines;
private final boolean allowTrailingText;
private final boolean allowEofWithoutClosingQuote;
/** The input stream */ /** The input stream */
private final ExtendedBufferedReader reader; private final ExtendedBufferedReader reader;
@ -72,6 +74,8 @@ final class Lexer implements Closeable {
this.commentStart = mapNullToDisabled(format.getCommentMarker()); this.commentStart = mapNullToDisabled(format.getCommentMarker());
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
this.ignoreEmptyLines = format.getIgnoreEmptyLines(); this.ignoreEmptyLines = format.getIgnoreEmptyLines();
this.allowTrailingText = format.getAllowTrailingText();
this.allowEofWithoutClosingQuote = format.getAllowEofWithoutClosingQuote();
this.delimiterBuf = new char[delimiter.length - 1]; this.delimiterBuf = new char[delimiter.length - 1];
this.escapeDelimiterBuf = new char[2 * delimiter.length - 1]; this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
} }
@ -364,17 +368,27 @@ final class Lexer implements Closeable {
token.type = EORECORD; token.type = EORECORD;
return token; return token;
} }
if (!Character.isWhitespace((char)c)) { if (allowTrailingText) {
// error invalid char between token and next delimiter token.content.append((char) c);
throw new IOException("(line " + getCurrentLineNumber() + } else {
") invalid char between encapsulated token and delimiter"); if (!Character.isWhitespace((char)c)) {
// error invalid char between token and next delimiter
throw new IOException("(line " + getCurrentLineNumber() +
") invalid char between encapsulated token and delimiter");
}
} }
} }
} }
} else if (isEndOfFile(c)) { } else if (isEndOfFile(c)) {
// error condition (end of file before end of token) if (allowEofWithoutClosingQuote) {
throw new IOException("(startline " + startLineNumber + token.type = EOF;
") EOF reached before encapsulated token finished"); token.isReady = true; // There is data at EOF
return token;
} else {
// error condition (end of file before end of token)
throw new IOException("(startline " + startLineNumber +
") EOF reached before encapsulated token finished");
}
} else { } else {
// consume character // consume character
token.content.append((char) c); token.content.append((char) c);

View File

@ -431,4 +431,30 @@ public class LexerTest {
lexer.trimTrailingSpaces(buffer); lexer.trimTrailingSpaces(buffer);
assertThat(lexer.nextToken(new Token()), matches(EOF, "")); assertThat(lexer.nextToken(new Token()), matches(EOF, ""));
} }
/**
 * Checks that text after a closing quote is appended to the field when trailing text is allowed, and that the same input is rejected
 * with an {@link IOException} when it is not.
 */
@Test
public void testTrailingTextAfterQuote() throws Exception {
    final String input = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
    final CSVFormat lenientFormat = CSVFormat.Builder.create().setAllowTrailingText(true).build();
    final CSVFormat strictFormat = CSVFormat.Builder.create().setAllowTrailingText(false).build();

    // Lenient: everything between the closing quote and the delimiter becomes part of the field.
    try (Lexer lexer = createLexer(input, lenientFormat)) {
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "a b"));
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "a \" b"));
        assertThat(lexer.nextToken(new Token()), matches(EOF, "a b \"\""));
    }

    // Strict: the very first field already carries text after its closing quote.
    try (Lexer lexer = createLexer(input, strictFormat)) {
        assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
    }
}
/**
 * Checks that an unterminated quoted last field is returned as the final token when that is allowed, and that the same input is
 * rejected with an {@link IOException} when it is not.
 */
@Test
public void testEOFWithoutClosingQuote() throws Exception {
    final String input = "a,\"b";
    final CSVFormat lenientFormat = CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(true).build();
    final CSVFormat strictFormat = CSVFormat.Builder.create().setAllowEofWithoutClosingQuote(false).build();

    // Lenient: the text read before the end of the file is delivered with the EOF token.
    try (Lexer lexer = createLexer(input, lenientFormat)) {
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "a"));
        assertThat(lexer.nextToken(new Token()), matches(EOF, "b"));
    }

    // Strict: the first field is fine, the unterminated second field fails.
    try (Lexer lexer = createLexer(input, strictFormat)) {
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "a"));
        assertThrows(IOException.class, () -> lexer.nextToken(new Token()));
    }
}
} }