A different take on PR #303

Add support for trailing text after the closing quote, and EOF without a
final closing quote, for Excel compatibility. Fix a unit test and add a
RAT exclude for the sample CSV file.
This commit is contained in:
Gary Gregory 2024-03-12 19:41:43 -04:00
parent b069c2dcf9
commit c137e80407
6 changed files with 170 additions and 63 deletions

View File

@ -23,7 +23,7 @@
<version>67</version>
</parent>
<artifactId>commons-csv</artifactId>
<version>1.10.1-SNAPSHOT</version>
<version>1.11.0-SNAPSHOT</version>
<name>Apache Commons CSV</name>
<url>https://commons.apache.org/proper/commons-csv/</url>
<inceptionYear>2005</inceptionYear>
@ -161,7 +161,7 @@
</distributionManagement>
<properties>
<commons.release.version>1.10.1</commons.release.version>
<commons.release.version>1.11.0</commons.release.version>
<commons.release.desc>(Java 8 or above)</commons.release.desc>
<!-- The RC version used in the staging repository URL. -->
<commons.rc.version>RC1</commons.rc.version>

View File

@ -42,7 +42,9 @@
<body>
<release version="1.10.1" date="YYYY-MM-DD" description="Feature and bug fix release (Java 8 or above)">
<!-- ADD -->
<action issue="CSV-308" type="fix" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
<action issue="CSV-308" type="add" dev="ggregory" due-to="Buddhi De Silva, Gary Gregory">[Javadoc] Add example to CSVFormat#setHeaderComments() #344.</action>
<action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add and use CSVFormat#setTrailingData(boolean) in CSVFormat.EXCEL for Excel compatibility #303.</action>
<action type="add" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Add and use CSVFormat#setLenientEof(boolean) in CSVFormat.EXCEL for Excel compatibility #303.</action>
<!-- FIX -->
<action type="fix" issue="CSV-306" dev="ggregory" due-to="Sam Ng, Bruno P. Kinoshita">Replace deprecated method in user guide, update external link #324, #325.</action>
<action type="fix" dev="ggregory" due-to="Seth Falco, Bruno P. Kinoshita">Document duplicate header behavior #309.</action>
@ -53,6 +55,7 @@
<action type="fix" issue="CSV-311" dev="ggregory" due-to="Christian Feuersaenger, Gary Gregory">OutOfMemory for very long rows despite using column value of type Reader.</action>
<action type="fix" dev="ggregory" due-to="Gary Gregory">Use try-with-resources to manage JDBC Clob in CSVPrinter.printRecords(ResultSet).</action>
<action type="fix" dev="ggregory" due-to="Gary Gregory">JDBC Blob columns are now output as Base64 instead of Object#toString(), which usually is InputStream#toString().</action>
<action type="fix" dev="ggregory" due-to="DamjanJovanovic, Gary Gregory">Support unusual Excel use cases: Add support for trailing data after the closing quote, and EOF without a final closing quote #303.</action>
<!-- UPDATE -->
<action type="update" dev="ggregory" due-to="Gary Gregory">Bump commons-io:commons-io: from 2.11.0 to 2.15.1.</action>
<action type="update" dev="ggregory" due-to="Gary Gregory, Dependabot">Bump commons-parent from 57 to 67.</action>

View File

@ -248,6 +248,10 @@ public final class CSVFormat implements Serializable {
private boolean skipHeaderRecord;
private boolean lenientEof;
private boolean trailingData;
private boolean trailingDelimiter;
private boolean trim;
@ -267,6 +271,8 @@ public final class CSVFormat implements Serializable {
this.headers = csvFormat.headers;
this.skipHeaderRecord = csvFormat.skipHeaderRecord;
this.ignoreHeaderCase = csvFormat.ignoreHeaderCase;
this.lenientEof = csvFormat.lenientEof;
this.trailingData = csvFormat.trailingData;
this.trailingDelimiter = csvFormat.trailingDelimiter;
this.trim = csvFormat.trim;
this.autoFlush = csvFormat.autoFlush;
@ -689,6 +695,18 @@ public final class CSVFormat implements Serializable {
return this;
}
/**
* Sets whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
*
* @param lenientEof whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
* @return This instance.
* @since 1.11.0
*/
public Builder setLenientEof(final boolean lenientEof) {
this.lenientEof = lenientEof;
return this;
}
/**
* Sets the String to convert to and from {@code null}. No substitution occurs if {@code null}.
*
@ -785,6 +803,18 @@ public final class CSVFormat implements Serializable {
return this;
}
/**
* Sets whether reading trailing data is allowed in records, helps Excel compatibility.
*
* @param trailingData whether reading trailing data is allowed in records, helps Excel compatibility.
* @return This instance.
* @since 1.11.0
*/
public Builder setTrailingData(final boolean trailingData) {
this.trailingData = trailingData;
return this;
}
/**
* Sets whether to add a trailing delimiter.
*
@ -914,7 +944,7 @@ public final class CSVFormat implements Serializable {
* @see Predefined#Default
*/
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
false, false, false, DuplicateHeaderMode.ALLOW_ALL);
false, false, false, DuplicateHeaderMode.ALLOW_ALL, false, false);
/**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale-dependent, it might be necessary
@ -935,9 +965,11 @@ public final class CSVFormat implements Serializable {
* <li>{@code setDelimiter(',')}</li>
* <li>{@code setQuote('"')}</li>
* <li>{@code setRecordSeparator("\r\n")}</li>
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* <li>{@code setIgnoreEmptyLines(false)}</li>
* <li>{@code setAllowMissingColumnNames(true)}</li>
* <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* <li>{@code setTrailingData(true)}</li>
* <li>{@code setLenientEof(true)}</li>
* </ul>
* <p>
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
@ -950,6 +982,8 @@ public final class CSVFormat implements Serializable {
public static final CSVFormat EXCEL = DEFAULT.builder()
.setIgnoreEmptyLines(false)
.setAllowMissingColumnNames(true)
.setTrailingData(true)
.setLenientEof(true)
.build();
// @formatter:on
@ -1372,7 +1406,7 @@ public final class CSVFormat implements Serializable {
*/
public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
DuplicateHeaderMode.ALLOW_ALL);
DuplicateHeaderMode.ALLOW_ALL, false, false);
}
static String[] toStringArray(final Object[] values) {
@ -1455,6 +1489,10 @@ public final class CSVFormat implements Serializable {
private final boolean skipHeaderRecord;
private final boolean lenientEof;
private final boolean trailingData;
private final boolean trailingDelimiter;
private final boolean trim;
@ -1474,6 +1512,8 @@ public final class CSVFormat implements Serializable {
this.headers = builder.headers;
this.skipHeaderRecord = builder.skipHeaderRecord;
this.ignoreHeaderCase = builder.ignoreHeaderCase;
this.lenientEof = builder.lenientEof;
this.trailingData = builder.trailingData;
this.trailingDelimiter = builder.trailingDelimiter;
this.trim = builder.trim;
this.autoFlush = builder.autoFlush;
@ -1494,22 +1534,24 @@ public final class CSVFormat implements Serializable {
* @param ignoreEmptyLines {@code true} when the parser should skip empty lines.
* @param recordSeparator the line separator to use for output.
* @param nullString the line separator to use for output.
* @param headerComments the comments to be printed by the Printer before the actual CSV data.
* @param header the header
* @param skipHeaderRecord if {@code true} the header row will be skipped
* @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line
* @param ignoreHeaderCase if {@code true} header names will be accessed ignoring case when parsing input
* @param trim if {@code true} next record value will be trimmed
* @param trailingDelimiter if {@code true} the trailing delimiter wil be added before record separator (if set)
* @param autoFlush if {@code true} the underlying stream will be flushed before closing
* @param duplicateHeaderMode the behavior when handling duplicate headers
* @param headerComments the comments to be printed by the Printer before the actual CSV data..
* @param header the header.
* @param skipHeaderRecord if {@code true} the header row will be skipped.
* @param allowMissingColumnNames if {@code true} the missing column names are allowed when parsing the header line.
* @param ignoreHeaderCase if {@code true} header names will be accessed ignoring case when parsing input.
* @param trim if {@code true} next record value will be trimmed.
* @param trailingDelimiter if {@code true} the trailing delimiter wil be added before record separator (if set)..
* @param autoFlush if {@code true} the underlying stream will be flushed before closing.
* @param duplicateHeaderMode the behavior when handling duplicate headers.
* @param trailingData whether reading trailing data is allowed in records, helps Excel compatibility.
* @param lenientEof whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
* @throws IllegalArgumentException if the delimiter is a line break character.
*/
private CSVFormat(final String delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape,
final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
final DuplicateHeaderMode duplicateHeaderMode) {
final DuplicateHeaderMode duplicateHeaderMode, final boolean trailingData, final boolean lenientEof) {
this.delimiter = delimiter;
this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode;
@ -1524,6 +1566,8 @@ public final class CSVFormat implements Serializable {
this.headers = clone(header);
this.skipHeaderRecord = skipHeaderRecord;
this.ignoreHeaderCase = ignoreHeaderCase;
this.lenientEof = lenientEof;
this.trailingData = trailingData;
this.trailingDelimiter = trailingDelimiter;
this.trim = trim;
this.autoFlush = autoFlush;
@ -1571,18 +1615,23 @@ public final class CSVFormat implements Serializable {
if (this == obj) {
return true;
}
if (obj == null || getClass() != obj.getClass()) {
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final CSVFormat other = (CSVFormat) obj;
return duplicateHeaderMode == other.duplicateHeaderMode && allowMissingColumnNames == other.allowMissingColumnNames &&
autoFlush == other.autoFlush && Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(headers, other.headers) &&
Arrays.equals(headerComments, other.headerComments) && ignoreEmptyLines == other.ignoreEmptyLines &&
ignoreHeaderCase == other.ignoreHeaderCase && ignoreSurroundingSpaces == other.ignoreSurroundingSpaces &&
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) && quoteMode == other.quoteMode &&
Objects.equals(quotedNullString, other.quotedNullString) && Objects.equals(recordSeparator, other.recordSeparator) &&
skipHeaderRecord == other.skipHeaderRecord && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
return allowMissingColumnNames == other.allowMissingColumnNames && autoFlush == other.autoFlush &&
Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
duplicateHeaderMode == other.duplicateHeaderMode && Objects.equals(escapeCharacter, other.escapeCharacter) &&
Arrays.equals(headerComments, other.headerComments) && Arrays.equals(headers, other.headers) &&
ignoreEmptyLines == other.ignoreEmptyLines && ignoreHeaderCase == other.ignoreHeaderCase &&
ignoreSurroundingSpaces == other.ignoreSurroundingSpaces && lenientEof == other.lenientEof &&
Objects.equals(nullString, other.nullString) && Objects.equals(quoteCharacter, other.quoteCharacter) &&
quoteMode == other.quoteMode && Objects.equals(quotedNullString, other.quotedNullString) &&
Objects.equals(recordSeparator, other.recordSeparator) && skipHeaderRecord == other.skipHeaderRecord &&
trailingData == other.trailingData && trailingDelimiter == other.trailingDelimiter && trim == other.trim;
}
private void escape(final char c, final Appendable appendable) throws IOException {
@ -1808,6 +1857,16 @@ public final class CSVFormat implements Serializable {
return ignoreSurroundingSpaces;
}
/**
* Gets whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
*
* @return whether reading end-of-file is allowed even when input is malformed, helps Excel compatibility.
* @since 1.11.0
*/
public boolean getLenientEof() {
return lenientEof;
}
/**
* Gets the String to convert to and from {@code null}.
* <ul>
@ -1857,6 +1916,16 @@ public final class CSVFormat implements Serializable {
return skipHeaderRecord;
}
/**
* Gets whether reading trailing data is allowed in records, helps Excel compatibility.
*
* @return whether reading trailing data is allowed in records, helps Excel compatibility.
* @since 1.11.0
*/
public boolean getTrailingData() {
return trailingData;
}
/**
* Gets whether to add a trailing delimiter.
*
@ -1881,11 +1950,12 @@ public final class CSVFormat implements Serializable {
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + Arrays.hashCode(headers);
result = prime * result + Arrays.hashCode(headerComments);
return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter,
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator,
skipHeaderRecord, trailingDelimiter, trim);
result = prime * result + Arrays.hashCode(headers);
result = prime * result + Objects.hash(allowMissingColumnNames, autoFlush, commentMarker, delimiter, duplicateHeaderMode, escapeCharacter,
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, lenientEof, nullString, quoteCharacter, quoteMode, quotedNullString,
recordSeparator, skipHeaderRecord, trailingData, trailingDelimiter, trim);
return result;
}
/**
@ -2006,6 +2076,26 @@ public final class CSVFormat implements Serializable {
return new CSVPrinter(new OutputStreamWriter(new FileOutputStream(out), charset), this);
}
private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
// InputStream is never null here
// There is nothing to escape when quoting is used which is the default.
if (!newRecord) {
append(getDelimiterString(), out);
}
final boolean quoteCharacterSet = isQuoteCharacterSet();
if (quoteCharacterSet) {
append(getQuoteCharacter().charValue(), out);
}
// Stream the input to the output without reading or holding the whole value in memory.
// AppendableOutputStream cannot "close" an Appendable.
try (OutputStream outputStream = new Base64OutputStream(new AppendableOutputStream<>(out))) {
IOUtils.copy(inputStream, outputStream);
}
if (quoteCharacterSet) {
append(getQuoteCharacter().charValue(), out);
}
}
/**
* Prints the {@code value} as the next value on the line to {@code out}. The value will be escaped or encapsulated as needed. Useful when one wants to
* avoid creating CSVPrinters. Trims the value if {@link #getTrim()} is true.
@ -2081,26 +2171,6 @@ public final class CSVFormat implements Serializable {
return print(Files.newBufferedWriter(out, charset));
}
private void print(final InputStream inputStream, final Appendable out, final boolean newRecord) throws IOException {
// InputStream is never null here
// There is nothing to escape when quoting is used which is the default.
if (!newRecord) {
append(getDelimiterString(), out);
}
final boolean quoteCharacterSet = isQuoteCharacterSet();
if (quoteCharacterSet) {
append(getQuoteCharacter().charValue(), out);
}
// Stream the input to the output without reading or holding the whole value in memory.
// AppendableOutputStream cannot "close" an Appendable.
try (OutputStream outputStream = new Base64OutputStream(new AppendableOutputStream<>(out))) {
IOUtils.copy(inputStream, outputStream);
}
if (quoteCharacterSet) {
append(getQuoteCharacter().charValue(), out);
}
}
private void print(final Reader reader, final Appendable out, final boolean newRecord) throws IOException {
// Reader is never null here
if (!newRecord) {

View File

@ -53,9 +53,10 @@ final class Lexer implements Closeable {
private final char escape;
private final char quoteChar;
private final char commentStart;
private final boolean ignoreSurroundingSpaces;
private final boolean ignoreEmptyLines;
private final boolean lenientEof;
private final boolean trailingData;
/** The input stream */
private final ExtendedBufferedReader reader;
@ -71,6 +72,8 @@ final class Lexer implements Closeable {
this.commentStart = mapNullToDisabled(format.getCommentMarker());
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
this.lenientEof = format.getLenientEof();
this.trailingData = format.getTrailingData();
this.delimiterBuf = new char[delimiter.length - 1];
this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
}
@ -364,14 +367,21 @@ final class Lexer implements Closeable {
token.type = EORECORD;
return token;
}
if (!Character.isWhitespace((char)c)) {
if (trailingData) {
token.content.append((char) c);
} else if (!Character.isWhitespace((char) c)) {
// error invalid char between token and next delimiter
throw new IOException("Invalid char between encapsulated token and delimiter at line: " +
getCurrentLineNumber() + ", position: " + getCharacterPosition());
throw new IOException(String.format("Invalid char between encapsulated token and delimiter at line: %,d, position: %,d",
getCurrentLineNumber(), getCharacterPosition()));
}
}
}
} else if (isEndOfFile(c)) {
if (lenientEof) {
token.type = Token.Type.EOF;
token.isReady = true; // There is data at EOF
return token;
}
// error condition (end of file before end of token)
throw new IOException("(startline " + startLineNumber +
") EOF reached before encapsulated token finished");

View File

@ -299,7 +299,6 @@ public class CSVParserTest {
}
@Test
@Disabled("PR 295 does not work")
public void testCSV141Excel() throws Exception {
testCSV141Ok(CSVFormat.EXCEL);
}
@ -357,16 +356,15 @@ public class CSVParserTest {
record = parser.nextRecord();
assertEquals("1414770318327", record.get(0));
assertEquals("android.widget.EditText", record.get(1));
assertEquals("pass sem1", record.get(2));
assertEquals(3, record.size());
// row 4
assertEquals("pass sem1\n1414770318628\"", record.get(2));
assertEquals("android.widget.EditText", record.get(3));
assertEquals("pass sem1 _84*|*", record.get(4));
assertEquals("0", record.get(5));
assertEquals("pass sem1\n", record.get(6));
assertEquals(7, record.size());
// EOF
record = parser.nextRecord();
assertEquals("1414770318628", record.get(0));
assertEquals("android.widget.EditText", record.get(1));
assertEquals("pass sem1 _84*|*", record.get(2));
assertEquals("0", record.get(3));
assertEquals("pass sem1", record.get(4));
assertEquals(5, record.size());
assertNull(record);
}
}

View File

@ -196,6 +196,19 @@ public class LexerTest {
}
}
@Test
public void testEOFWithoutClosingQuote() throws Exception {
final String code = "a,\"b";
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(true).build())) {
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
assertThat(parser.nextToken(new Token()), matches(EOF, "b"));
}
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setLenientEof(false).build())) {
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
assertThrows(IOException.class, () -> parser.nextToken(new Token()));
}
}
@Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
public void testEscapedBackspace() throws Exception {
try (final Lexer lexer = createLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping)) {
@ -423,6 +436,19 @@ public class LexerTest {
}
}
@Test
public void testTrailingTextAfterQuote() throws Exception {
final String code = "\"a\" b,\"a\" \" b,\"a\" b \"\"";
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(true).build())) {
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a b"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a \" b"));
assertThat(parser.nextToken(new Token()), matches(EOF, "a b \"\""));
}
try (final Lexer parser = createLexer(code, CSVFormat.Builder.create().setTrailingData(false).build())) {
assertThrows(IOException.class, () -> parser.nextToken(new Token()));
}
}
@Test
public void testTrimTrailingSpacesZeroLength() throws Exception {
final StringBuilder buffer = new StringBuilder("");