CSV-264: Added DuplicateHeaderMode for flexibility with header strictness. (#114)

* csv-264: added duplicateheadermode for flexibility with header strictness

* fix: use assertthrows and update docs
This commit is contained in:
Seth Falco 2022-02-19 18:07:44 +01:00 committed by GitHub
parent caa59a3c09
commit 4fdfc59b1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 200 additions and 24 deletions

View File

@ -188,8 +188,6 @@ public final class CSVFormat implements Serializable {
return new Builder(csvFormat); return new Builder(csvFormat);
} }
private boolean allowDuplicateHeaderNames;
private boolean allowMissingColumnNames; private boolean allowMissingColumnNames;
private boolean autoFlush; private boolean autoFlush;
@ -198,6 +196,8 @@ public final class CSVFormat implements Serializable {
private String delimiter; private String delimiter;
private DuplicateHeaderMode duplicateHeaderMode;
private Character escapeCharacter; private Character escapeCharacter;
private String[] headerComments; private String[] headerComments;
@ -245,7 +245,7 @@ public final class CSVFormat implements Serializable {
this.trim = csvFormat.trim; this.trim = csvFormat.trim;
this.autoFlush = csvFormat.autoFlush; this.autoFlush = csvFormat.autoFlush;
this.quotedNullString = csvFormat.quotedNullString; this.quotedNullString = csvFormat.quotedNullString;
this.allowDuplicateHeaderNames = csvFormat.allowDuplicateHeaderNames; this.duplicateHeaderMode = csvFormat.duplicateHeaderMode;
} }
/** /**
@ -262,9 +262,23 @@ public final class CSVFormat implements Serializable {
* *
* @param allowDuplicateHeaderNames the duplicate header names behavior, true to allow, false to disallow. * @param allowDuplicateHeaderNames the duplicate header names behavior, true to allow, false to disallow.
* @return This instance. * @return This instance.
* @deprecated Use {@link #setDuplicateHeaderMode(DuplicateHeaderMode)}.
*/ */
@Deprecated
public Builder setAllowDuplicateHeaderNames(final boolean allowDuplicateHeaderNames) { public Builder setAllowDuplicateHeaderNames(final boolean allowDuplicateHeaderNames) {
this.allowDuplicateHeaderNames = allowDuplicateHeaderNames; final DuplicateHeaderMode mode = allowDuplicateHeaderNames ? DuplicateHeaderMode.ALLOW_ALL : DuplicateHeaderMode.ALLOW_EMPTY;
setDuplicateHeaderMode(mode);
return this;
}
/**
* Sets the duplicate header names behavior.
*
* @param duplicateHeaderMode the duplicate header names behavior
* @return This instance.
*/
public Builder setDuplicateHeaderMode(final DuplicateHeaderMode duplicateHeaderMode) {
this.duplicateHeaderMode = duplicateHeaderMode;
return this; return this;
} }
@ -760,7 +774,8 @@ public final class CSVFormat implements Serializable {
} }
/** /**
* Standard Comma Separated Value format, as for {@link #RFC4180} but allowing empty lines. * Standard Comma Separated Value format, as for {@link #RFC4180} but allowing
* empty lines.
* *
* <p> * <p>
* The {@link Builder} settings are: * The {@link Builder} settings are:
@ -770,13 +785,13 @@ public final class CSVFormat implements Serializable {
* <li>{@code setQuote('"')}</li> * <li>{@code setQuote('"')}</li>
* <li>{@code setRecordSeparator("\r\n")}</li> * <li>{@code setRecordSeparator("\r\n")}</li>
* <li>{@code setIgnoreEmptyLines(true)}</li> * <li>{@code setIgnoreEmptyLines(true)}</li>
* <li>{@code setAllowDuplicateHeaderNames(true)}</li> * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* </ul> * </ul>
* *
* @see Predefined#Default * @see Predefined#Default
*/ */
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false, public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, null, null, null, false, false, false,
false, false, false, true); false, false, false, DuplicateHeaderMode.ALLOW_ALL);
/** /**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary
@ -799,7 +814,7 @@ public final class CSVFormat implements Serializable {
* <li>{@code setRecordSeparator("\r\n")}</li> * <li>{@code setRecordSeparator("\r\n")}</li>
* <li>{@code setIgnoreEmptyLines(false)}</li> * <li>{@code setIgnoreEmptyLines(false)}</li>
* <li>{@code setAllowMissingColumnNames(true)}</li> * <li>{@code setAllowMissingColumnNames(true)}</li>
* <li>{@code setAllowDuplicateHeaderNames(true)}</li> * <li>{@code setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL)}</li>
* </ul> * </ul>
* <p> * <p>
* Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and * Note: This is currently like {@link #RFC4180} plus {@link Builder#setAllowMissingColumnNames(boolean) Builder#setAllowMissingColumnNames(true)} and
@ -1220,7 +1235,7 @@ public final class CSVFormat implements Serializable {
*/ */
public static CSVFormat newFormat(final char delimiter) { public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false, return new CSVFormat(String.valueOf(delimiter), null, null, null, null, false, false, null, null, null, null, false, false, false, false, false, false,
true); DuplicateHeaderMode.ALLOW_ALL);
} }
static String[] toStringArray(final Object[] values) { static String[] toStringArray(final Object[] values) {
@ -1262,7 +1277,7 @@ public final class CSVFormat implements Serializable {
return CSVFormat.Predefined.valueOf(format).getFormat(); return CSVFormat.Predefined.valueOf(format).getFormat();
} }
private final boolean allowDuplicateHeaderNames; private final DuplicateHeaderMode duplicateHeaderMode;
private final boolean allowMissingColumnNames; private final boolean allowMissingColumnNames;
@ -1319,7 +1334,7 @@ public final class CSVFormat implements Serializable {
this.trim = builder.trim; this.trim = builder.trim;
this.autoFlush = builder.autoFlush; this.autoFlush = builder.autoFlush;
this.quotedNullString = builder.quotedNullString; this.quotedNullString = builder.quotedNullString;
this.allowDuplicateHeaderNames = builder.allowDuplicateHeaderNames; this.duplicateHeaderMode = builder.duplicateHeaderMode;
validate(); validate();
} }
@ -1343,14 +1358,14 @@ public final class CSVFormat implements Serializable {
* @param trim TODO Doc me. * @param trim TODO Doc me.
* @param trailingDelimiter TODO Doc me. * @param trailingDelimiter TODO Doc me.
* @param autoFlush TODO Doc me. * @param autoFlush TODO Doc me.
* @param allowDuplicateHeaderNames TODO Doc me. * @param duplicateHeaderMode the behavior when handling duplicate headers
* @throws IllegalArgumentException if the delimiter is a line break character. * @throws IllegalArgumentException if the delimiter is a line break character.
*/ */
private CSVFormat(final String delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape, private CSVFormat(final String delimiter, final Character quoteChar, final QuoteMode quoteMode, final Character commentStart, final Character escape,
final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final boolean ignoreSurroundingSpaces, final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames,
final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush, final boolean ignoreHeaderCase, final boolean trim, final boolean trailingDelimiter, final boolean autoFlush,
final boolean allowDuplicateHeaderNames) { final DuplicateHeaderMode duplicateHeaderMode) {
this.delimiter = delimiter; this.delimiter = delimiter;
this.quoteCharacter = quoteChar; this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode; this.quoteMode = quoteMode;
@ -1369,7 +1384,7 @@ public final class CSVFormat implements Serializable {
this.trim = trim; this.trim = trim;
this.autoFlush = autoFlush; this.autoFlush = autoFlush;
this.quotedNullString = quoteCharacter + nullString + quoteCharacter; this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
this.allowDuplicateHeaderNames = allowDuplicateHeaderNames; this.duplicateHeaderMode = duplicateHeaderMode;
validate(); validate();
} }
@ -1416,7 +1431,7 @@ public final class CSVFormat implements Serializable {
return false; return false;
} }
final CSVFormat other = (CSVFormat) obj; final CSVFormat other = (CSVFormat) obj;
return allowDuplicateHeaderNames == other.allowDuplicateHeaderNames && allowMissingColumnNames == other.allowMissingColumnNames && return duplicateHeaderMode == other.duplicateHeaderMode && allowMissingColumnNames == other.allowMissingColumnNames &&
autoFlush == other.autoFlush && Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) && autoFlush == other.autoFlush && Objects.equals(commentMarker, other.commentMarker) && Objects.equals(delimiter, other.delimiter) &&
Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(header, other.header) && Objects.equals(escapeCharacter, other.escapeCharacter) && Arrays.equals(header, other.header) &&
Arrays.equals(headerComments, other.headerComments) && ignoreEmptyLines == other.ignoreEmptyLines && Arrays.equals(headerComments, other.headerComments) && ignoreEmptyLines == other.ignoreEmptyLines &&
@ -1450,9 +1465,21 @@ public final class CSVFormat implements Serializable {
* *
* @return whether duplicate header names are allowed * @return whether duplicate header names are allowed
* @since 1.7 * @since 1.7
* @deprecated Use {@link #getDuplicateHeaderMode()}.
*/ */
@Deprecated
public boolean getAllowDuplicateHeaderNames() { public boolean getAllowDuplicateHeaderNames() {
return allowDuplicateHeaderNames; return duplicateHeaderMode == DuplicateHeaderMode.ALLOW_ALL;
}
/**
* Gets how duplicate headers are handled.
*
* @return if duplicate header values are allowed, allowed conditionally, or disallowed.
* @since 1.9.0
*/
public DuplicateHeaderMode getDuplicateHeaderMode() {
return duplicateHeaderMode;
} }
/** /**
@ -1633,7 +1660,7 @@ public final class CSVFormat implements Serializable {
int result = 1; int result = 1;
result = prime * result + Arrays.hashCode(header); result = prime * result + Arrays.hashCode(header);
result = prime * result + Arrays.hashCode(headerComments); result = prime * result + Arrays.hashCode(headerComments);
return prime * result + Objects.hash(allowDuplicateHeaderNames, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter, return prime * result + Objects.hash(duplicateHeaderMode, allowMissingColumnNames, autoFlush, commentMarker, delimiter, escapeCharacter,
ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator, ignoreEmptyLines, ignoreHeaderCase, ignoreSurroundingSpaces, nullString, quoteCharacter, quoteMode, quotedNullString, recordSeparator,
skipHeaderRecord, trailingDelimiter, trim); skipHeaderRecord, trailingDelimiter, trim);
} }
@ -2235,7 +2262,7 @@ public final class CSVFormat implements Serializable {
} }
// validate header // validate header
if (header != null && !allowDuplicateHeaderNames) { if (header != null && duplicateHeaderMode != DuplicateHeaderMode.ALLOW_ALL) {
final Set<String> dupCheck = new HashSet<>(); final Set<String> dupCheck = new HashSet<>();
for (final String hdr : header) { for (final String hdr : header) {
if (!dupCheck.add(hdr)) { if (!dupCheck.add(hdr)) {
@ -2254,7 +2281,7 @@ public final class CSVFormat implements Serializable {
*/ */
@Deprecated @Deprecated
public CSVFormat withAllowDuplicateHeaderNames() { public CSVFormat withAllowDuplicateHeaderNames() {
return builder().setAllowDuplicateHeaderNames(true).build(); return builder().setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_ALL).build();
} }
/** /**
@ -2267,7 +2294,8 @@ public final class CSVFormat implements Serializable {
*/ */
@Deprecated @Deprecated
public CSVFormat withAllowDuplicateHeaderNames(final boolean allowDuplicateHeaderNames) { public CSVFormat withAllowDuplicateHeaderNames(final boolean allowDuplicateHeaderNames) {
return builder().setAllowDuplicateHeaderNames(allowDuplicateHeaderNames).build(); final DuplicateHeaderMode mode = allowDuplicateHeaderNames ? DuplicateHeaderMode.ALLOW_ALL : DuplicateHeaderMode.ALLOW_EMPTY;
return builder().setDuplicateHeaderMode(mode).build();
} }
/** /**

View File

@ -497,12 +497,16 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"A header name is missing in " + Arrays.toString(headerRecord)); "A header name is missing in " + Arrays.toString(headerRecord));
} }
// Note: This will always allow a duplicate header if the header is empty
final boolean containsHeader = header != null && hdrMap.containsKey(header); final boolean containsHeader = header != null && hdrMap.containsKey(header);
if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) { final DuplicateHeaderMode headerMode = this.format.getDuplicateHeaderMode();
final boolean duplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_ALL;
final boolean emptyDuplicatesAllowed = headerMode == DuplicateHeaderMode.ALLOW_EMPTY;
if (containsHeader && !duplicatesAllowed && !(emptyHeader && emptyDuplicatesAllowed)) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
String.format( String.format(
"The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.Builder.setDuplicateHeaderMode().",
header, Arrays.toString(headerRecord))); header, Arrays.toString(headerRecord)));
} }
if (header != null) { if (header != null) {

View File

@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.csv;
/**
* Determines how duplicate header fields should be handled
* if {@link CSVFormat#withHeader(String...)} is not null.
*
* @since 1.9.0
*/
public enum DuplicateHeaderMode {
/**
* Allows all duplicate headers.
*/
ALLOW_ALL,
/**
* Allows duplicate headers only if they're empty strings or null.
*/
ALLOW_EMPTY,
/**
* Disallows duplicate headers entirely.
*/
DISALLOW
}

View File

@ -19,5 +19,5 @@
"-//Checkstyle//DTD SuppressionFilter Configuration 1.2//EN" "-//Checkstyle//DTD SuppressionFilter Configuration 1.2//EN"
"https://checkstyle.org/dtds/suppressions_1_2.dtd"> "https://checkstyle.org/dtds/suppressions_1_2.dtd">
<suppressions> <suppressions>
<suppress checks="LineLength" files="[\\/]CSVParser\.java$" lines="511"/> <suppress checks="LineLength" files="[\\/]CSVParser\.java$" lines="515"/>
</suppressions> </suppressions>

View File

@ -260,6 +260,10 @@ public class CSVFormatTest {
final Object a = method.invoke(CSVFormat.DEFAULT, QuoteMode.MINIMAL); final Object a = method.invoke(CSVFormat.DEFAULT, QuoteMode.MINIMAL);
final Object b = method.invoke(CSVFormat.DEFAULT, QuoteMode.ALL); final Object b = method.invoke(CSVFormat.DEFAULT, QuoteMode.ALL);
assertNotEquals(name, type, a, b); assertNotEquals(name, type, a, b);
} else if ("org.apache.commons.csv.DuplicateHeaderMode".equals(type)) {
final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] {DuplicateHeaderMode.ALLOW_ALL});
final Object b = method.invoke(CSVFormat.DEFAULT, new Object[] {DuplicateHeaderMode.DISALLOW});
assertNotEquals(name, type, a, b);
} else if ("java.lang.Object[]".equals(type)){ } else if ("java.lang.Object[]".equals(type)){
final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] {new Object[] {null, null}}); final Object a = method.invoke(CSVFormat.DEFAULT, new Object[] {new Object[] {null, null}});
final Object b = method.invoke(CSVFormat.DEFAULT, new Object[] {new Object[] {new Object(), new Object()}}); final Object b = method.invoke(CSVFormat.DEFAULT, new Object[] {new Object[] {new Object(), new Object()}});
@ -1295,6 +1299,15 @@ public class CSVFormatTest {
} }
@Test
public void testWithEmptyDuplicates() {
final CSVFormat formatWithEmptyDuplicates =
CSVFormat.DEFAULT.builder().setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_EMPTY).build();
assertEquals(DuplicateHeaderMode.ALLOW_EMPTY, formatWithEmptyDuplicates.getDuplicateHeaderMode());
assertFalse(formatWithEmptyDuplicates.getAllowDuplicateHeaderNames());
}
@Test @Test
public void testWithEscapeCRThrowsExceptions() { public void testWithEscapeCRThrowsExceptions() {
assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.withEscape(CR)); assertThrows(IllegalArgumentException.class, () -> CSVFormat.DEFAULT.withEscape(CR));

View File

@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.csv.issues;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.DuplicateHeaderMode;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertThrows;
import java.io.IOException;
import java.io.StringReader;
/**
* When {@link CSVFormat#withHeader(String...)} is not null; duplicate headers
* with empty strings should not be allowed.
*
* @see <a href="https://issues.apache.org/jira/browse/CSV-264">Jira Ticker</a>
*/
public class JiraCsv264Test {
private static final String CSV_STRING = "\"\",\"B\",\"\"\n" +
"\"1\",\"2\",\"3\"\n" +
"\"4\",\"5\",\"6\"";
/**
* A CSV file with a random gap in the middle.
*/
private static final String CSV_STRING_GAP = "\"A\",\"B\",\"\",\"\",\"E\"\n" +
"\"1\",\"2\",\"\",\"\",\"5\"\n" +
"\"6\",\"7\",\"\",\"\",\"10\"";
@Test
public void testJiraCsv264() throws IOException {
final CSVFormat csvFormat = CSVFormat.DEFAULT
.builder()
.setHeader()
.setDuplicateHeaderMode(DuplicateHeaderMode.DISALLOW)
.setAllowMissingColumnNames(true)
.build();
try (StringReader reader = new StringReader(CSV_STRING)) {
assertThrows(IllegalArgumentException.class, () -> csvFormat.parse(reader));
}
}
@Test
public void testJiraCsv264WithGapAllowEmpty() throws IOException {
final CSVFormat csvFormat = CSVFormat.DEFAULT
.builder()
.setHeader()
.setDuplicateHeaderMode(DuplicateHeaderMode.ALLOW_EMPTY)
.setAllowMissingColumnNames(true)
.build();
try (StringReader reader = new StringReader(CSV_STRING_GAP)) {
csvFormat.parse(reader);
}
}
@Test
public void testJiraCsv264WithGapDisallow() throws IOException {
final CSVFormat csvFormat = CSVFormat.DEFAULT
.builder()
.setHeader()
.setDuplicateHeaderMode(DuplicateHeaderMode.DISALLOW)
.setAllowMissingColumnNames(true)
.build();
try (StringReader reader = new StringReader(CSV_STRING_GAP)) {
assertThrows(IllegalArgumentException.class, () -> csvFormat.parse(reader));
}
}
}