[CSV-239] Add CSVRecord.getHeaderNames and allow duplicate headers (#41)

* [CSV-239] Cannot get headers in column order from CSVRecord.
* getHeaderNames returns all headers in column order including repeats which are allowed as per RFC 4180
* add CSVFormat.withAllowDuplicateHeaderNames()

* [CSV-239] Cannot get headers in column order from CSVRecord.
* only wrap headerNames with unmodifiableList if non-empty
* fix and enhance CSVRecord.toMap javadoc

* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix exception messages

* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix whitespace

* [CSV-239] Cannot get headers in column order from CSVRecord.
* simplify if statement

* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix indentation
* add javadoc to Headers class
* rename method to createHeaders
* use String.format to build error message
* initialize header names List with appropriate size
This commit is contained in:
Dave Moten 2019-05-24 22:11:17 +10:00 committed by Gary Gregory
parent 4d2616b7a5
commit 030fb8e37c
4 changed files with 128 additions and 39 deletions

View File

@ -260,12 +260,13 @@ public final class CSVFormat implements Serializable {
* <li>{@code withQuote('"')}</li>
* <li>{@code withRecordSeparator("\r\n")}</li>
* <li>{@code withIgnoreEmptyLines(true)}</li>
* <li>{@code withAllowDuplicateHeaderNames(true)}</li>
* </ul>
*
* @see Predefined#Default
*/
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF,
null, null, null, false, false, false, false, false, false);
null, null, null, false, false, false, false, false, false, true);
/**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is
@ -288,6 +289,7 @@ public final class CSVFormat implements Serializable {
* <li>{@code {@link #withRecordSeparator(String) withRecordSeparator("\r\n")}}</li>
* <li>{@code {@link #withIgnoreEmptyLines(boolean) withIgnoreEmptyLines(false)}}</li>
* <li>{@code {@link #withAllowMissingColumnNames(boolean) withAllowMissingColumnNames(true)}}</li>
* <li>{@code {@link #withAllowDuplicateHeaderNames(boolean) withAllowDuplicateHeaderNames(true)}}</li>
* </ul>
* <p>
* Note: This is currently like {@link #RFC4180} plus {@link #withAllowMissingColumnNames(boolean)
@ -671,7 +673,7 @@ public final class CSVFormat implements Serializable {
*/
public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false,
false, false, false, false);
false, false, false, false, true);
}
/**
@ -721,6 +723,8 @@ public final class CSVFormat implements Serializable {
private final boolean trim;
private final boolean autoFlush;
private final boolean allowDuplicateHeaderNames;
/**
* Creates a customized CSV format.
@ -766,7 +770,7 @@ public final class CSVFormat implements Serializable {
final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord,
final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim,
final boolean trailingDelimiter, final boolean autoFlush) {
final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) {
this.delimiter = delimiter;
this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode;
@ -785,6 +789,7 @@ public final class CSVFormat implements Serializable {
this.trim = trim;
this.autoFlush = autoFlush;
this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
this.allowDuplicateHeaderNames = allowDuplicateHeaderNames;
validate();
}
@ -1686,7 +1691,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNames) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -1701,7 +1707,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withAutoFlush(final boolean autoFlush) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -1736,7 +1743,8 @@ public final class CSVFormat implements Serializable {
}
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -1754,7 +1762,8 @@ public final class CSVFormat implements Serializable {
}
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -1785,7 +1794,8 @@ public final class CSVFormat implements Serializable {
}
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escape, ignoreSurroundingSpaces,
ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -1941,7 +1951,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withHeader(final String... header) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -1962,7 +1973,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withHeaderComments(final Object... headerComments) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -1987,7 +1999,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2013,7 +2026,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2038,7 +2052,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2057,7 +2072,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withNullString(final String nullString) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2088,7 +2104,8 @@ public final class CSVFormat implements Serializable {
}
return new CSVFormat(delimiter, quoteChar, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces,
ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2102,7 +2119,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withQuoteMode(final QuoteMode quoteModePolicy) {
return new CSVFormat(delimiter, quoteCharacter, quoteModePolicy, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2140,7 +2158,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withRecordSeparator(final String recordSeparator) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2167,7 +2186,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2208,7 +2228,8 @@ public final class CSVFormat implements Serializable {
public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
/**
@ -2233,6 +2254,22 @@ public final class CSVFormat implements Serializable {
public CSVFormat withTrim(final boolean trim) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
public CSVFormat withAllowDuplicateHeaderNames(boolean allowDuplicateHeaderNames) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}
public CSVFormat withAllowDuplicateHeaderNames() {
return withAllowDuplicateHeaderNames(true);
}
public boolean getAllowDuplicateHeaderNames() {
return allowDuplicateHeaderNames;
}
}

View File

@ -410,8 +410,9 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
this.format = format;
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.csvRecordIterator = new CSVRecordIterator();
this.headerMap = createHeaderMap(); // 1st
this.headerNames = createHeaderNames(this.headerMap); // 2nd
Headers headers = createHeaders();
this.headerMap = headers.headerMap;
this.headerNames = headers.headerNames;
this.characterOffset = characterOffset;
this.recordNumber = recordNumber - 1;
}
@ -445,14 +446,35 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
new LinkedHashMap<>();
}
/**
* Header information based on name and position.
*/
private static final class Headers {
/**
* Header column positions (0-based)
*/
final Map<String, Integer> headerMap;
/**
* Header names in column order
*/
final List<String> headerNames;
Headers(Map<String, Integer> headerMap, List<String> headerNames) {
this.headerMap = headerMap;
this.headerNames = headerNames;
}
}
/**
* Creates the name to index mapping if the format defines a header.
*
* @return null if the format has no header.
* @throws IOException if there is a problem reading the header or skipping the first record
*/
private Map<String, Integer> createHeaderMap() throws IOException {
private Headers createHeaders() throws IOException {
Map<String, Integer> hdrMap = null;
List<String> headerNames = null;
final String[] formatHeader = this.format.getHeader();
if (formatHeader != null) {
hdrMap = createEmptyHeaderMap();
@ -476,27 +498,34 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
final String header = headerRecord[i];
final boolean containsHeader = header == null ? false : hdrMap.containsKey(header);
final boolean emptyHeader = header == null || header.trim().isEmpty();
if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
throw new IllegalArgumentException("The header contains a duplicate name: \"" + header
+ "\" in " + Arrays.toString(headerRecord));
if (containsHeader) {
if (!emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
throw new IllegalArgumentException(
String.format("The header contains a duplicate name: \"%s\" in %s."
+ " If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().",
header, Arrays.toString(headerRecord)));
}
if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
throw new IllegalArgumentException(
"A header name is missing in " + Arrays.toString(headerRecord));
}
}
if (header != null) {
hdrMap.put(header, Integer.valueOf(i));
if (headerNames == null) {
headerNames = new ArrayList<>(headerRecord.length);
}
headerNames.add(header);
}
}
}
}
if (headerNames == null) {
headerNames = Collections.emptyList(); //immutable
} else {
headerNames = Collections.unmodifiableList(headerNames);
}
return hdrMap;
}
private List<String> createHeaderNames(final Map<String, Integer> headerMap) {
// @formatter:off
return headerMap == null ? null
: headerMap.entrySet().stream()
.sorted(Map.Entry.comparingByValue())
.map(Map.Entry::getKey)
.collect(Collectors.collectingAndThen(Collectors.toList(), Collections::unmodifiableList));
// @formatter:on
return new Headers(hdrMap, headerNames);
}
/**

View File

@ -265,7 +265,7 @@ public final class CSVRecord implements Serializable, Iterable<String> {
}
/**
* Copies this record into a new Map. The new map is not connect
* Copies this record into a new Map of header name to record value.
*
* @return A new Map. The map is empty if the record has no headers.
*/

View File

@ -43,6 +43,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -292,10 +293,24 @@ public class CSVParserTest {
}
@Test(expected = IllegalArgumentException.class)
public void testDuplicateHeaders() throws Exception {
public void testDuplicateHeadersNotAllowed() throws Exception {
CSVParser.parse("a,b,a\n1,2,3\nx,y,z",
CSVFormat.DEFAULT.withHeader(new String[] {}).withAllowDuplicateHeaderNames(false));
}
@Test
public void testDuplicateHeadersAllowedByDefault() throws Exception {
CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader(new String[] {}));
}
@Test
public void testEmptyFileHeaderParsing() throws Exception {
try (final CSVParser parser = CSVParser.parse("", CSVFormat.DEFAULT.withFirstRecordAsHeader())) {
assertNull(parser.nextRecord());
assertTrue(parser.getHeaderNames().isEmpty());
}
}
@Test
public void testEmptyFile() throws Exception {
try (final CSVParser parser = CSVParser.parse("", CSVFormat.DEFAULT)) {
@ -1151,6 +1166,14 @@ public class CSVParserTest {
assertEquals("3", record.get("Z"));
Assert.assertEquals(3, record.size());
}
@Test
public void testRepeatedHeadersAreReturnedInCSVRecordHeaderNames() throws IOException {
final Reader in = new StringReader("header1,header2,header1\n1,2,3\n4,5,6");
final Iterator<CSVRecord> records = CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().parse(in).iterator();
final CSVRecord record = records.next();
assertEquals(Arrays.asList("header1", "header2", "header1"), record.getParser().getHeaderNames());
}
private void validateLineNumbers(final String lineSeparator) throws IOException {
try (final CSVParser parser = CSVParser.parse("a" + lineSeparator + "b" + lineSeparator + "c",