[CSV-239] Add CSVRecord.getHeaderNames and allow duplicate headers (#41)

* [CSV-239] Cannot get headers in column order from CSVRecord.
* getHeaderNames returns all headers in column order including repeats which are allowed as per RFC 4180
* add CSVFormat.withAllowDuplicateHeaderNames()
* [CSV-239] Cannot get headers in column order from CSVRecord.
* only wrap headerNames with unmodifiableList if non-empty
* fix and enhance CSVRecord.toMap javadoc
* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix exception messages
* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix whitespace
* [CSV-239] Cannot get headers in column order from CSVRecord.
* simplify if statement
* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix indentation
* add javadoc to Headers class
* rename method to createHeaders
* use String.format to build error message
* initialize header names List with appropriate size
2019-05-24 22:11:17 +10:00 · 2019-05-24 22:11:17 +10:00 · 030fb8e37c
parent 4d2616b7a5
commit 030fb8e37c
4 changed files with 128 additions and 39 deletions
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@ -260,12 +260,13 @@ public final class CSVFormat implements Serializable {
     * <li>{@code withQuote('"')}</li>
     * <li>{@code withRecordSeparator("\r\n")}</li>
     * <li>{@code withIgnoreEmptyLines(true)}</li>
+     * <li>{@code withAllowDuplicateHeaderNames(true)}</li>
     * </ul>
     *
     * @see Predefined#Default
     */
    public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF,
-            null, null, null, false, false, false, false, false, false);
+            null, null, null, false, false, false, false, false, false, true);

    /**
     * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is
@ -288,6 +289,7 @@ public final class CSVFormat implements Serializable {
     * <li>{@code {@link #withRecordSeparator(String) withRecordSeparator("\r\n")}}</li>
     * <li>{@code {@link #withIgnoreEmptyLines(boolean) withIgnoreEmptyLines(false)}}</li>
     * <li>{@code {@link #withAllowMissingColumnNames(boolean) withAllowMissingColumnNames(true)}}</li>
+     * <li>{@code {@link #withAllowDuplicateHeaderNames(boolean) withAllowDuplicateHeaderNames(true)}}</li>
     * </ul>
     * <p>
     * Note: This is currently like {@link #RFC4180} plus {@link #withAllowMissingColumnNames(boolean)
@ -671,7 +673,7 @@ public final class CSVFormat implements Serializable {
     */
    public static CSVFormat newFormat(final char delimiter) {
        return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false,
-                false, false, false, false);
+                false, false, false, false, true);
    }

    /**
@ -721,6 +723,8 @@ public final class CSVFormat implements Serializable {
    private final boolean trim;

    private final boolean autoFlush;
+    
+    private final boolean allowDuplicateHeaderNames;

    /**
     * Creates a customized CSV format.
@ -766,7 +770,7 @@ public final class CSVFormat implements Serializable {
            final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
            final Object[] headerComments, final String[] header, final boolean skipHeaderRecord,
            final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim,
-            final boolean trailingDelimiter, final boolean autoFlush) {
+            final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) {
        this.delimiter = delimiter;
        this.quoteCharacter = quoteChar;
        this.quoteMode = quoteMode;
@ -785,6 +789,7 @@ public final class CSVFormat implements Serializable {
        this.trim = trim;
        this.autoFlush = autoFlush;
        this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
+        this.allowDuplicateHeaderNames = allowDuplicateHeaderNames;
        validate();
    }

@ -1686,7 +1691,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNames) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, 
+                allowDuplicateHeaderNames);
    }

    /**
@ -1701,7 +1707,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withAutoFlush(final boolean autoFlush) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -1736,7 +1743,8 @@ public final class CSVFormat implements Serializable {
        }
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -1754,7 +1762,8 @@ public final class CSVFormat implements Serializable {
        }
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -1785,7 +1794,8 @@ public final class CSVFormat implements Serializable {
        }
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escape, ignoreSurroundingSpaces,
                ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
-                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -1941,7 +1951,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withHeader(final String... header) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -1962,7 +1973,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withHeaderComments(final Object... headerComments) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -1987,7 +1999,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2013,7 +2026,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2038,7 +2052,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2057,7 +2072,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withNullString(final String nullString) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2088,7 +2104,8 @@ public final class CSVFormat implements Serializable {
        }
        return new CSVFormat(delimiter, quoteChar, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces,
                ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
-                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2102,7 +2119,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withQuoteMode(final QuoteMode quoteModePolicy) {
        return new CSVFormat(delimiter, quoteCharacter, quoteModePolicy, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2140,7 +2158,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withRecordSeparator(final String recordSeparator) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2167,7 +2186,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2208,7 +2228,8 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }

    /**
@ -2233,6 +2254,22 @@ public final class CSVFormat implements Serializable {
    public CSVFormat withTrim(final boolean trim) {
        return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
    }
+    
+    public CSVFormat withAllowDuplicateHeaderNames(boolean allowDuplicateHeaderNames) {
+    	return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
+                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
+    }
+
+    public CSVFormat withAllowDuplicateHeaderNames() {
+    	return withAllowDuplicateHeaderNames(true);
+    }
+    
+    public boolean getAllowDuplicateHeaderNames() {
+		return allowDuplicateHeaderNames;
+	}
 }
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@ -410,8 +410,9 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
        this.format = format;
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
        this.csvRecordIterator = new CSVRecordIterator();
-        this.headerMap = createHeaderMap(); // 1st
-        this.headerNames = createHeaderNames(this.headerMap); // 2nd
+        Headers headers = createHeaders();
+        this.headerMap = headers.headerMap;
+        this.headerNames = headers.headerNames;
        this.characterOffset = characterOffset;
        this.recordNumber = recordNumber - 1;
    }
@ -445,14 +446,35 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
                new LinkedHashMap<>();
    }

+    /**
+     * Header information based on name and position.
+     */
+    private static final class Headers {
+        /**
+         * Maps each header name to its column position (0-based).
+         */
+        final Map<String, Integer> headerMap;
+        
+        /**
+         * Header names in column order, including repeated names.
+         */
+        final List<String> headerNames;
+
+        Headers(Map<String, Integer> headerMap, List<String> headerNames) {
+            this.headerMap = headerMap;
+            this.headerNames = headerNames;
+        }
+    }
+    
    /**
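     * Creates the header information (the name to index mapping and the header names in column order) if the
     * format defines a header.
     *
     * @return the header information; its map is null and its list of names is empty if the format has no header.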
     * @throws IOException if there is a problem reading the header or skipping the first record
     */
-    private Map<String, Integer> createHeaderMap() throws IOException {
+    private Headers createHeaders() throws IOException {
        Map<String, Integer> hdrMap = null;
+        List<String> headerNames = null;
        final String[] formatHeader = this.format.getHeader();
        if (formatHeader != null) {
            hdrMap = createEmptyHeaderMap();
@ -476,27 +498,34 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
                    final String header = headerRecord[i];
                    final boolean containsHeader = header == null ? false : hdrMap.containsKey(header);
                    final boolean emptyHeader = header == null || header.trim().isEmpty();
-                    if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
-                        throw new IllegalArgumentException("The header contains a duplicate name: \"" + header
-                                + "\" in " + Arrays.toString(headerRecord));
+                    if (containsHeader) {
+                        if (!emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
+                            throw new IllegalArgumentException(
+                                    String.format("The header contains a duplicate name: \"%s\" in %s."
+                                        + " If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", 
+                                        header, Arrays.toString(headerRecord)));
+                        }
+                        if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
+                            throw new IllegalArgumentException(
+                                    "A header name is missing in " + Arrays.toString(headerRecord));
+                        }
                    }
                    if (header != null) {
                        hdrMap.put(header, Integer.valueOf(i));
+                        if (headerNames == null) {
+                        	headerNames = new ArrayList<>(headerRecord.length);
+                        }
+                        headerNames.add(header);
                    }
                }
            }
+        } 
+        if (headerNames == null) {
+        	headerNames = Collections.emptyList(); //immutable
+        } else {
+        	headerNames = Collections.unmodifiableList(headerNames);
        }
-        return hdrMap;
-    }
-
-    private List<String> createHeaderNames(final Map<String, Integer> headerMap) {
-        // @formatter:off
-        return headerMap == null ? null
-            : headerMap.entrySet().stream()
-                .sorted(Map.Entry.comparingByValue())
-                .map(Map.Entry::getKey)
-                .collect(Collectors.collectingAndThen(Collectors.toList(), Collections::unmodifiableList));
-        // @formatter:on
+        return new Headers(hdrMap, headerNames);
    }

    /**
--- a/src/main/java/org/apache/commons/csv/CSVRecord.java
+++ b/src/main/java/org/apache/commons/csv/CSVRecord.java
@ -265,7 +265,7 @@ public final class CSVRecord implements Serializable, Iterable<String> {
    }

    /**
-     * Copies this record into a new Map. The new map is not connect
+     * Copies this record into a new Map of header name to record value.
     *
     * @return A new Map. The map is empty if the record has no headers.
     */
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@ -43,6 +43,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@ -292,10 +293,24 @@ public class CSVParserTest {
    }

    @Test(expected = IllegalArgumentException.class)
-    public void testDuplicateHeaders() throws Exception {
+    public void testDuplicateHeadersNotAllowed() throws Exception {
+        CSVParser.parse("a,b,a\n1,2,3\nx,y,z",
+                CSVFormat.DEFAULT.withHeader(new String[] {}).withAllowDuplicateHeaderNames(false));
+    }
+
+    @Test
+    public void testDuplicateHeadersAllowedByDefault() throws Exception {
        CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader(new String[] {}));
    }

+    @Test
+    public void testEmptyFileHeaderParsing() throws Exception {
+        try (final CSVParser parser = CSVParser.parse("", CSVFormat.DEFAULT.withFirstRecordAsHeader())) {
+            assertNull(parser.nextRecord());
+            assertTrue(parser.getHeaderNames().isEmpty());
+        }
+    }
+    
    @Test
    public void testEmptyFile() throws Exception {
        try (final CSVParser parser = CSVParser.parse("", CSVFormat.DEFAULT)) {
@ -1151,6 +1166,14 @@ public class CSVParserTest {
        assertEquals("3", record.get("Z"));
        Assert.assertEquals(3, record.size());
    }
+    
+    @Test
+    public void testRepeatedHeadersAreReturnedInCSVRecordHeaderNames() throws IOException {
+        final Reader in = new StringReader("header1,header2,header1\n1,2,3\n4,5,6");
+        final Iterator<CSVRecord> records = CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().parse(in).iterator();
+        final CSVRecord record = records.next();
+        assertEquals(Arrays.asList("header1", "header2", "header1"), record.getParser().getHeaderNames());
+    }

    private void validateLineNumbers(final String lineSeparator) throws IOException {
        try (final CSVParser parser = CSVParser.parse("a" + lineSeparator + "b" + lineSeparator + "c",