NIFI-8232 CSV Parsers optionally allow/reject duplicate header names

Signed-off-by: Pierre Villard <pierre.villard.fr@gmail.com>

This closes #4828.
This commit is contained in:
Chris Sampson 2021-02-17 21:45:32 +00:00 committed by Pierre Villard
parent 418e2cc2cb
commit 3cb26aec72
No known key found for this signature in database
GPG Key ID: F92A93B30C07C6D5
7 changed files with 201 additions and 57 deletions

View File

@ -33,7 +33,7 @@ import java.util.Map;
public class CSVUtils {
private static Logger LOG = LoggerFactory.getLogger(CSVUtils.class);
private static final Logger LOG = LoggerFactory.getLogger(CSVUtils.class);
public static final AllowableValue CUSTOM = new AllowableValue("custom", "Custom Format",
"The format of the CSV is configured by using the properties of this Controller Service, such as Value Separator");
@ -136,6 +136,20 @@ public class CSVUtils {
.defaultValue("UTF-8")
.required(true)
.build();
/**
 * Property that controls whether a CSV header row may contain the same column name more than once.
 * <p>
 * The value is restricted to the literals "true" and "false", defaults to "true", is optional,
 * and does not support Expression Language. The user-facing description explains how each
 * CSV parser treats duplicate names when they are allowed.
 */
public static final PropertyDescriptor ALLOW_DUPLICATE_HEADER_NAMES = new PropertyDescriptor.Builder()
.name("csvutils-allow-duplicate-header-names")
.displayName("Allow Duplicate Header Names")
.description("Whether duplicate header names are allowed. Header names are case-sensitive, for example \"name\" and \"Name\" are treated as separate fields. " +
"Handling of duplicate header names is CSV Parser specific (where applicable):\n" +
"* Apache Commons CSV - duplicate headers will result in column data \"shifting\" right with new fields " +
"created for \"unknown_field_index_X\" where \"X\" is the CSV column index number\n" +
"* Jackson CSV - duplicate headers will be de-duplicated with the field value being that of the right-most " +
"duplicate CSV column")
.expressionLanguageSupported(ExpressionLanguageScope.NONE)
.allowableValues("true", "false")
.defaultValue("true")
.required(false)
.build();
// CSV Format fields for writers only
public static final AllowableValue QUOTE_ALL = new AllowableValue("ALL", "Quote All Values", "All values will be quoted using the configured quote character.");
@ -177,6 +191,10 @@ public class CSVUtils {
.required(true)
.build();
/**
 * Private so that this class cannot be instantiated from outside; the members visible in this
 * file are all static.
 */
private CSVUtils() {
// intentionally blank, prevents instantiation
}
public static boolean isDynamicCSVFormat(final PropertyContext context) {
final String formatName = context.getProperty(CSV_FORMAT).getValue();
return formatName.equalsIgnoreCase(CUSTOM.getValue())
@ -208,8 +226,8 @@ public class CSVUtils {
}
}
private static Character getCharUnescapedJava(final PropertyContext context, final PropertyDescriptor property, final Map<String, String> variables) {
String value = context.getProperty(property).evaluateAttributeExpressions(variables).getValue();
private static Character getValueSeparatorCharUnescapedJava(final PropertyContext context, final Map<String, String> variables) {
String value = context.getProperty(VALUE_SEPARATOR).evaluateAttributeExpressions(variables).getValue();
if (value != null) {
String unescaped = unescapeJava(value);
@ -218,13 +236,9 @@ public class CSVUtils {
}
}
LOG.warn("'{}' property evaluated to an invalid value: \"{}\". It must be a single character. The property value will be ignored.", property.getName(), value);
LOG.warn("'{}' property evaluated to an invalid value: \"{}\". It must be a single character. The property value will be ignored.", VALUE_SEPARATOR.getName(), value);
if (property.getDefaultValue() != null) {
return property.getDefaultValue().charAt(0);
} else {
return null;
}
return VALUE_SEPARATOR.getDefaultValue().charAt(0);
}
private static Character getCharUnescaped(final PropertyContext context, final PropertyDescriptor property, final Map<String, String> variables) {
@ -247,7 +261,7 @@ public class CSVUtils {
}
private static CSVFormat buildCustomFormat(final PropertyContext context, final Map<String, String> variables) {
final Character valueSeparator = getCharUnescapedJava(context, VALUE_SEPARATOR, variables);
final Character valueSeparator = getValueSeparatorCharUnescapedJava(context, variables);
CSVFormat format = CSVFormat.newFormat(valueSeparator)
.withAllowMissingColumnNames()
.withIgnoreEmptyLines();
@ -293,6 +307,11 @@ public class CSVUtils {
format = format.withRecordSeparator(separator);
}
final PropertyValue allowDuplicateHeaderNames = context.getProperty(ALLOW_DUPLICATE_HEADER_NAMES);
if (allowDuplicateHeaderNames != null && allowDuplicateHeaderNames.isSet()) {
format = format.withAllowDuplicateHeaderNames(allowDuplicateHeaderNames.asBoolean());
}
return format;
}
@ -306,7 +325,7 @@ public class CSVUtils {
public static String unescape(final String input) {
if (input == null) {
return input;
return null;
}
return input.replace("\\t", "\t")

View File

@ -36,7 +36,7 @@ public class CSVUtilsTest {
@Test
public void testIsDynamicCSVFormatWithStaticProperties() {
PropertyContext context = createContext("|", "'", "^", "~");
PropertyContext context = createContext("|", "'", "^", "~", "true");
boolean isDynamicCSVFormat = CSVUtils.isDynamicCSVFormat(context);
@ -45,7 +45,7 @@ public class CSVUtilsTest {
@Test
public void testIsDynamicCSVFormatWithDynamicValueSeparator() {
PropertyContext context = createContext("${csv.delimiter}", "'", "^", "~");
PropertyContext context = createContext("${csv.delimiter}", "'", "^", "~", "true");
boolean isDynamicCSVFormat = CSVUtils.isDynamicCSVFormat(context);
@ -54,7 +54,7 @@ public class CSVUtilsTest {
@Test
public void testIsDynamicCSVFormatWithDynamicQuoteCharacter() {
PropertyContext context = createContext("|", "${csv.quote}", "^", "~");
PropertyContext context = createContext("|", "${csv.quote}", "^", "~", "true");
boolean isDynamicCSVFormat = CSVUtils.isDynamicCSVFormat(context);
@ -63,7 +63,7 @@ public class CSVUtilsTest {
@Test
public void testIsDynamicCSVFormatWithDynamicEscapeCharacter() {
PropertyContext context = createContext("|", "'", "${csv.escape}", "~");
PropertyContext context = createContext("|", "'", "${csv.escape}", "~", "true");
boolean isDynamicCSVFormat = CSVUtils.isDynamicCSVFormat(context);
@ -72,7 +72,7 @@ public class CSVUtilsTest {
@Test
public void testIsDynamicCSVFormatWithDynamicCommentMarker() {
PropertyContext context = createContext("|", "'", "^", "${csv.comment}");
PropertyContext context = createContext("|", "'", "^", "${csv.comment}", "true");
boolean isDynamicCSVFormat = CSVUtils.isDynamicCSVFormat(context);
@ -81,7 +81,7 @@ public class CSVUtilsTest {
@Test
public void testCustomFormat() {
PropertyContext context = createContext("|", "'", "^", "~");
PropertyContext context = createContext("|", "'", "^", "~", "true");
CSVFormat csvFormat = CSVUtils.createCSVFormat(context, Collections.emptyMap());
@ -89,11 +89,12 @@ public class CSVUtilsTest {
assertEquals('\'', (char) csvFormat.getQuoteCharacter());
assertEquals('^', (char) csvFormat.getEscapeCharacter());
assertEquals('~', (char) csvFormat.getCommentMarker());
assertTrue(csvFormat.getAllowDuplicateHeaderNames());
}
@Test
public void testCustomFormatWithEL() {
PropertyContext context = createContext("${csv.delimiter}", "${csv.quote}", "${csv.escape}", "${csv.comment}");
PropertyContext context = createContext("${csv.delimiter}", "${csv.quote}", "${csv.escape}", "${csv.comment}", "false");
Map<String, String> attributes = new HashMap<>();
attributes.put("csv.delimiter", "|");
@ -107,11 +108,12 @@ public class CSVUtilsTest {
assertEquals('\'', (char) csvFormat.getQuoteCharacter());
assertEquals('^', (char) csvFormat.getEscapeCharacter());
assertEquals('~', (char) csvFormat.getCommentMarker());
assertFalse(csvFormat.getAllowDuplicateHeaderNames());
}
@Test
public void testCustomFormatWithELEmptyValues() {
PropertyContext context = createContext("${csv.delimiter}", "${csv.quote}", "${csv.escape}", "${csv.comment}");
PropertyContext context = createContext("${csv.delimiter}", "${csv.quote}", "${csv.escape}", "${csv.comment}", "true");
CSVFormat csvFormat = CSVUtils.createCSVFormat(context, Collections.emptyMap());
@ -123,7 +125,7 @@ public class CSVUtilsTest {
@Test
public void testCustomFormatWithELInvalidValues() {
PropertyContext context = createContext("${csv.delimiter}", "${csv.quote}", "${csv.escape}", "${csv.comment}");
PropertyContext context = createContext("${csv.delimiter}", "${csv.quote}", "${csv.escape}", "${csv.comment}", "true");
Map<String, String> attributes = new HashMap<>();
attributes.put("csv.delimiter", "invalid");
@ -139,13 +141,14 @@ public class CSVUtilsTest {
assertNull(csvFormat.getCommentMarker());
}
private PropertyContext createContext(String valueSeparator, String quoteChar, String escapeChar, String commentMarker) {
private PropertyContext createContext(String valueSeparator, String quoteChar, String escapeChar, String commentMarker, String allowDuplicateHeaderNames) {
Map<PropertyDescriptor, String> properties = new HashMap<>();
properties.put(CSVUtils.VALUE_SEPARATOR, valueSeparator);
properties.put(CSVUtils.QUOTE_CHAR, quoteChar);
properties.put(CSVUtils.ESCAPE_CHAR, escapeChar);
properties.put(CSVUtils.COMMENT_MARKER, commentMarker);
properties.put(CSVUtils.ALLOW_DUPLICATE_HEADER_NAMES, allowDuplicateHeaderNames);
return new MockConfigurationContext(properties, null);
}

View File

@ -110,6 +110,7 @@ public class CSVReader extends SchemaRegistryService implements RecordReaderFact
properties.add(CSVUtils.NULL_STRING);
properties.add(CSVUtils.TRIM_FIELDS);
properties.add(CSVUtils.CHARSET);
properties.add(CSVUtils.ALLOW_DUPLICATE_HEADER_NAMES);
return properties;
}
@ -146,17 +147,17 @@ public class CSVReader extends SchemaRegistryService implements RecordReaderFact
final RecordSchema schema = getSchema(variables, new NonCloseableInputStream(in), null);
in.reset();
CSVFormat csvFormat;
final CSVFormat format;
if (this.csvFormat != null) {
csvFormat = this.csvFormat;
format = this.csvFormat;
} else {
csvFormat = CSVUtils.createCSVFormat(context, variables);
format = CSVUtils.createCSVFormat(context, variables);
}
if(APACHE_COMMONS_CSV.getValue().equals(csvParser)) {
return new CSVRecordReader(in, logger, schema, csvFormat, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
} else if(JACKSON_CSV.getValue().equals(csvParser)) {
return new JacksonCSVRecordReader(in, logger, schema, csvFormat, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
if (APACHE_COMMONS_CSV.getValue().equals(csvParser)) {
return new CSVRecordReader(in, logger, schema, format, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
} else if (JACKSON_CSV.getValue().equals(csvParser)) {
return new JacksonCSVRecordReader(in, logger, schema, format, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
} else {
throw new IOException("Parser not supported");
}

View File

@ -24,9 +24,11 @@ import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.io.input.BOMInputStream;
@ -49,6 +51,7 @@ import com.fasterxml.jackson.dataformat.csv.CsvSchema;
public class JacksonCSVRecordReader extends AbstractCSVRecordReader {
private final MappingIterator<String[]> recordStream;
private List<String> rawFieldNames = null;
private boolean allowDuplicateHeaderNames;
private volatile static CsvMapper mapper = new CsvMapper().enable(CsvParser.Feature.WRAP_AS_ARRAY);
@ -75,6 +78,7 @@ public class JacksonCSVRecordReader extends AbstractCSVRecordReader {
csvSchemaBuilder = csvSchemaBuilder.setSkipFirstDataRow(true);
}
}
allowDuplicateHeaderNames = csvFormat.getAllowDuplicateHeaderNames();
CsvSchema csvSchema = csvSchemaBuilder.build();
@ -108,6 +112,17 @@ public class JacksonCSVRecordReader extends AbstractCSVRecordReader {
rawFieldNames = schema.getFieldNames();
} else {
rawFieldNames = Arrays.asList(csvRecord);
if (rawFieldNames.size() > schema.getFieldCount() && !allowDuplicateHeaderNames) {
final Set<String> deDupe = new HashSet<>(schema.getFieldCount());
for (final String name : rawFieldNames) {
if (!deDupe.add(name)) {
throw new IllegalArgumentException(String.format(
"The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().",
name, rawFieldNames
));
}
}
}
// Advance the stream to keep the record count correct
if (recordStream.hasNext()) {

View File

@ -66,9 +66,37 @@ public class TestCSVHeaderSchemaStrategy {
.allMatch(field -> field.getDataType().equals(RecordFieldType.STRING.getDataType())));
}
/**
 * Feeds the header row {@code "a, a, b"} to a {@link CSVHeaderSchemaStrategy} built from a custom
 * CSV format and expects the resulting schema to list the field names "a" and "b", in that
 * order, with every field typed as STRING. The allow-duplicate-header-names property is not
 * set in the configured properties.
 */
@Test
public void testContainsDuplicateHeaderNames() throws SchemaNotFoundException, IOException {
    // Custom CSV format: comma separated, trimmed, double-quote quoting, backslash escaping.
    final Map<PropertyDescriptor, String> config = new HashMap<>();
    config.put(CSVUtils.CSV_FORMAT, CSVUtils.CUSTOM.getValue());
    config.put(CSVUtils.COMMENT_MARKER, "#");
    config.put(CSVUtils.VALUE_SEPARATOR, ",");
    config.put(CSVUtils.TRIM_FIELDS, "true");
    config.put(CSVUtils.QUOTE_CHAR, "\"");
    config.put(CSVUtils.ESCAPE_CHAR, "\\");

    final ConfigurationContext configurationContext = new MockConfigurationContext(config, null);
    final CSVHeaderSchemaStrategy headerStrategy = new CSVHeaderSchemaStrategy(configurationContext);

    // The header row repeats the name "a".
    final byte[] headerRow = "a, a, b".getBytes();
    final RecordSchema derivedSchema;
    try (final InputStream headerStream = new ByteArrayInputStream(headerRow)) {
        derivedSchema = headerStrategy.getSchema(null, headerStream, null);
    }

    assertEquals(Arrays.asList("a", "b"), derivedSchema.getFieldNames());

    final boolean everyFieldIsString = derivedSchema.getFields().stream()
            .map(field -> field.getDataType())
            .allMatch(fieldType -> fieldType.equals(RecordFieldType.STRING.getDataType()));
    assertTrue(everyFieldIsString);
}
@Test
public void testWithEL() throws SchemaNotFoundException, IOException {
final String headerLine = "\'a\'; b; c; d; e^;z; f";
final String headerLine = "'a'; b; c; d; e^;z; f";
final byte[] headerBytes = headerLine.getBytes();
final Map<PropertyDescriptor, String> properties = new HashMap<>();

View File

@ -32,7 +32,6 @@ import org.junit.Test;
import org.mockito.Mockito;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
@ -53,6 +52,7 @@ import java.util.TimeZone;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThrows;
public class TestCSVRecordReader {
private final DataType doubleDataType = RecordFieldType.DOUBLE.getDataType();
@ -71,12 +71,6 @@ public class TestCSVRecordReader {
RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(), "ASCII");
}
private CSVRecordReader createReader(final InputStream in, final RecordSchema schema, CSVFormat format,
final String dateFormat, final String timeFormat, final String timestampFormat) throws IOException {
return new CSVRecordReader(in, Mockito.mock(ComponentLog.class), schema, format, true, false,
dateFormat, timeFormat, timestampFormat, "ASCII");
}
@Test
public void testUTF8() throws IOException, MalformedRecordException {
final String text = "name\n黃凱揚";
@ -178,7 +172,7 @@ public class TestCSVRecordReader {
final Record record = reader.nextRecord(false, false);
// When the values are not in the expected format, a String is returned unmodified
assertEquals("11/30/1983", (String)record.getValue("date"));
assertEquals("11/30/1983", record.getValue("date"));
}
}
@ -195,7 +189,7 @@ public class TestCSVRecordReader {
null, RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(), "UTF-8")) {
final Record record = reader.nextRecord(false, false);
assertEquals("1983-01-01", (String)record.getValue("date"));
assertEquals("1983-01-01", record.getValue("date"));
}
}
@ -212,7 +206,7 @@ public class TestCSVRecordReader {
"", RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(), "UTF-8")) {
final Record record = reader.nextRecord(false, false);
assertEquals("1983-01-01", (String)record.getValue("date"));
assertEquals("1983-01-01", record.getValue("date"));
}
}
@ -252,7 +246,7 @@ public class TestCSVRecordReader {
RecordFieldType.DATE.getDefaultFormat(), "HH-MM-SS", RecordFieldType.TIMESTAMP.getDefaultFormat(), "UTF-8")) {
final Record record = reader.nextRecord(false, false);
assertEquals("01:02:03", (String)record.getValue("time"));
assertEquals("01:02:03", record.getValue("time"));
}
}
@ -269,7 +263,7 @@ public class TestCSVRecordReader {
RecordFieldType.DATE.getDefaultFormat(), null, RecordFieldType.TIMESTAMP.getDefaultFormat(), "UTF-8")) {
final Record record = reader.nextRecord(false, false);
assertEquals("01:02:03", (String)record.getValue("time"));
assertEquals("01:02:03", record.getValue("time"));
}
}
@ -286,7 +280,7 @@ public class TestCSVRecordReader {
RecordFieldType.DATE.getDefaultFormat(), "", RecordFieldType.TIMESTAMP.getDefaultFormat(), "UTF-8")) {
final Record record = reader.nextRecord(false, false);
assertEquals("01:02:03", (String)record.getValue("time"));
assertEquals("01:02:03", record.getValue("time"));
}
}
@ -326,7 +320,7 @@ public class TestCSVRecordReader {
RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), "HH-MM-SS", "UTF-8")) {
final Record record = reader.nextRecord(false, false);
assertEquals("01:02:03", (String)record.getValue("timestamp"));
assertEquals("01:02:03", record.getValue("timestamp"));
}
}
@ -338,7 +332,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/single-bank-account.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/single-bank-account.csv");
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] record = reader.nextRecord().getValues();
@ -351,7 +345,7 @@ public class TestCSVRecordReader {
@Test
public void testExcelFormat() throws IOException, MalformedRecordException {
final List<RecordField> fields = new ArrayList<RecordField>();
final List<RecordField> fields = new ArrayList<>();
fields.add(new RecordField("fieldA", RecordFieldType.STRING.getDataType()));
fields.add(new RecordField("fieldB", RecordFieldType.STRING.getDataType()));
final RecordSchema schema = new SimpleRecordSchema(fields);
@ -379,7 +373,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account.csv");
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
@ -401,7 +395,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/extra-white-space.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/extra-white-space.csv");
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
@ -558,7 +552,6 @@ public class TestCSVRecordReader {
assertNull(reader.nextRecord());
}
}
@Test
@ -592,6 +585,50 @@ public class TestCSVRecordReader {
}
}
/**
 * Reads a CSV whose header row repeats "id" and "name" with the Apache Commons CSV based reader.
 * <p>
 * With the shared {@code format}, the record is expected to carry the data values against
 * later field names than their own columns ("Another ID" under "name", "John" under "balance",
 * and so on), with the last two values landing in "unknown_field_index_9" and
 * "unknown_field_index_10". With a format that disallows duplicate header names, creating the
 * reader is expected to fail with an {@link IllegalArgumentException}.
 */
@Test
public void testDuplicateHeaderNames() throws IOException, MalformedRecordException {
    final RecordSchema schema = new SimpleRecordSchema(getDefaultFields());

    final String headerRow = "id, id, name, name, balance, BALANCE, address, city, state, zipCode, country";
    final String dataRow = "1, Another ID, John, Smith, 40.80, 10.20, 123 My Street, My City, MS, 11111, USA";
    final byte[] csvBytes = (headerRow + "\n" + dataRow).getBytes();

    // Field name -> expected value when duplicate header names are tolerated.
    final String[][] expectedValues = {
        {"id", "1"},
        {"name", "Another ID"},
        {"balance", "John"},
        {"BALANCE", "Smith"},
        {"address", "40.80"},
        {"city", "10.20"},
        {"state", "123 My Street"},
        {"zipCode", "My City"},
        {"country", "MS"},
        {"unknown_field_index_9", "11111"},
        {"unknown_field_index_10", "USA"},
    };

    try (final InputStream in = new ByteArrayInputStream(csvBytes);
         final CSVRecordReader reader = createReader(in, schema, format)) {

        final Record record = reader.nextRecord(false, false);
        assertNotNull(record);
        for (final String[] expected : expectedValues) {
            assertEquals(expected[1], record.getValue(expected[0]));
        }

        // Only one data row was supplied.
        assertNull(reader.nextRecord(false, false));
    }

    // Same input, but with duplicate header names disallowed: reader creation must fail.
    final CSVFormat strictFormat = CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().withQuote('"').withAllowDuplicateHeaderNames(false);
    try (final InputStream in = new ByteArrayInputStream(csvBytes)) {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class, () -> createReader(in, schema, strictFormat));
        final String expectedMessage =
                "The header contains a duplicate name: \"id\" in [id, id, name, name, balance, BALANCE, address, city, state, zipCode, country]. " +
                "If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().";
        assertEquals(expectedMessage, thrown.getMessage());
    }
}
@Test
public void testMultipleRecordsEscapedWithSpecialChar() throws IOException, MalformedRecordException {
@ -603,7 +640,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account_escapedchar.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account_escapedchar.csv");
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();

View File

@ -32,7 +32,6 @@ import org.junit.Test;
import org.mockito.Mockito;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
@ -45,6 +44,7 @@ import java.util.TimeZone;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThrows;
public class TestJacksonCSVRecordReader {
private final DataType doubleDataType = RecordFieldType.DOUBLE.getDataType();
@ -113,7 +113,7 @@ public class TestJacksonCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/single-bank-account.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/single-bank-account.csv");
final JacksonCSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] record = reader.nextRecord().getValues();
@ -126,7 +126,7 @@ public class TestJacksonCSVRecordReader {
@Test
public void testExcelFormat() throws IOException, MalformedRecordException {
final List<RecordField> fields = new ArrayList<RecordField>();
final List<RecordField> fields = new ArrayList<>();
fields.add(new RecordField("fieldA", RecordFieldType.STRING.getDataType()));
fields.add(new RecordField("fieldB", RecordFieldType.STRING.getDataType()));
final RecordSchema schema = new SimpleRecordSchema(fields);
@ -154,7 +154,7 @@ public class TestJacksonCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account.csv");
final JacksonCSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
@ -176,7 +176,7 @@ public class TestJacksonCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/extra-white-space.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/extra-white-space.csv");
final JacksonCSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
@ -333,7 +333,6 @@ public class TestJacksonCSVRecordReader {
assertNull(reader.nextRecord());
}
}
@Test
@ -367,6 +366,48 @@ public class TestJacksonCSVRecordReader {
}
}
/**
 * Reads a CSV whose header row repeats "id" and "name" with the Jackson based reader.
 * <p>
 * With the shared {@code format}, each duplicated name is expected to carry the value of its
 * right-most column ("Another ID" for "id", "Smith" for "name") and the remaining fields keep
 * the values from their own columns. With a format that disallows duplicate header names,
 * reading the first record is expected to fail with an {@link IllegalArgumentException}.
 */
@Test
public void testDuplicateHeaderNames() throws IOException, MalformedRecordException {
    final RecordSchema schema = new SimpleRecordSchema(getDefaultFields());

    final String headerRow = "id, id, name, name, balance, BALANCE, address, city, state, zipCode, country";
    final String dataRow = "1, Another ID, John, Smith, 40.80, 10.20, 123 My Street, My City, MS, 11111, USA";
    final byte[] csvBytes = (headerRow + "\n" + dataRow).getBytes();

    // Field name -> expected value when duplicate header names are tolerated.
    final String[][] expectedValues = {
        {"id", "Another ID"},
        {"name", "Smith"},
        {"balance", "40.80"},
        {"address", "123 My Street"},
        {"city", "My City"},
        {"state", "MS"},
        {"zipCode", "11111"},
        {"country", "USA"},
    };

    try (final InputStream in = new ByteArrayInputStream(csvBytes);
         final JacksonCSVRecordReader reader = createReader(in, schema, format)) {

        final Record record = reader.nextRecord(false, false);
        assertNotNull(record);
        for (final String[] expected : expectedValues) {
            assertEquals(expected[1], record.getValue(expected[0]));
        }

        // Only one data row was supplied.
        assertNull(reader.nextRecord(false, false));
    }

    // Same input, but with duplicate header names disallowed: the first read must fail.
    final CSVFormat strictFormat = CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().withQuote('"').withAllowDuplicateHeaderNames(false);
    try (final InputStream in = new ByteArrayInputStream(csvBytes);
         final JacksonCSVRecordReader reader = createReader(in, schema, strictFormat)) {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class, () -> reader.nextRecord(false, false));
        final String expectedMessage =
                "The header contains a duplicate name: \"id\" in [id, id, name, name, balance, BALANCE, address, city, state, zipCode, country]. " +
                "If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().";
        assertEquals(expectedMessage, thrown.getMessage());
    }
}
@Test
public void testMultipleRecordsEscapedWithSpecialChar() throws IOException, MalformedRecordException {
@ -378,7 +419,7 @@ public class TestJacksonCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account_escapedchar.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account_escapedchar.csv");
final JacksonCSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
@ -400,7 +441,7 @@ public class TestJacksonCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/single-bank-account.csv"));
try (final InputStream fis = new FileInputStream("src/test/resources/csv/single-bank-account.csv");
final JacksonCSVRecordReader reader = createReader(fis, schema, formatWithNullRecordSeparator)) {
final Object[] record = reader.nextRecord().getValues();