NIFI-9884 - JacksonCSVRecordReader ignores specified encoding

NIFI-9884 - Fix JacksonCSVRecordReader ignoring the specified encoding; add a test case for ISO-8859-1

Signed-off-by: Matthew Burgess <mattyb149@apache.org>

This closes #5941
This commit is contained in:
Paul Grey 2022-04-06 12:46:06 -04:00 committed by Matthew Burgess
parent 1cf4e72084
commit 2c83149c6d
No known key found for this signature in database
GPG Key ID: 05D3DEB8126DAD24
2 changed files with 27 additions and 2 deletions

View File

@ -59,7 +59,7 @@ public class JacksonCSVRecordReader extends AbstractCSVRecordReader {
final String dateFormat, final String timeFormat, final String timestampFormat, final String encoding) throws IOException {
super(logger, schema, hasHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat);
final Reader reader = new InputStreamReader(new BOMInputStream(in));
final Reader reader = new InputStreamReader(new BOMInputStream(in), encoding);
CsvSchema.Builder csvSchemaBuilder = CsvSchema.builder()
.setColumnSeparator(csvFormat.getDelimiter())

View File

@ -34,6 +34,7 @@ import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@ -69,7 +70,7 @@ public class TestJacksonCSVRecordReader {
fields.add(new RecordField("name", RecordFieldType.STRING.getDataType()));
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream bais = new ByteArrayInputStream(text.getBytes());
try (final InputStream bais = new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
final JacksonCSVRecordReader reader = new JacksonCSVRecordReader(bais, Mockito.mock(ComponentLog.class), schema, format, true, false,
RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(), "UTF-8")) {
@ -80,6 +81,30 @@ public class TestJacksonCSVRecordReader {
}
}
/**
 * Feeds the reader a CSV document encoded as ISO-8859-1, passes ISO-8859-1 as the
 * encoding argument, and expects the non-ASCII value of the "name" column to be
 * returned unchanged.
 *
 * @throws IOException if reading from the in-memory stream fails
 * @throws MalformedRecordException if the reader cannot parse the CSV record
 */
@Test
public void testISO8859() throws IOException, MalformedRecordException {
    final String text = "name\nÄËÖÜ";
    final byte[] bytesUTF = text.getBytes(StandardCharsets.UTF_8);
    final byte[] bytes8859 = text.getBytes(StandardCharsets.ISO_8859_1);
    // Sanity check: the two encodings must produce different byte sequences
    // (2 bytes vs. 1 byte per accented character), otherwise the test could
    // not tell which charset was used to decode the input.
    assertEquals(13, bytesUTF.length, "expected size=13 for UTF-8 representation of test data");
    assertEquals(9, bytes8859.length, "expected size=9 for ISO-8859-1 representation of test data");

    final List<RecordField> fields = new ArrayList<>();
    fields.add(new RecordField("name", RecordFieldType.STRING.getDataType()));
    final RecordSchema schema = new SimpleRecordSchema(fields);

    // Reuse the exact bytes whose length was asserted above rather than encoding the text a second time.
    try (final InputStream bais = new ByteArrayInputStream(bytes8859);
         final JacksonCSVRecordReader reader = new JacksonCSVRecordReader(bais, Mockito.mock(ComponentLog.class), schema, format, true, false,
                 RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(),
                 StandardCharsets.ISO_8859_1.name())) {

        final Record record = reader.nextRecord();
        final String name = (String) record.getValue("name");
        assertEquals("ÄËÖÜ", name);
    }
}
@Test
public void testDate() throws IOException, MalformedRecordException {
final String dateValue = "1983-11-30";