NIFI-5983: handling parse problems in recordReader implementations

Fixed Checkstyle violation

Signed-off-by: Matthew Burgess <mattyb149@apache.org>

This closes #3282
This commit is contained in:
Endre Zoltan Kovacs 2019-01-31 14:47:21 +01:00 committed by Matthew Burgess
parent 35147a620f
commit 24a7d480c8
4 changed files with 93 additions and 40 deletions

View File

@ -119,6 +119,11 @@
<artifactId>caffeine</artifactId>
<version>2.6.2</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>27.0.1-jre</version>
</dependency>
</dependencies>
<build>
<plugins>

View File

@ -24,6 +24,8 @@ import org.apache.nifi.serialization.record.MapRecord;
import org.apache.nifi.serialization.record.Record;
import org.apache.nifi.serialization.record.RecordSchema;
import com.google.common.base.Throwables;
import java.io.IOException;
import java.util.Map;
@ -33,14 +35,21 @@ public abstract class AvroRecordReader implements RecordReader {
@Override
public Record nextRecord(final boolean coerceTypes, final boolean dropUnknownFields) throws IOException, MalformedRecordException {
GenericRecord record = nextAvroRecord();
if (record == null) {
return null;
try {
GenericRecord record = nextAvroRecord();
if (record == null) {
return null;
}
final RecordSchema schema = getSchema();
final Map<String, Object> values = AvroTypeUtil.convertAvroRecordToMap(record, schema);
return new MapRecord(schema, values);
} catch (IOException e) {
throw e;
} catch (MalformedRecordException e) {
throw e;
} catch (Exception e) {
throw new MalformedRecordException("Error while getting next record. Root cause: " + Throwables.getRootCause(e), e);
}
final RecordSchema schema = getSchema();
final Map<String, Object> values = AvroTypeUtil.convertAvroRecordToMap(record, schema);
return new MapRecord(schema, values);
}
}

View File

@ -42,6 +42,8 @@ import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.serialization.record.RecordSchema;
import com.google.common.base.Throwables;
public class CSVRecordReader extends AbstractCSVRecordReader {
private final CSVParser csvParser;
@ -72,45 +74,49 @@ public class CSVRecordReader extends AbstractCSVRecordReader {
@Override
public Record nextRecord(final boolean coerceTypes, final boolean dropUnknownFields) throws IOException, MalformedRecordException {
final RecordSchema schema = getSchema();
final List<RecordField> recordFields = getRecordFields();
final int numFieldNames = recordFields.size();
try {
final RecordSchema schema = getSchema();
for (final CSVRecord csvRecord : csvParser) {
final Map<String, Object> values = new LinkedHashMap<>(recordFields.size() * 2);
for (int i = 0; i < csvRecord.size(); i++) {
final String rawValue = csvRecord.get(i);
final List<RecordField> recordFields = getRecordFields();
final int numFieldNames = recordFields.size();
for (final CSVRecord csvRecord : csvParser) {
final Map<String, Object> values = new LinkedHashMap<>(recordFields.size() * 2);
for (int i = 0; i < csvRecord.size(); i++) {
final String rawValue = csvRecord.get(i);
final String rawFieldName;
final DataType dataType;
if (i >= numFieldNames) {
if (!dropUnknownFields) {
values.put("unknown_field_index_" + i, rawValue);
final String rawFieldName;
final DataType dataType;
if (i >= numFieldNames) {
if (!dropUnknownFields) {
values.put("unknown_field_index_" + i, rawValue);
}
continue;
} else {
final RecordField recordField = recordFields.get(i);
rawFieldName = recordField.getFieldName();
dataType = recordField.getDataType();
}
continue;
} else {
final RecordField recordField = recordFields.get(i);
rawFieldName = recordField.getFieldName();
dataType = recordField.getDataType();
final Object value;
if (coerceTypes) {
value = convert(rawValue, dataType, rawFieldName);
} else {
// The CSV Reader is going to return all fields as Strings, because CSV doesn't have any way to
// dictate a field type. As a result, we will use the schema that we have to attempt to convert
// the value into the desired type if it's a simple type.
value = convertSimpleIfPossible(rawValue, dataType, rawFieldName);
}
values.put(rawFieldName, value);
}
final Object value;
if (coerceTypes) {
value = convert(rawValue, dataType, rawFieldName);
} else {
// The CSV Reader is going to return all fields as Strings, because CSV doesn't have any way to
// dictate a field type. As a result, we will use the schema that we have to attempt to convert
// the value into the desired type if it's a simple type.
value = convertSimpleIfPossible(rawValue, dataType, rawFieldName);
}
values.put(rawFieldName, value);
return new MapRecord(schema, values, coerceTypes, dropUnknownFields);
}
return new MapRecord(schema, values, coerceTypes, dropUnknownFields);
} catch (Exception e) {
throw new MalformedRecordException("Error while getting next record. Root cause: " + Throwables.getRootCause(e), e);
}
return null;

View File

@ -17,6 +17,7 @@
package org.apache.nifi.csv;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.QuoteMode;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.serialization.MalformedRecordException;
import org.apache.nifi.serialization.SimpleRecordSchema;
@ -27,22 +28,30 @@ import org.apache.nifi.serialization.record.RecordSchema;
import org.junit.Test;
import org.mockito.Mockito;
import com.google.common.base.Throwables;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import static org.hamcrest.CoreMatchers.instanceOf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
public class ITApacheCSVRecordReader {
private final CSVFormat format = CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().withQuote('"');
// Convenience schema used by the bulk-read tests: every column is typed as a plain String.
private List<RecordField> getDefaultFields() {
    return createStringFields(new String[]{"id", "name", "balance", "address", "city", "state", "zipCode", "country"});
}
private List<RecordField> createStringFields(String[] fieldNames) {
final List<RecordField> fields = new ArrayList<>();
for (final String fieldName : new String[]{"id", "name", "balance", "address", "city", "state", "zipCode", "country"}) {
for (final String fieldName : fieldNames) {
fields.add(new RecordField(fieldName, RecordFieldType.STRING.getDataType()));
}
return fields;
@ -71,4 +80,28 @@ public class ITApacheCSVRecordReader {
assertEquals(NUM_LINES, numRecords);
}
}
/**
 * Verifies that a mid-stream CSV parse failure surfaces as a MalformedRecordException
 * whose root cause is the underlying IOException raised by Commons CSV.
 */
@Test
public void testExceptionThrownOnParseProblem() throws IOException, MalformedRecordException {
    final CSVFormat csvFormat = CSVFormat.DEFAULT.withFirstRecordAsHeader().withQuoteMode(QuoteMode.ALL).withTrim().withDelimiter(',');
    final int NUM_LINES = 25;
    // Header line must be newline-terminated, otherwise the first data row is glued onto it.
    final StringBuilder sb = new StringBuilder("\"id\",\"name\",\"balance\"\n");
    for (int i = 0; i < NUM_LINES; i++) {
        sb.append(String.format("\"%s\",\"John Doe\",\"4750.89D\"\n", i));
    }
    // cause a parse problem: stray characters between the closing quote and the delimiter
    sb.append(String.format("\"%s\"dieParser,\"John Doe\",\"4750.89D\"\n", NUM_LINES));
    sb.append(String.format("\"%s\",\"John Doe\",\"4750.89D\"\n", NUM_LINES + 1));
    final RecordSchema schema = new SimpleRecordSchema(createStringFields(new String[] {"id", "name", "balance"}));
    try (final InputStream bais = new ByteArrayInputStream(sb.toString().getBytes());
        final CSVRecordReader reader = new CSVRecordReader(bais, Mockito.mock(ComponentLog.class), schema, csvFormat, true, false,
            RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(), "UTF-8")) {
        while (reader.nextRecord() != null) {}
        // Guard against a vacuous pass: reaching this point means no exception was thrown.
        // AssertionError is an Error, so the catch below cannot swallow it.
        throw new AssertionError("Expected a MalformedRecordException for the malformed row");
    } catch (final Exception e) {
        assertThat(e, instanceOf(MalformedRecordException.class));
        assertThat(Throwables.getRootCause(e), instanceOf(IOException.class));
    }
}
}