NIFI-8761 Enable not setting a value for Escape Character in CSVReader controller service (#5249)

NIFI-8761 Enable not setting a value for Escape Character in CSVReader controller service

Co-authored-by: Pierre Villard <pierre.villard.fr@gmail.com>
This commit is contained in:
timeabarna 2021-08-24 14:38:16 +02:00 committed by GitHub
parent a652280fbb
commit 9ebdd4bdf1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 286 additions and 71 deletions

View File

@ -67,7 +67,7 @@ public class CSVUtils {
.name("Quote Character")
.description("The character that is used to quote values so that escape characters do not have to be used. If the property has been specified via Expression Language " +
"but the expression gets evaluated to an invalid Quote Character at runtime, then it will be skipped and the default Quote Character will be used.")
.addValidator(new CSVValidators.SingleCharacterValidator())
.addValidator(CSVValidators.SINGLE_CHAR_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.dependsOn(CSV_FORMAT, CUSTOM)
.defaultValue("\"")
@ -101,7 +101,7 @@ public class CSVUtils {
public static final PropertyDescriptor COMMENT_MARKER = new PropertyDescriptor.Builder()
.name("Comment Marker")
.description("The character that is used to denote the start of a comment. Any line that begins with this comment will be ignored.")
.addValidator(new CSVValidators.SingleCharacterValidator())
.addValidator(CSVValidators.SINGLE_CHAR_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.dependsOn(CSV_FORMAT, CUSTOM)
.required(false)
@ -109,8 +109,9 @@ public class CSVUtils {
public static final PropertyDescriptor ESCAPE_CHAR = new PropertyDescriptor.Builder()
.name("Escape Character")
.description("The character that is used to escape characters that would otherwise have a specific meaning to the CSV Parser. If the property has been specified via Expression Language " +
"but the expression gets evaluated to an invalid Escape Character at runtime, then it will be skipped and the default Escape Character will be used.")
.addValidator(new CSVValidators.SingleCharacterValidator())
"but the expression gets evaluated to an invalid Escape Character at runtime, then it will be skipped and the default Escape Character will be used. " +
"Setting it to an empty string means no escape character should be used.")
.addValidator(CSVValidators.EMPTY_OR_SINGLE_CHAR_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.dependsOn(CSV_FORMAT, CUSTOM)
.defaultValue("\\")
@ -284,7 +285,7 @@ public class CSVUtils {
final Character quoteChar = getCharUnescaped(context, QUOTE_CHAR, variables);
format = format.withQuote(quoteChar);
final Character escapeChar = getCharUnescaped(context, ESCAPE_CHAR, variables);
final Character escapeChar = context.getProperty(CSVUtils.ESCAPE_CHAR).evaluateAttributeExpressions(variables).getValue().isEmpty() ? null : getCharUnescaped(context, ESCAPE_CHAR, variables);
format = format.withEscape(escapeChar);
format = format.withTrim(context.getProperty(TRIM_FIELDS).asBoolean());

View File

@ -25,57 +25,17 @@ import java.util.HashSet;
import java.util.Set;
public class CSVValidators {
private static final Set<String> illegalChars = new HashSet<>();
public static class SingleCharacterValidator implements Validator {
private static final Set<String> illegalChars = new HashSet<>();
static {
illegalChars.add("\r");
illegalChars.add("\n");
}
@Override
public ValidationResult validate(final String subject, final String input, final ValidationContext context) {
if (input == null) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation("Input is null for this property")
.build();
}
if (!context.isExpressionLanguageSupported(subject) || !context.isExpressionLanguagePresent(input)) {
final String unescaped = CSVUtils.unescape(input);
if (unescaped.length() != 1) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation("Value must be exactly 1 character but was " + input.length() + " in length")
.build();
}
if (illegalChars.contains(unescaped)) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation(input + " is not a valid character for this property")
.build();
}
}
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(true)
.build();
}
static {
illegalChars.add("\r");
illegalChars.add("\n");
}
/** Single-character validator built with {@code canBeEmpty = false}: an empty value is reported as invalid. */
public static final Validator SINGLE_CHAR_VALIDATOR = createSingleCharValidator(false);
/** Single-character validator built with {@code canBeEmpty = true}: an empty value is reported as valid. */
public static final Validator EMPTY_OR_SINGLE_CHAR_VALIDATOR = createSingleCharValidator(true);
public static final Validator UNESCAPED_SINGLE_CHAR_VALIDATOR = new Validator() {
@Override
public ValidationResult validate(final String subject, final String input, final ValidationContext context) {
@ -101,4 +61,57 @@ public class CSVValidators {
}
};
/**
 * Builds a validator for properties whose value must be a single character.
 *
 * <p>Validation rules, in order:
 * <ol>
 *   <li>{@code null} input is invalid.</li>
 *   <li>Empty input is valid only when {@code canBeEmpty} is {@code true}.</li>
 *   <li>If the value is a literal (Expression Language is not supported for the property, or is not
 *       present in the input), the value must unescape to exactly one character and that character
 *       must not be one of {@code illegalChars}.</li>
 *   <li>Anything else is valid.</li>
 * </ol>
 *
 * @param canBeEmpty whether an empty string is an acceptable value
 * @return a validator applying the rules above
 */
private static Validator createSingleCharValidator(final boolean canBeEmpty) {
    return new Validator() {
        @Override
        public ValidationResult validate(final String subject, final String input, final ValidationContext context) {
            if (input == null) {
                return invalid(subject, input, "Input is null for this property");
            }

            if (input.isEmpty()) {
                // An accepted empty value must not carry the "must be exactly 1 character" failure
                // text; only the rejecting variant reports it.
                return canBeEmpty
                        ? valid(subject, input)
                        : invalid(subject, input, "Value must be exactly 1 character but was 0 in length");
            }

            // The character checks only apply to literal values; when Expression Language is in play
            // the final value is not known at validation time.
            if (!context.isExpressionLanguageSupported(subject) || !context.isExpressionLanguagePresent(input)) {
                final String unescaped = CSVUtils.unescape(input);

                if (unescaped.length() != 1) {
                    // The message reports the length of the raw input as typed, not of the unescaped value.
                    return invalid(subject, input, "Value must be exactly 1 character but was " + input.length() + " in length");
                }

                if (illegalChars.contains(unescaped)) {
                    return invalid(subject, input, input + " is not a valid character for this property");
                }
            }

            return valid(subject, input);
        }

        /** Creates a passing result for the given subject and input. */
        private ValidationResult valid(final String subject, final String input) {
            return new ValidationResult.Builder()
                    .input(input)
                    .subject(subject)
                    .valid(true)
                    .build();
        }

        /** Creates a failing result carrying the given explanation. */
        private ValidationResult invalid(final String subject, final String input, final String explanation) {
            return new ValidationResult.Builder()
                    .input(input)
                    .subject(subject)
                    .valid(false)
                    .explanation(explanation)
                    .build();
        }
    };
}
}

View File

@ -119,7 +119,7 @@ public class CSVUtilsTest {
assertEquals(',', csvFormat.getDelimiter());
assertEquals('"', (char) csvFormat.getQuoteCharacter());
assertEquals('\\', (char) csvFormat.getEscapeCharacter());
assertNull(csvFormat.getEscapeCharacter());
assertNull(csvFormat.getCommentMarker());
}

View File

@ -146,7 +146,8 @@
<exclude>src/test/resources/csv/extra-white-space.csv</exclude>
<exclude>src/test/resources/csv/multi-bank-account.csv</exclude>
<exclude>src/test/resources/csv/single-bank-account.csv</exclude>
<exclude>src/test/resources/csv/multi-bank-account_escapedchar.csv</exclude>
<exclude>src/test/resources/csv/multi-bank-account_escapechar.csv</exclude>
<exclude>src/test/resources/csv/multi-bank-account_spec_delimiter.csv</exclude>
<exclude>src/test/resources/csv/prov-events.csv</exclude>
<exclude>src/test/resources/grok/error-with-stack-trace.log</exclude>
<exclude>src/test/resources/grok/nifi-log-sample-multiline-with-stacktrace.log</exclude>

View File

@ -620,7 +620,7 @@ public class TestCSVRecordReader {
}
@Test
public void testMultipleRecordsEscapedWithSpecialChar() throws IOException, MalformedRecordException {
public void testMultipleRecordsDelimitedWithSpecialChar() throws IOException, MalformedRecordException {
char delimiter = StringEscapeUtils.unescapeJava("\u0001").charAt(0);
@ -630,7 +630,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account_escapedchar.csv");
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account_spec_delimiter.csv");
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
@ -645,6 +645,46 @@ public class TestCSVRecordReader {
}
}
/**
 * Reads the escape-character fixture with a backslash configured as escape character and expects
 * the first read to fail with a {@link MalformedRecordException}.
 */
@Test
public void testMultipleRecordsEscapedWithChar() throws IOException {
    final CSVFormat backslashEscapeFormat = CSVFormat.DEFAULT
            .withFirstRecordAsHeader()
            .withTrim()
            .withQuote('"')
            .withDelimiter(',')
            .withEscape('\\');

    // Same schema as the default one, except that "balance" is typed as a double.
    final List<RecordField> schemaFields = getDefaultFields();
    schemaFields.replaceAll(field -> {
        if (field.getFieldName().equals("balance")) {
            return new RecordField("balance", doubleDataType);
        }
        return field;
    });
    final RecordSchema schema = new SimpleRecordSchema(schemaFields);

    try (final InputStream in = new FileInputStream("src/test/resources/csv/multi-bank-account_escapechar.csv");
         final CSVRecordReader reader = createReader(in, schema, backslashEscapeFormat)) {

        assertThrows(MalformedRecordException.class, reader::nextRecord);
    }
}
/**
 * Reads the escape-character fixture with no escape character configured ({@code withEscape(null)})
 * and expects both records to be returned, the trailing backslash being kept in the name value.
 */
@Test
public void testMultipleRecordsEscapedWithNull() throws IOException, MalformedRecordException {
    final CSVFormat noEscapeFormat = CSVFormat.DEFAULT
            .withFirstRecordAsHeader()
            .withTrim()
            .withQuote('"')
            .withDelimiter(',')
            .withEscape(null);

    // Same schema as the default one, except that "balance" is typed as a double.
    final List<RecordField> schemaFields = getDefaultFields();
    schemaFields.replaceAll(field -> {
        if (field.getFieldName().equals("balance")) {
            return new RecordField("balance", doubleDataType);
        }
        return field;
    });
    final RecordSchema schema = new SimpleRecordSchema(schemaFields);

    try (final InputStream in = new FileInputStream("src/test/resources/csv/multi-bank-account_escapechar.csv");
         final CSVRecordReader reader = createReader(in, schema, noEscapeFormat)) {

        Assert.assertArrayEquals(
                new Object[] {"1", "John Doe\\", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"},
                reader.nextRecord().getValues());

        Assert.assertArrayEquals(
                new Object[] {"2", "Jane Doe", 4820.09D, "321 Your Street", "Your City", "NY", "33333", "USA"},
                reader.nextRecord().getValues());

        // The fixture holds exactly two data rows.
        assertNull(reader.nextRecord());
    }
}
@Test
public void testQuote() throws IOException, MalformedRecordException {
final CSVFormat format = CSVFormat.RFC4180.withFirstRecordAsHeader().withTrim().withQuote('"');

View File

@ -33,16 +33,25 @@ public class TestCSVValidators {
/*** SingleCharValidator **/
@Test
public void testSingleCharNullValue() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
Validator validator = CSVValidators.SINGLE_CHAR_VALIDATOR;
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", null, mockContext);
assertEquals("Input is null for this property", result.getExplanation());
assertFalse(result.isValid());
}
/** The strict single-character validator rejects an empty value and explains why. */
@Test
public void testSingleCharEmptyValue() {
    final ValidationContext context = Mockito.mock(ValidationContext.class);

    final ValidationResult outcome = CSVValidators.SINGLE_CHAR_VALIDATOR.validate("EscapeChar", "", context);

    assertEquals("Value must be exactly 1 character but was 0 in length", outcome.getExplanation());
    assertFalse(outcome.isValid());
}
@Test
public void testSingleCharTab() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
Validator validator = CSVValidators.SINGLE_CHAR_VALIDATOR;
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", "\\t", mockContext);
assertTrue(result.isValid());
@ -50,24 +59,16 @@ public class TestCSVValidators {
@Test
public void testSingleCharIllegalChar() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
Validator validator = CSVValidators.SINGLE_CHAR_VALIDATOR;
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", "\\r", mockContext);
assertEquals("\\r is not a valid character for this property", result.getExplanation());
assertFalse(result.isValid());
}
@Test
public void testSingleCharGoodChar() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", "'", mockContext);
assertTrue(result.isValid());
}
@Test
public void testSingleCharExpressionLanguage() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
Validator validator = CSVValidators.SINGLE_CHAR_VALIDATOR;
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
Mockito.when(mockContext.isExpressionLanguageSupported(Mockito.any())).thenReturn(true);
Mockito.when(mockContext.isExpressionLanguagePresent(Mockito.any())).thenReturn(true);
@ -75,6 +76,66 @@ public class TestCSVValidators {
assertTrue(result.isValid());
}
/** The strict single-character validator accepts a plain single character. */
@Test
public void testSingleCharGoodChar() {
    final ValidationContext context = Mockito.mock(ValidationContext.class);

    final ValidationResult outcome = CSVValidators.SINGLE_CHAR_VALIDATOR.validate("EscapeChar", "'", context);

    assertTrue(outcome.isValid());
}
/*** Empty Or SingleCharValidator **/
/** The empty-or-single-character validator still rejects a {@code null} value. */
@Test
public void testEmptySingleCharNullValue() {
    final ValidationContext context = Mockito.mock(ValidationContext.class);

    final ValidationResult outcome = CSVValidators.EMPTY_OR_SINGLE_CHAR_VALIDATOR.validate("EscapeChar", null, context);

    assertEquals("Input is null for this property", outcome.getExplanation());
    assertFalse(outcome.isValid());
}
/** The empty-or-single-character validator accepts an escaped tab ({@code \t}). */
@Test
public void testEmptySingleCharTab() {
    final ValidationContext context = Mockito.mock(ValidationContext.class);

    final ValidationResult outcome = CSVValidators.EMPTY_OR_SINGLE_CHAR_VALIDATOR.validate("EscapeChar", "\\t", context);

    assertTrue(outcome.isValid());
}
/** The empty-or-single-character validator rejects an escaped carriage return ({@code \r}). */
@Test
public void testEmptySingleCharIllegalChar() {
    final ValidationContext context = Mockito.mock(ValidationContext.class);

    final ValidationResult outcome = CSVValidators.EMPTY_OR_SINGLE_CHAR_VALIDATOR.validate("EscapeChar", "\\r", context);

    assertEquals("\\r is not a valid character for this property", outcome.getExplanation());
    assertFalse(outcome.isValid());
}
/**
 * The empty-or-single-character validator accepts an Expression Language value when the context
 * reports Expression Language as both supported and present.
 */
@Test
public void testEmptySingleCharExpressionLanguage() {
    final ValidationContext elContext = Mockito.mock(ValidationContext.class);
    Mockito.when(elContext.isExpressionLanguageSupported(Mockito.any())).thenReturn(true);
    Mockito.when(elContext.isExpressionLanguagePresent(Mockito.any())).thenReturn(true);

    final ValidationResult outcome = CSVValidators.EMPTY_OR_SINGLE_CHAR_VALIDATOR.validate("EscapeChar", "${csv.escape}", elContext);

    assertTrue(outcome.isValid());
}
/** The empty-or-single-character validator accepts a plain single character. */
@Test
public void testEmptySingleCharGoodChar() {
    final ValidationContext context = Mockito.mock(ValidationContext.class);

    final ValidationResult outcome = CSVValidators.EMPTY_OR_SINGLE_CHAR_VALIDATOR.validate("EscapeChar", "'", context);

    assertTrue(outcome.isValid());
}
/** The empty-or-single-character validator accepts an empty value. */
@Test
public void testEmptySingleCharEmptyChar() {
    final ValidationContext context = Mockito.mock(ValidationContext.class);

    final ValidationResult outcome = CSVValidators.EMPTY_OR_SINGLE_CHAR_VALIDATOR.validate("EscapeChar", "", context);

    assertTrue(outcome.isValid());
}
/*** Unescaped SingleCharValidator **/

View File

@ -402,7 +402,7 @@ public class TestJacksonCSVRecordReader {
}
@Test
public void testMultipleRecordsEscapedWithSpecialChar() throws IOException, MalformedRecordException {
public void testMultipleRecordsDelimitedWithSpecialChar() throws IOException, MalformedRecordException {
char delimiter = StringEscapeUtils.unescapeJava("\u0001").charAt(0);
@ -412,7 +412,7 @@ public class TestJacksonCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account_escapedchar.csv");
try (final InputStream fis = new FileInputStream("src/test/resources/csv/multi-bank-account_spec_delimiter.csv");
final JacksonCSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
@ -427,6 +427,46 @@ public class TestJacksonCSVRecordReader {
}
}
/**
 * Reads the escape-character fixture with a backslash configured as escape character and expects
 * the first read to fail with a {@link NumberFormatException}.
 */
@Test
public void testMultipleRecordsEscapedWithChar() throws IOException {
    final CSVFormat backslashEscapeFormat = CSVFormat.DEFAULT
            .withFirstRecordAsHeader()
            .withTrim()
            .withQuote('"')
            .withDelimiter(',')
            .withEscape('\\');

    // Same schema as the default one, except that "balance" is typed as a double.
    final List<RecordField> schemaFields = getDefaultFields();
    schemaFields.replaceAll(field -> {
        if (field.getFieldName().equals("balance")) {
            return new RecordField("balance", doubleDataType);
        }
        return field;
    });
    final RecordSchema schema = new SimpleRecordSchema(schemaFields);

    try (final InputStream in = new FileInputStream("src/test/resources/csv/multi-bank-account_escapechar.csv");
         final JacksonCSVRecordReader reader = createReader(in, schema, backslashEscapeFormat)) {

        assertThrows(NumberFormatException.class, reader::nextRecord);
    }
}
/**
 * Reads the escape-character fixture with no escape character configured ({@code withEscape(null)})
 * and expects both records to be returned, the trailing backslash being kept in the name value.
 */
@Test
public void testMultipleRecordsEscapedWithNull() throws IOException, MalformedRecordException {
    final CSVFormat noEscapeFormat = CSVFormat.DEFAULT
            .withFirstRecordAsHeader()
            .withTrim()
            .withQuote('"')
            .withDelimiter(',')
            .withEscape(null);

    // Same schema as the default one, except that "balance" is typed as a double.
    final List<RecordField> schemaFields = getDefaultFields();
    schemaFields.replaceAll(field -> {
        if (field.getFieldName().equals("balance")) {
            return new RecordField("balance", doubleDataType);
        }
        return field;
    });
    final RecordSchema schema = new SimpleRecordSchema(schemaFields);

    try (final InputStream in = new FileInputStream("src/test/resources/csv/multi-bank-account_escapechar.csv");
         final JacksonCSVRecordReader reader = createReader(in, schema, noEscapeFormat)) {

        Assert.assertArrayEquals(
                new Object[] {"1", "John Doe\\", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"},
                reader.nextRecord().getValues());

        Assert.assertArrayEquals(
                new Object[] {"2", "Jane Doe", 4820.09D, "321 Your Street", "Your City", "NY", "33333", "USA"},
                reader.nextRecord().getValues());

        // The fixture holds exactly two data rows.
        assertNull(reader.nextRecord());
    }
}
@Test
public void testNullRecordSeparator() throws IOException, MalformedRecordException {
final List<RecordField> fields = getDefaultFields();

View File

@ -330,6 +330,62 @@ public class TestWriteCSVResult {
assertEquals("id,name,dob\n1,,1/1/1970\n", output);
}
/**
 * Writes a record whose {@code id} value ends with a backslash using a format that has a backslash
 * escape character, and checks the exact CSV text produced (the value is quoted and the backslash
 * doubled).
 */
@Test
public void testEscapeCharInValueWriteRecord() throws IOException {
    final CSVFormat csvFormat = CSVFormat.DEFAULT.withEscape('\\').withQuote('"').withRecordSeparator("\n");

    final List<RecordField> fields = new ArrayList<>();
    fields.add(new RecordField("id", RecordFieldType.STRING.getDataType()));
    fields.add(new RecordField("name", RecordFieldType.STRING.getDataType()));
    final RecordSchema schema = new SimpleRecordSchema(fields);

    final Map<String, Object> values = new LinkedHashMap<>();
    values.put("id", "1\\");
    values.put("name", "John Doe");
    final Record record = new MapRecord(schema, values);

    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final String output;
    try (final WriteCSVResult writer = new WriteCSVResult(csvFormat, schema, new SchemaNameAsAttribute(), baos,
            RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(), true, "ASCII")) {

        writer.beginRecordSet();
        writer.write(record);
        writer.finishRecordSet();
        writer.flush();

        // Decode with the charset the writer was configured with instead of the platform default.
        output = baos.toString("ASCII");
    }

    assertEquals("id,name\n\"1\\\\\",John Doe\n", output);
}
/**
 * Writes a record whose {@code id} value ends with a backslash using a format without an escape
 * character ({@code withEscape(null)}), and checks that the backslash is emitted unchanged.
 */
@Test
public void testEmptyEscapeCharWriteRecord() throws IOException {
    final CSVFormat csvFormat = CSVFormat.DEFAULT.withEscape(null).withQuote('"').withRecordSeparator("\n");

    final List<RecordField> fields = new ArrayList<>();
    fields.add(new RecordField("id", RecordFieldType.STRING.getDataType()));
    fields.add(new RecordField("name", RecordFieldType.STRING.getDataType()));
    final RecordSchema schema = new SimpleRecordSchema(fields);

    final Map<String, Object> values = new LinkedHashMap<>();
    values.put("id", "1\\");
    values.put("name", "John Doe");
    final Record record = new MapRecord(schema, values);

    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final String output;
    try (final WriteCSVResult writer = new WriteCSVResult(csvFormat, schema, new SchemaNameAsAttribute(), baos,
            RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat(), true, "ASCII")) {

        writer.beginRecordSet();
        writer.write(record);
        writer.finishRecordSet();
        writer.flush();

        // Decode with the charset the writer was configured with instead of the platform default.
        output = baos.toString("ASCII");
    }

    assertEquals("id,name\n1\\,John Doe\n", output);
}
private DateFormat getDateFormat(final String format) {
final DateFormat df = new SimpleDateFormat(format);

View File

@ -0,0 +1,3 @@
id, name, balance, address, city, state, zipCode, country
1, John Doe\, "4750.89", "123 My Street", My City, MS, 11111, USA
2, Jane Doe, 4820.09, 321 Your Street, Your City, NY, 33333, USA
1 id name balance address city state zipCode country
2 1 John Doe\ 4750.89 123 My Street My City MS 11111 USA
3 2 Jane Doe 4820.09 321 Your Street Your City NY 33333 USA