NIFI-4416 CSVRecordReader does not accept escaped character as delimiter

Signed-off-by: Matthew Burgess <mattyb149@apache.org>

This closes #2172
This commit is contained in:
Arun Manivannan 2017-09-25 22:24:38 +08:00 committed by Matthew Burgess
parent 1ca4fa3a83
commit 3b1b326b13
7 changed files with 257 additions and 76 deletions

View File

@ -99,6 +99,7 @@
<exclude>src/test/resources/csv/extra-white-space.csv</exclude>
<exclude>src/test/resources/csv/multi-bank-account.csv</exclude>
<exclude>src/test/resources/csv/single-bank-account.csv</exclude>
<exclude>src/test/resources/csv/multi-bank-account_escapedchar.csv</exclude>
<exclude>src/test/resources/grok/error-with-stack-trace.log</exclude>
<exclude>src/test/resources/grok/nifi-log-sample-multiline-with-stacktrace.log</exclude>
<exclude>src/test/resources/grok/nifi-log-sample.log</exclude>

View File

@ -19,6 +19,7 @@ package org.apache.nifi.csv;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.PropertyValue;
@ -48,7 +49,7 @@ public class CSVUtils {
static final PropertyDescriptor VALUE_SEPARATOR = new PropertyDescriptor.Builder()
.name("Value Separator")
.description("The character that is used to separate values/fields in a CSV Record")
.addValidator(new SingleCharacterValidator())
.addValidator(CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR)
.expressionLanguageSupported(false)
.defaultValue(",")
.required(true)
@ -56,7 +57,7 @@ public class CSVUtils {
static final PropertyDescriptor QUOTE_CHAR = new PropertyDescriptor.Builder()
.name("Quote Character")
.description("The character that is used to quote values so that escape characters do not have to be used")
.addValidator(new SingleCharacterValidator())
.addValidator(new CSVValidators.SingleCharacterValidator())
.expressionLanguageSupported(false)
.defaultValue("\"")
.required(true)
@ -89,14 +90,14 @@ public class CSVUtils {
static final PropertyDescriptor COMMENT_MARKER = new PropertyDescriptor.Builder()
.name("Comment Marker")
.description("The character that is used to denote the start of a comment. Any line that begins with this comment will be ignored.")
.addValidator(new SingleCharacterValidator())
.addValidator(new CSVValidators.SingleCharacterValidator())
.expressionLanguageSupported(false)
.required(false)
.build();
static final PropertyDescriptor ESCAPE_CHAR = new PropertyDescriptor.Builder()
.name("Escape Character")
.description("The character that is used to escape characters that would otherwise have a specific meaning to the CSV Parser.")
.addValidator(new SingleCharacterValidator())
.addValidator(new CSVValidators.SingleCharacterValidator())
.expressionLanguageSupported(false)
.defaultValue("\\")
.required(true)
@ -179,12 +180,16 @@ public class CSVUtils {
}
}
private static char getUnescapedChar(final ConfigurationContext context, final PropertyDescriptor property) {
return StringEscapeUtils.unescapeJava(context.getProperty(property).getValue()).charAt(0);
}
private static char getChar(final ConfigurationContext context, final PropertyDescriptor property) {
return CSVUtils.unescape(context.getProperty(property).getValue()).charAt(0);
}
private static CSVFormat buildCustomFormat(final ConfigurationContext context) {
final char valueSeparator = getChar(context, VALUE_SEPARATOR);
final char valueSeparator = getUnescapedChar(context, VALUE_SEPARATOR);
CSVFormat format = CSVFormat.newFormat(valueSeparator)
.withAllowMissingColumnNames()
.withIgnoreEmptyLines();

View File

@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.csv;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import java.util.HashSet;
import java.util.Set;
public class CSVValidators {
public static class SingleCharacterValidator implements Validator {
private static final Set<String> illegalChars = new HashSet<>();
static {
illegalChars.add("\r");
illegalChars.add("\n");
}
@Override
public ValidationResult validate(final String subject, final String input, final ValidationContext context) {
if (input == null) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation("Input is null for this property")
.build();
}
final String unescaped = CSVUtils.unescape(input);
if (unescaped.length() != 1) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation("Value must be exactly 1 character but was " + input.length() + " in length")
.build();
}
if (illegalChars.contains(unescaped)) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation(input + " is not a valid character for this property")
.build();
}
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(true)
.build();
}
}
public static final Validator UNESCAPED_SINGLE_CHAR_VALIDATOR = new Validator() {
@Override
public ValidationResult validate(final String subject, final String input, final ValidationContext context) {
if (input == null) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation("Input is null for this property")
.build();
}
String unescapeString = unescapeString(input);
return new ValidationResult.Builder()
.subject(subject)
.input(unescapeString)
.explanation("Only non-null single characters are supported")
.valid((unescapeString.length() == 1 && unescapeString.charAt(0) != 0) || context.isExpressionLanguagePresent(input))
.build();
}
private String unescapeString(String input) {
if (input != null && input.length() > 1) {
input = StringEscapeUtils.unescapeJava(input);
}
return input;
}
};
}

View File

@ -1,62 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.csv;
import java.util.HashSet;
import java.util.Set;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
public class SingleCharacterValidator implements Validator {
private static final Set<String> illegalChars = new HashSet<>();
static {
illegalChars.add("\r");
illegalChars.add("\n");
}
@Override
public ValidationResult validate(final String subject, final String input, final ValidationContext context) {
final String unescaped = CSVUtils.unescape(input);
if (unescaped.length() != 1) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation("Value must be exactly 1 character but was " + input.length() + " in length")
.build();
}
if (illegalChars.contains(input)) {
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(false)
.explanation(input + " is not a valid character for this property")
.build();
}
return new ValidationResult.Builder()
.input(input)
.subject(subject)
.valid(true)
.build();
}
}

View File

@ -33,6 +33,7 @@ import java.util.List;
import java.util.TimeZone;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.serialization.MalformedRecordException;
import org.apache.nifi.serialization.SimpleRecordSchema;
@ -57,7 +58,7 @@ public class TestCSVRecordReader {
return fields;
}
private CSVRecordReader createReader(final InputStream in, final RecordSchema schema) throws IOException {
private CSVRecordReader createReader(final InputStream in, final RecordSchema schema, CSVFormat format) throws IOException {
return new CSVRecordReader(in, Mockito.mock(ComponentLog.class), schema, format, true, false,
RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat());
}
@ -93,7 +94,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/single-bank-account.csv"));
final CSVRecordReader reader = createReader(fis, schema)) {
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] record = reader.nextRecord().getValues();
final Object[] expectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"};
@ -111,7 +112,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account.csv"));
final CSVRecordReader reader = createReader(fis, schema)) {
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
final Object[] firstExpectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"};
@ -133,7 +134,7 @@ public class TestCSVRecordReader {
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/extra-white-space.csv"));
final CSVRecordReader reader = createReader(fis, schema)) {
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
final Object[] firstExpectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"};
@ -160,7 +161,7 @@ public class TestCSVRecordReader {
final byte[] inputData = csvData.getBytes();
try (final InputStream bais = new ByteArrayInputStream(inputData);
final CSVRecordReader reader = createReader(bais, schema)) {
final CSVRecordReader reader = createReader(bais, schema, format)) {
final Record record = reader.nextRecord();
assertNotNull(record);
@ -190,7 +191,7 @@ public class TestCSVRecordReader {
// test nextRecord does not contain a 'continent' field
try (final InputStream bais = new ByteArrayInputStream(inputData);
final CSVRecordReader reader = createReader(bais, schema)) {
final CSVRecordReader reader = createReader(bais, schema, format)) {
final Record record = reader.nextRecord();
assertNotNull(record);
@ -210,7 +211,7 @@ public class TestCSVRecordReader {
// test nextRawRecord does contain 'continent' field
try (final InputStream bais = new ByteArrayInputStream(inputData);
final CSVRecordReader reader = createReader(bais, schema)) {
final CSVRecordReader reader = createReader(bais, schema, format)) {
final Record record = reader.nextRecord(false, false);
assertNotNull(record);
@ -241,7 +242,7 @@ public class TestCSVRecordReader {
final byte[] inputData = csvData.getBytes();
try (final InputStream bais = new ByteArrayInputStream(inputData);
final CSVRecordReader reader = createReader(bais, schema)) {
final CSVRecordReader reader = createReader(bais, schema, format)) {
final Record record = reader.nextRecord();
assertNotNull(record);
@ -304,7 +305,7 @@ public class TestCSVRecordReader {
// test nextRecord does not contain a 'continent' field
try (final InputStream bais = new ByteArrayInputStream(inputData);
final CSVRecordReader reader = createReader(bais, schema)) {
final CSVRecordReader reader = createReader(bais, schema, format)) {
final Record record = reader.nextRecord(false, false);
assertNotNull(record);
@ -322,4 +323,30 @@ public class TestCSVRecordReader {
assertNull(reader.nextRecord(false, false));
}
}
@Test
public void testMultipleRecordsEscapedWithSpecialChar() throws IOException, MalformedRecordException {
char delimiter = StringEscapeUtils.unescapeJava("\u0001").charAt(0);
final CSVFormat format = CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().withQuote('"').withDelimiter(delimiter);
final List<RecordField> fields = getDefaultFields();
fields.replaceAll(f -> f.getFieldName().equals("balance") ? new RecordField("balance", doubleDataType) : f);
final RecordSchema schema = new SimpleRecordSchema(fields);
try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account_escapedchar.csv"));
final CSVRecordReader reader = createReader(fis, schema, format)) {
final Object[] firstRecord = reader.nextRecord().getValues();
final Object[] firstExpectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"};
Assert.assertArrayEquals(firstExpectedValues, firstRecord);
final Object[] secondRecord = reader.nextRecord().getValues();
final Object[] secondExpectedValues = new Object[] {"2", "Jane Doe", 4820.09D, "321 Your Street", "Your City", "NY", "33333", "USA"};
Assert.assertArrayEquals(secondExpectedValues, secondRecord);
assertNull(reader.nextRecord());
}
}
}

View File

@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.csv;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import org.junit.Test;
import org.mockito.Mockito;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
public class TestCSVValidators {
/*** SingleCharValidator **/
@Test
public void testSingleCharNullValue() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", null, mockContext);
assertEquals("Input is null for this property", result.getExplanation());
assertFalse(result.isValid());
}
@Test
public void testSingleCharTab() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", "\\t", mockContext);
assertTrue(result.isValid());
}
@Test
public void testSingleCharIllegalChar() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", "\\r", mockContext);
assertEquals("\\r is not a valid character for this property", result.getExplanation());
assertFalse(result.isValid());
}
@Test
public void testSingleCharGoodChar() {
CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator();
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("EscapeChar", "'", mockContext);
assertTrue(result.isValid());
}
/*** Unescaped SingleCharValidator **/
@Test
public void testUnEscapedSingleCharNullValue() {
Validator validator = CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR;
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("Delimiter", null, mockContext);
assertEquals("Input is null for this property", result.getExplanation());
assertFalse(result.isValid());
}
@Test
public void testUnescapedSingleCharUnicodeChar() {
Validator validator = CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR;
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("Delimiter", "\\u0001", mockContext);
assertTrue(result.isValid());
}
@Test
public void testUnescapedSingleCharGoodChar() {
Validator validator = CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR;
ValidationContext mockContext = Mockito.mock(ValidationContext.class);
ValidationResult result = validator.validate("Delimiter", ",", mockContext);
assertTrue(result.isValid());
}
}

View File

@ -0,0 +1,3 @@
idnamebalanceaddresscitystatezipCodecountry
1John Doe"4750.89""123 My Street"My CityMS11111USA
2Jane Doe4820.09321 Your StreetYour CityNY33333USA
Can't render this file because it contains an unexpected character in line 2 and column 12.