From 3b1b326b1316d3ec2cc186ae6a7c77228ab1ed22 Mon Sep 17 00:00:00 2001 From: Arun Manivannan Date: Mon, 25 Sep 2017 22:24:38 +0800 Subject: [PATCH] NIFI-4416 CSVRecordReader does not accept escaped character as delimiter Signed-off-by: Matthew Burgess This closes #2172 --- .../pom.xml | 1 + .../java/org/apache/nifi/csv/CSVUtils.java | 15 ++- .../org/apache/nifi/csv/CSVValidators.java | 109 ++++++++++++++++++ .../nifi/csv/SingleCharacterValidator.java | 62 ---------- .../apache/nifi/csv/TestCSVRecordReader.java | 45 ++++++-- .../apache/nifi/csv/TestCSVValidators.java | 98 ++++++++++++++++ .../csv/multi-bank-account_escapedchar.csv | 3 + 7 files changed, 257 insertions(+), 76 deletions(-) create mode 100644 nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVValidators.java delete mode 100644 nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/SingleCharacterValidator.java create mode 100644 nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVValidators.java create mode 100644 nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/csv/multi-bank-account_escapedchar.csv diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml index bd9675356c..bdcffaef90 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/pom.xml @@ -99,6 +99,7 @@ src/test/resources/csv/extra-white-space.csv src/test/resources/csv/multi-bank-account.csv src/test/resources/csv/single-bank-account.csv + src/test/resources/csv/multi-bank-account_escapedchar.csv src/test/resources/grok/error-with-stack-trace.log src/test/resources/grok/nifi-log-sample-multiline-with-stacktrace.log src/test/resources/grok/nifi-log-sample.log diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVUtils.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVUtils.java index 3f5dcf6a5b..17152aa5fc 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVUtils.java +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVUtils.java @@ -19,6 +19,7 @@ package org.apache.nifi.csv; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.QuoteMode; +import org.apache.commons.lang3.StringEscapeUtils; import org.apache.nifi.components.AllowableValue; import org.apache.nifi.components.PropertyDescriptor; import org.apache.nifi.components.PropertyValue; @@ -48,7 +49,7 @@ public class CSVUtils { static final PropertyDescriptor VALUE_SEPARATOR = new PropertyDescriptor.Builder() .name("Value Separator") .description("The character that is used to separate values/fields in a CSV Record") - .addValidator(new SingleCharacterValidator()) + .addValidator(CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR) .expressionLanguageSupported(false) .defaultValue(",") .required(true) @@ -56,7 +57,7 @@ public class CSVUtils { static final PropertyDescriptor QUOTE_CHAR = new PropertyDescriptor.Builder() .name("Quote Character") .description("The character that is used to quote values so that escape characters do not have to be used") - .addValidator(new SingleCharacterValidator()) + .addValidator(new CSVValidators.SingleCharacterValidator()) .expressionLanguageSupported(false) .defaultValue("\"") .required(true) @@ -89,14 +90,14 @@ public class CSVUtils { static final PropertyDescriptor COMMENT_MARKER = new PropertyDescriptor.Builder() .name("Comment Marker") .description("The character that is used to denote the start of a comment. Any line that begins with this comment will be ignored.") - .addValidator(new SingleCharacterValidator()) + .addValidator(new CSVValidators.SingleCharacterValidator()) .expressionLanguageSupported(false) .required(false) .build(); static final PropertyDescriptor ESCAPE_CHAR = new PropertyDescriptor.Builder() .name("Escape Character") .description("The character that is used to escape characters that would otherwise have a specific meaning to the CSV Parser.") - .addValidator(new SingleCharacterValidator()) + .addValidator(new CSVValidators.SingleCharacterValidator()) .expressionLanguageSupported(false) .defaultValue("\\") .required(true) @@ -179,12 +180,16 @@ public class CSVUtils { } } + private static char getUnescapedChar(final ConfigurationContext context, final PropertyDescriptor property) { + return StringEscapeUtils.unescapeJava(context.getProperty(property).getValue()).charAt(0); + } + private static char getChar(final ConfigurationContext context, final PropertyDescriptor property) { return CSVUtils.unescape(context.getProperty(property).getValue()).charAt(0); } private static CSVFormat buildCustomFormat(final ConfigurationContext context) { - final char valueSeparator = getChar(context, VALUE_SEPARATOR); + final char valueSeparator = getUnescapedChar(context, VALUE_SEPARATOR); CSVFormat format = CSVFormat.newFormat(valueSeparator) .withAllowMissingColumnNames() .withIgnoreEmptyLines(); diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVValidators.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVValidators.java new file mode 100644 index 0000000000..5979407c9f --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVValidators.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nifi.csv; + +import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.nifi.components.ValidationContext; +import org.apache.nifi.components.ValidationResult; +import org.apache.nifi.components.Validator; + +import java.util.HashSet; +import java.util.Set; + +public class CSVValidators { + + public static class SingleCharacterValidator implements Validator { + private static final Set illegalChars = new HashSet<>(); + + static { + illegalChars.add("\r"); + illegalChars.add("\n"); + } + + @Override + public ValidationResult validate(final String subject, final String input, final ValidationContext context) { + + if (input == null) { + return new ValidationResult.Builder() + .input(input) + .subject(subject) + .valid(false) + .explanation("Input is null for this property") + .build(); + } + + final String unescaped = CSVUtils.unescape(input); + if (unescaped.length() != 1) { + return new ValidationResult.Builder() + .input(input) + .subject(subject) + .valid(false) + .explanation("Value must be exactly 1 character but was " + input.length() + " in length") + .build(); + } + + if (illegalChars.contains(unescaped)) { + return new ValidationResult.Builder() + .input(input) + .subject(subject) + .valid(false) + .explanation(input + " is not a valid character for this property") + .build(); + } + + return new ValidationResult.Builder() + .input(input) + .subject(subject) + .valid(true) + .build(); + } + + } + + public static final Validator UNESCAPED_SINGLE_CHAR_VALIDATOR = new Validator() { + @Override + public ValidationResult validate(final String subject, final String input, final ValidationContext context) { + + if (input == null) { + return new ValidationResult.Builder() + .input(input) + .subject(subject) + .valid(false) + .explanation("Input is null for this property") + .build(); + } + + String unescapeString = unescapeString(input); + + return new ValidationResult.Builder() + .subject(subject) + .input(unescapeString) + .explanation("Only non-null single characters are supported") + .valid((unescapeString.length() == 1 && unescapeString.charAt(0) != 0) || context.isExpressionLanguagePresent(input)) + .build(); + } + + private String unescapeString(String input) { + if (input != null && input.length() > 1) { + input = StringEscapeUtils.unescapeJava(input); + } + return input; + } + }; + +} \ No newline at end of file diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/SingleCharacterValidator.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/SingleCharacterValidator.java deleted file mode 100644 index b24dea9dcb..0000000000 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/SingleCharacterValidator.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nifi.csv; - -import java.util.HashSet; -import java.util.Set; - -import org.apache.nifi.components.ValidationContext; -import org.apache.nifi.components.ValidationResult; -import org.apache.nifi.components.Validator; - -public class SingleCharacterValidator implements Validator { - private static final Set illegalChars = new HashSet<>(); - static { - illegalChars.add("\r"); - illegalChars.add("\n"); - } - - @Override - public ValidationResult validate(final String subject, final String input, final ValidationContext context) { - final String unescaped = CSVUtils.unescape(input); - if (unescaped.length() != 1) { - return new ValidationResult.Builder() - .input(input) - .subject(subject) - .valid(false) - .explanation("Value must be exactly 1 character but was " + input.length() + " in length") - .build(); - } - - if (illegalChars.contains(input)) { - return new ValidationResult.Builder() - .input(input) - .subject(subject) - .valid(false) - .explanation(input + " is not a valid character for this property") - .build(); - } - - return new ValidationResult.Builder() - .input(input) - .subject(subject) - .valid(true) - .build(); - } - -} diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVRecordReader.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVRecordReader.java index 3533a43406..576132fb9f 100644 --- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVRecordReader.java +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVRecordReader.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.TimeZone; import org.apache.commons.csv.CSVFormat; +import org.apache.commons.lang3.StringEscapeUtils; import org.apache.nifi.logging.ComponentLog; import org.apache.nifi.serialization.MalformedRecordException; import org.apache.nifi.serialization.SimpleRecordSchema; @@ -57,7 +58,7 @@ public class TestCSVRecordReader { return fields; } - private CSVRecordReader createReader(final InputStream in, final RecordSchema schema) throws IOException { + private CSVRecordReader createReader(final InputStream in, final RecordSchema schema, CSVFormat format) throws IOException { return new CSVRecordReader(in, Mockito.mock(ComponentLog.class), schema, format, true, false, RecordFieldType.DATE.getDefaultFormat(), RecordFieldType.TIME.getDefaultFormat(), RecordFieldType.TIMESTAMP.getDefaultFormat()); } @@ -93,7 +94,7 @@ public class TestCSVRecordReader { final RecordSchema schema = new SimpleRecordSchema(fields); try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/single-bank-account.csv")); - final CSVRecordReader reader = createReader(fis, schema)) { + final CSVRecordReader reader = createReader(fis, schema, format)) { final Object[] record = reader.nextRecord().getValues(); final Object[] expectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"}; @@ -111,7 +112,7 @@ public class TestCSVRecordReader { final RecordSchema schema = new SimpleRecordSchema(fields); try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account.csv")); - final CSVRecordReader reader = createReader(fis, schema)) { + final CSVRecordReader reader = createReader(fis, schema, format)) { final Object[] firstRecord = reader.nextRecord().getValues(); final Object[] firstExpectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"}; @@ -133,7 +134,7 @@ public class TestCSVRecordReader { final RecordSchema schema = new SimpleRecordSchema(fields); try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/extra-white-space.csv")); - final CSVRecordReader reader = createReader(fis, schema)) { + final CSVRecordReader reader = createReader(fis, schema, format)) { final Object[] firstRecord = reader.nextRecord().getValues(); final Object[] firstExpectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"}; @@ -160,7 +161,7 @@ public class TestCSVRecordReader { final byte[] inputData = csvData.getBytes(); try (final InputStream bais = new ByteArrayInputStream(inputData); - final CSVRecordReader reader = createReader(bais, schema)) { + final CSVRecordReader reader = createReader(bais, schema, format)) { final Record record = reader.nextRecord(); assertNotNull(record); @@ -190,7 +191,7 @@ public class TestCSVRecordReader { // test nextRecord does not contain a 'continent' field try (final InputStream bais = new ByteArrayInputStream(inputData); - final CSVRecordReader reader = createReader(bais, schema)) { + final CSVRecordReader reader = createReader(bais, schema, format)) { final Record record = reader.nextRecord(); assertNotNull(record); @@ -210,7 +211,7 @@ public class TestCSVRecordReader { // test nextRawRecord does contain 'continent' field try (final InputStream bais = new ByteArrayInputStream(inputData); - final CSVRecordReader reader = createReader(bais, schema)) { + final CSVRecordReader reader = createReader(bais, schema, format)) { final Record record = reader.nextRecord(false, false); assertNotNull(record); @@ -241,7 +242,7 @@ public class TestCSVRecordReader { final byte[] inputData = csvData.getBytes(); try (final InputStream bais = new ByteArrayInputStream(inputData); - final CSVRecordReader reader = createReader(bais, schema)) { + final CSVRecordReader reader = createReader(bais, schema, format)) { final Record record = reader.nextRecord(); assertNotNull(record); @@ -304,7 +305,7 @@ public class TestCSVRecordReader { // test nextRecord does not contain a 'continent' field try (final InputStream bais = new ByteArrayInputStream(inputData); - final CSVRecordReader reader = createReader(bais, schema)) { + final CSVRecordReader reader = createReader(bais, schema, format)) { final Record record = reader.nextRecord(false, false); assertNotNull(record); @@ -322,4 +323,30 @@ public class TestCSVRecordReader { assertNull(reader.nextRecord(false, false)); } } + + @Test + public void testMultipleRecordsEscapedWithSpecialChar() throws IOException, MalformedRecordException { + + char delimiter = StringEscapeUtils.unescapeJava("\u0001").charAt(0); + + final CSVFormat format = CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().withQuote('"').withDelimiter(delimiter); + final List fields = getDefaultFields(); + fields.replaceAll(f -> f.getFieldName().equals("balance") ? new RecordField("balance", doubleDataType) : f); + + final RecordSchema schema = new SimpleRecordSchema(fields); + + try (final InputStream fis = new FileInputStream(new File("src/test/resources/csv/multi-bank-account_escapedchar.csv")); + final CSVRecordReader reader = createReader(fis, schema, format)) { + + final Object[] firstRecord = reader.nextRecord().getValues(); + final Object[] firstExpectedValues = new Object[] {"1", "John Doe", 4750.89D, "123 My Street", "My City", "MS", "11111", "USA"}; + Assert.assertArrayEquals(firstExpectedValues, firstRecord); + + final Object[] secondRecord = reader.nextRecord().getValues(); + final Object[] secondExpectedValues = new Object[] {"2", "Jane Doe", 4820.09D, "321 Your Street", "Your City", "NY", "33333", "USA"}; + Assert.assertArrayEquals(secondExpectedValues, secondRecord); + + assertNull(reader.nextRecord()); + } + } } diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVValidators.java b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVValidators.java new file mode 100644 index 0000000000..5c7c9e278a --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/java/org/apache/nifi/csv/TestCSVValidators.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nifi.csv; + +import org.apache.nifi.components.ValidationContext; +import org.apache.nifi.components.ValidationResult; +import org.apache.nifi.components.Validator; +import org.junit.Test; +import org.mockito.Mockito; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + + +public class TestCSVValidators { + + /*** SingleCharValidator **/ + @Test + public void testSingleCharNullValue() { + + CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator(); + ValidationContext mockContext = Mockito.mock(ValidationContext.class); + ValidationResult result = validator.validate("EscapeChar", null, mockContext); + assertEquals("Input is null for this property", result.getExplanation()); + assertFalse(result.isValid()); + } + + @Test + public void testSingleCharTab() { + CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator(); + ValidationContext mockContext = Mockito.mock(ValidationContext.class); + ValidationResult result = validator.validate("EscapeChar", "\\t", mockContext); + assertTrue(result.isValid()); + } + + @Test + public void testSingleCharIllegalChar() { + CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator(); + ValidationContext mockContext = Mockito.mock(ValidationContext.class); + ValidationResult result = validator.validate("EscapeChar", "\\r", mockContext); + assertEquals("\\r is not a valid character for this property", result.getExplanation()); + assertFalse(result.isValid()); + } + + @Test + public void testSingleCharGoodChar() { + CSVValidators.SingleCharacterValidator validator = new CSVValidators.SingleCharacterValidator(); + ValidationContext mockContext = Mockito.mock(ValidationContext.class); + ValidationResult result = validator.validate("EscapeChar", "'", mockContext); + assertTrue(result.isValid()); + } + + + /*** Unescaped SingleCharValidator **/ + + @Test + public void testUnEscapedSingleCharNullValue() { + Validator validator = CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR; + ValidationContext mockContext = Mockito.mock(ValidationContext.class); + ValidationResult result = validator.validate("Delimiter", null, mockContext); + assertEquals("Input is null for this property", result.getExplanation()); + assertFalse(result.isValid()); + + } + + @Test + public void testUnescapedSingleCharUnicodeChar() { + Validator validator = CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR; + ValidationContext mockContext = Mockito.mock(ValidationContext.class); + ValidationResult result = validator.validate("Delimiter", "\\u0001", mockContext); + assertTrue(result.isValid()); + } + + @Test + public void testUnescapedSingleCharGoodChar() { + Validator validator = CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR; + ValidationContext mockContext = Mockito.mock(ValidationContext.class); + ValidationResult result = validator.validate("Delimiter", ",", mockContext); + assertTrue(result.isValid()); + } + +} diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/csv/multi-bank-account_escapedchar.csv b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/csv/multi-bank-account_escapedchar.csv new file mode 100644 index 0000000000..bb18bc8eb1 --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/test/resources/csv/multi-bank-account_escapedchar.csv @@ -0,0 +1,3 @@ +idnamebalanceaddresscitystatezipCodecountry +1John Doe"4750.89""123 My Street"My CityMS11111USA +2Jane Doe4820.09321 Your StreetYour CityNY33333USA \ No newline at end of file