From 784f2a2c2094ae30b7f07a80cf41b1b548929211 Mon Sep 17 00:00:00 2001 From: Michal Klempa Date: Thu, 18 Feb 2016 09:42:58 +0100 Subject: [PATCH] NIFI-1518 InferAvroSchema now has an option to set CSV delimiter Reviewed (and amended based on acknowledgement in PR review) by Tony Kurc (tkurc@apache.org). This closes #235. --- .../nifi/processors/kite/InferAvroSchema.java | 39 +- .../processors/kite/TestInferAvroSchema.java | 43 +++ .../resources/Shapes_Header_TabDelimited.csv | 352 ++++++++++++++++++ 3 files changed, 431 insertions(+), 3 deletions(-) create mode 100644 nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/resources/Shapes_Header_TabDelimited.csv diff --git a/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/main/java/org/apache/nifi/processors/kite/InferAvroSchema.java b/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/main/java/org/apache/nifi/processors/kite/InferAvroSchema.java index ad8b7e5c88..1923785293 100644 --- a/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/main/java/org/apache/nifi/processors/kite/InferAvroSchema.java +++ b/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/main/java/org/apache/nifi/processors/kite/InferAvroSchema.java @@ -20,6 +20,7 @@ package org.apache.nifi.processors.kite; import org.apache.avro.Schema; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringEscapeUtils; import org.apache.nifi.annotation.behavior.InputRequirement; import org.apache.nifi.annotation.behavior.ReadsAttribute; import org.apache.nifi.annotation.behavior.ReadsAttributes; @@ -28,6 +29,9 @@ import org.apache.nifi.annotation.behavior.WritesAttribute; import org.apache.nifi.annotation.documentation.CapabilityDescription; import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.components.ValidationContext; +import org.apache.nifi.components.ValidationResult; +import 
org.apache.nifi.components.Validator; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.flowfile.attributes.CoreAttributes; import org.apache.nifi.processor.ProcessContext; @@ -42,7 +46,6 @@ import org.kitesdk.data.spi.JsonUtil; import org.kitesdk.data.spi.filesystem.CSVProperties; import org.kitesdk.data.spi.filesystem.CSVUtil; - import java.io.InputStream; import java.io.IOException; import java.io.InputStreamReader; @@ -81,7 +84,22 @@ import java.util.concurrent.atomic.AtomicReference; public class InferAvroSchema extends AbstractKiteProcessor { - public static final String CSV_DELIMITER = ","; + private static final Validator CHAR_VALIDATOR = new Validator() { + @Override + public ValidationResult validate(String subject, String input, ValidationContext context) { + // Allows special, escaped characters as input, which is then unescaped and converted to a single character. + // Examples for special characters: \t (or \u0009), \f. + input = unescapeString(input); + + return new ValidationResult.Builder() + .subject(subject) + .input(input) + .explanation("Only non-null single characters are supported") + .valid(input.length() == 1 && input.charAt(0) != 0) + .build(); + } + }; + public static final String USE_MIME_TYPE = "use mime.type value"; public static final String JSON_CONTENT = "json"; public static final String CSV_CONTENT = "csv"; @@ -154,6 +172,13 @@ public class InferAvroSchema .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) .build(); + public static final PropertyDescriptor DELIMITER = new PropertyDescriptor.Builder() + .name("CSV delimiter") + .description("Delimiter character for CSV records") + .addValidator(CHAR_VALIDATOR) + .defaultValue(",") + .build(); + public static final PropertyDescriptor ESCAPE_STRING = new PropertyDescriptor.Builder() .name("CSV Escape String") .description("This property only applies to CSV content type. 
String that represents an escape sequence" + @@ -234,6 +259,7 @@ public class InferAvroSchema properties.add(CSV_HEADER_DEFINITION); properties.add(GET_CSV_HEADER_DEFINITION_FROM_INPUT); properties.add(HEADER_LINE_SKIP_COUNT); + properties.add(DELIMITER); properties.add(ESCAPE_STRING); properties.add(QUOTE_STRING); properties.add(PRETTY_AVRO_OUTPUT); @@ -366,7 +392,7 @@ public class InferAvroSchema //Prepares the CSVProperties for kite final CSVProperties props = new CSVProperties.Builder() - .delimiter(CSV_DELIMITER) + .delimiter(context.getProperty(DELIMITER).getValue()) .escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions().getValue()) .quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions().getValue()) .header(header.get()) @@ -457,4 +483,11 @@ public class InferAvroSchema return avroSchema; } + + private static String unescapeString(String input) { + if (input.length() > 1) { + input = StringEscapeUtils.unescapeJava(input); + } + return input; + } } diff --git a/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/java/org/apache/nifi/processors/kite/TestInferAvroSchema.java b/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/java/org/apache/nifi/processors/kite/TestInferAvroSchema.java index 08185f64ad..510eaf97ad 100644 --- a/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/java/org/apache/nifi/processors/kite/TestInferAvroSchema.java +++ b/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/java/org/apache/nifi/processors/kite/TestInferAvroSchema.java @@ -173,4 +173,47 @@ public class TestInferAvroSchema { runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0); runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0); } + + @Test + public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFile() throws Exception { + + runner.setProperty(InferAvroSchema.DELIMITER, "\\t"); + runner.assertValid(); + + Map attributes = new HashMap<>(); + 
attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv"); + runner.enqueue(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath(), attributes); + + runner.run(); + runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0); + runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0); + runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1); + runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1); + + MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0); + flowFile.assertContentEquals(new File("src/test/resources/Shapes_header.csv.avro").toPath()); + flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary"); + } + + @Test + public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFileNegativeTest() throws Exception { + + // Improper InferAvroSchema.DELIMITER > original goes to InferAvroSchema.REL_FAILURE + runner.setProperty(InferAvroSchema.DELIMITER, ";"); + runner.assertValid(); + + Map attributes = new HashMap<>(); + attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv"); + runner.enqueue(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath(), attributes); + + runner.run(); + runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0); + runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 1); + runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0); + runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0); + + MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_FAILURE).get(0); + flowFile.assertContentEquals(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath()); + flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "text/csv"); + } } diff --git a/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/resources/Shapes_Header_TabDelimited.csv 
b/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/resources/Shapes_Header_TabDelimited.csv new file mode 100644 index 0000000000..7ffa4f9bd2 --- /dev/null +++ b/nifi-nar-bundles/nifi-kite-bundle/nifi-kite-processors/src/test/resources/Shapes_Header_TabDelimited.csv @@ -0,0 +1,352 @@ +shape color width height +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 
+triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 
100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 
100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 +circle red 100 100 +square red 100 100 +sphere red 100 100 +triangle red 100 100 +rectangle red 100 100 +circle red 100 100 +sphere red 100 100 +circle red 100 100 +circle red 100 100 +triangle red 100 100 +cone red 100 100 +circle red 100 100 +rectangle red 100 100 \ No newline at end of file