mirror of https://github.com/apache/nifi.git
NIFI-1518 InferAvroSchema now has an option to set CSV delimiter
Reviewed (and amended based on acknowledgement in PR review) by Tony Kurc (tkurc@apache.org). This closes #235.
This commit is contained in:
parent
d3367a7dc3
commit
784f2a2c20
|
@ -20,6 +20,7 @@ package org.apache.nifi.processors.kite;
|
|||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringEscapeUtils;
|
||||
import org.apache.nifi.annotation.behavior.InputRequirement;
|
||||
import org.apache.nifi.annotation.behavior.ReadsAttribute;
|
||||
import org.apache.nifi.annotation.behavior.ReadsAttributes;
|
||||
|
@ -28,6 +29,9 @@ import org.apache.nifi.annotation.behavior.WritesAttribute;
|
|||
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||
import org.apache.nifi.annotation.documentation.Tags;
|
||||
import org.apache.nifi.components.PropertyDescriptor;
|
||||
import org.apache.nifi.components.ValidationContext;
|
||||
import org.apache.nifi.components.ValidationResult;
|
||||
import org.apache.nifi.components.Validator;
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||
import org.apache.nifi.processor.ProcessContext;
|
||||
|
@ -42,7 +46,6 @@ import org.kitesdk.data.spi.JsonUtil;
|
|||
import org.kitesdk.data.spi.filesystem.CSVProperties;
|
||||
import org.kitesdk.data.spi.filesystem.CSVUtil;
|
||||
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
@ -81,7 +84,22 @@ import java.util.concurrent.atomic.AtomicReference;
|
|||
public class InferAvroSchema
|
||||
extends AbstractKiteProcessor {
|
||||
|
||||
// Comma, the conventional CSV field separator.
// NOTE(review): the DELIMITER property default repeats this literal instead of referencing the constant - confirm whether this constant is still used.
public static final String CSV_DELIMITER = ",";
|
||||
private static final Validator CHAR_VALIDATOR = new Validator() {
|
||||
@Override
|
||||
public ValidationResult validate(String subject, String input, ValidationContext context) {
|
||||
// Allows special, escaped characters as input, which is then unescaped and converted to a single character.
|
||||
// Examples for special characters: \t (or \u0009), \f.
|
||||
input = unescapeString(input);
|
||||
|
||||
return new ValidationResult.Builder()
|
||||
.subject(subject)
|
||||
.input(input)
|
||||
.explanation("Only non-null single characters are supported")
|
||||
.valid(input.length() == 1 && input.charAt(0) != 0)
|
||||
.build();
|
||||
}
|
||||
};
|
||||
|
||||
// NOTE(review): presumably the selectable values of a content-type property (defer to the
// flow file's mime.type attribute, or force JSON / CSV); the property itself is not visible here - confirm.
public static final String USE_MIME_TYPE = "use mime.type value";
|
||||
public static final String JSON_CONTENT = "json";
|
||||
public static final String CSV_CONTENT = "csv";
|
||||
|
@ -154,6 +172,13 @@ public class InferAvroSchema
|
|||
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
|
||||
.build();
|
||||
|
||||
public static final PropertyDescriptor DELIMITER = new PropertyDescriptor.Builder()
|
||||
.name("CSV delimiter")
|
||||
.description("Delimiter character for CSV records")
|
||||
.addValidator(CHAR_VALIDATOR)
|
||||
.defaultValue(",")
|
||||
.build();
|
||||
|
||||
public static final PropertyDescriptor ESCAPE_STRING = new PropertyDescriptor.Builder()
|
||||
.name("CSV Escape String")
|
||||
.description("This property only applies to CSV content type. String that represents an escape sequence" +
|
||||
|
@ -234,6 +259,7 @@ public class InferAvroSchema
|
|||
properties.add(CSV_HEADER_DEFINITION);
|
||||
properties.add(GET_CSV_HEADER_DEFINITION_FROM_INPUT);
|
||||
properties.add(HEADER_LINE_SKIP_COUNT);
|
||||
properties.add(DELIMITER);
|
||||
properties.add(ESCAPE_STRING);
|
||||
properties.add(QUOTE_STRING);
|
||||
properties.add(PRETTY_AVRO_OUTPUT);
|
||||
|
@ -366,7 +392,7 @@ public class InferAvroSchema
|
|||
|
||||
//Prepares the CSVProperties for kite
|
||||
final CSVProperties props = new CSVProperties.Builder()
|
||||
.delimiter(CSV_DELIMITER)
|
||||
.delimiter(context.getProperty(DELIMITER).getValue())
|
||||
.escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions().getValue())
|
||||
.quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions().getValue())
|
||||
.header(header.get())
|
||||
|
@ -457,4 +483,11 @@ public class InferAvroSchema
|
|||
|
||||
return avroSchema;
|
||||
}
|
||||
|
||||
private static String unescapeString(String input) {
|
||||
if (input.length() > 1) {
|
||||
input = StringEscapeUtils.unescapeJava(input);
|
||||
}
|
||||
return input;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -173,4 +173,47 @@ public class TestInferAvroSchema {
|
|||
runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
|
||||
runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFile() throws Exception {
|
||||
|
||||
runner.setProperty(InferAvroSchema.DELIMITER, "\\t");
|
||||
runner.assertValid();
|
||||
|
||||
Map<String, String> attributes = new HashMap<>();
|
||||
attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
|
||||
runner.enqueue(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath(), attributes);
|
||||
|
||||
runner.run();
|
||||
runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
|
||||
runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
|
||||
runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
|
||||
runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);
|
||||
|
||||
MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
|
||||
flowFile.assertContentEquals(new File("src/test/resources/Shapes_header.csv.avro").toPath());
|
||||
flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFileNegativeTest() throws Exception {
|
||||
|
||||
// Inproper InferAvroSchema.DELIMITER > original goes to InferAvroSchema.REL_FAILURE
|
||||
runner.setProperty(InferAvroSchema.DELIMITER, ";");
|
||||
runner.assertValid();
|
||||
|
||||
Map<String, String> attributes = new HashMap<>();
|
||||
attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
|
||||
runner.enqueue(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath(), attributes);
|
||||
|
||||
runner.run();
|
||||
runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
|
||||
runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 1);
|
||||
runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
|
||||
runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);
|
||||
|
||||
MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_FAILURE).get(0);
|
||||
flowFile.assertContentEquals(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath());
|
||||
flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "text/csv");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,352 @@
|
|||
shape color width height
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
square red 100 100
|
||||
sphere red 100 100
|
||||
triangle red 100 100
|
||||
rectangle red 100 100
|
||||
circle red 100 100
|
||||
sphere red 100 100
|
||||
circle red 100 100
|
||||
circle red 100 100
|
||||
triangle red 100 100
|
||||
cone red 100 100
|
||||
circle red 100 100
|
||||
rectangle red 100 100
|
|
Loading…
Reference in New Issue