NIFI-2465 - InferAvroSchema EL support based on incoming FlowFiles

This closes #863.

Signed-off-by: Bryan Bende <bbende@apache.org>
This commit is contained in:
Simon Elliston Ball 2016-08-15 17:54:50 +01:00 committed by Bryan Bende
parent 26f5c496d1
commit b0122c6a73
No known key found for this signature in database
GPG Key ID: A0DDA9ED50711C39
2 changed files with 91 additions and 22 deletions

View File

@ -95,7 +95,7 @@ public class InferAvroSchema
.subject(subject) .subject(subject)
.input(input) .input(input)
.explanation("Only non-null single characters are supported") .explanation("Only non-null single characters are supported")
.valid(input.length() == 1 && input.charAt(0) != 0) .valid(input.length() == 1 && input.charAt(0) != 0 || context.isExpressionLanguagePresent(input))
.build(); .build();
} }
}; };
@ -175,6 +175,7 @@ public class InferAvroSchema
public static final PropertyDescriptor DELIMITER = new PropertyDescriptor.Builder() public static final PropertyDescriptor DELIMITER = new PropertyDescriptor.Builder()
.name("CSV delimiter") .name("CSV delimiter")
.description("Delimiter character for CSV records") .description("Delimiter character for CSV records")
.expressionLanguageSupported(true)
.addValidator(CHAR_VALIDATOR) .addValidator(CHAR_VALIDATOR)
.defaultValue(",") .defaultValue(",")
.build(); .build();
@ -212,6 +213,7 @@ public class InferAvroSchema
.description("Character encoding of CSV data.") .description("Character encoding of CSV data.")
.required(true) .required(true)
.defaultValue("UTF-8") .defaultValue("UTF-8")
.expressionLanguageSupported(true)
.addValidator(StandardValidators.CHARACTER_SET_VALIDATOR) .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
.build(); .build();
@ -391,14 +393,14 @@ public class InferAvroSchema
} }
//Prepares the CSVProperties for kite //Prepares the CSVProperties for kite
final CSVProperties props = new CSVProperties.Builder() CSVProperties props = new CSVProperties.Builder()
.delimiter(context.getProperty(DELIMITER).getValue()) .charset(context.getProperty(CHARSET).evaluateAttributeExpressions(inputFlowFile).getValue())
.escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions().getValue()) .delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(inputFlowFile).getValue())
.quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions().getValue()) .quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
.escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
.linesToSkip(context.getProperty(HEADER_LINE_SKIP_COUNT).evaluateAttributeExpressions(inputFlowFile).asInteger())
.header(header.get()) .header(header.get())
.hasHeader(hasHeader.get()) .hasHeader(hasHeader.get())
.linesToSkip(context.getProperty(HEADER_LINE_SKIP_COUNT).evaluateAttributeExpressions().asInteger())
.charset(context.getProperty(CHARSET).getValue())
.build(); .build();
final AtomicReference<String> avroSchema = new AtomicReference<>(); final AtomicReference<String> avroSchema = new AtomicReference<>();
@ -408,7 +410,7 @@ public class InferAvroSchema
public void process(InputStream in) throws IOException { public void process(InputStream in) throws IOException {
avroSchema.set(CSVUtil avroSchema.set(CSVUtil
.inferSchema( .inferSchema(
context.getProperty(RECORD_NAME).evaluateAttributeExpressions().getValue(), in, props) context.getProperty(RECORD_NAME).evaluateAttributeExpressions(inputFlowFile).getValue(), in, props)
.toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean())); .toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
} }
}); });
@ -435,8 +437,8 @@ public class InferAvroSchema
@Override @Override
public void process(InputStream in) throws IOException { public void process(InputStream in) throws IOException {
Schema as = JsonUtil.inferSchema( Schema as = JsonUtil.inferSchema(
in, context.getProperty(RECORD_NAME).evaluateAttributeExpressions().getValue(), in, context.getProperty(RECORD_NAME).evaluateAttributeExpressions(inputFlowFile).getValue(),
context.getProperty(NUM_RECORDS_TO_ANALYZE).evaluateAttributeExpressions().asInteger()); context.getProperty(NUM_RECORDS_TO_ANALYZE).evaluateAttributeExpressions(inputFlowFile).asInteger());
avroSchema.set(as.toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean())); avroSchema.set(as.toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
} }

View File

@ -49,7 +49,7 @@ public class TestInferAvroSchema {
public void setup() { public void setup() {
runner = TestRunners.newTestRunner(InferAvroSchema.class); runner = TestRunners.newTestRunner(InferAvroSchema.class);
//Prepare the common setup. // Prepare the common setup.
runner.assertNotValid(); runner.assertNotValid();
runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE); runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);
@ -90,7 +90,7 @@ public class TestInferAvroSchema {
runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE); runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);
//Purposely set to True to test that none of the JSON file is read which would cause issues. // Purposely set to True to test that none of the JSON file is read which would cause issues.
runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true"); runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_ATTRIBUTE); runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_ATTRIBUTE);
@ -106,12 +106,10 @@ public class TestInferAvroSchema {
MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0); MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
String avroSchema = data.getAttribute(InferAvroSchema.AVRO_SCHEMA_ATTRIBUTE_NAME); String avroSchema = data.getAttribute(InferAvroSchema.AVRO_SCHEMA_ATTRIBUTE_NAME);
String knownSchema = new String(unix2PlatformSpecificLineEndings( String knownSchema = new String(unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes.json.avro")), StandardCharsets.UTF_8);
new File("src/test/resources/Shapes.json.avro")),
StandardCharsets.UTF_8);
Assert.assertEquals(avroSchema, knownSchema); Assert.assertEquals(avroSchema, knownSchema);
//Since that avro schema is written to an attribute this should be the same as the original // Since that avro schema is written to an attribute this should be the same as the original
data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/json"); data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/json");
} }
@ -120,7 +118,7 @@ public class TestInferAvroSchema {
runner.assertValid(); runner.assertValid();
//Read in the header // Read in the header
StringWriter writer = new StringWriter(); StringWriter writer = new StringWriter();
IOUtils.copy((Files.newInputStream(Paths.get("src/test/resources/ShapesHeader.csv"), StandardOpenOption.READ)), writer, "UTF-8"); IOUtils.copy((Files.newInputStream(Paths.get("src/test/resources/ShapesHeader.csv"), StandardOpenOption.READ)), writer, "UTF-8");
runner.setProperty(InferAvroSchema.CSV_HEADER_DEFINITION, writer.toString()); runner.setProperty(InferAvroSchema.CSV_HEADER_DEFINITION, writer.toString());
@ -223,13 +221,81 @@ public class TestInferAvroSchema {
flowFile.assertContentEquals(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath()); flowFile.assertContentEquals(new File("src/test/resources/Shapes_Header_TabDelimited.csv").toPath());
flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "text/csv"); flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "text/csv");
} }
static byte [] unix2PlatformSpecificLineEndings(final File file) throws IOException {
try ( final BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
final ByteArrayOutputStream out = new ByteArrayOutputStream()) { @Test
// Verifies that the CSV parsing properties (delimiter, escape, quote, charset) can be
// supplied via expression language and resolved from the incoming FlowFile's attributes.
public void specifyCSVparametersInExpressionLanguage() throws Exception {
    runner.setProperty(InferAvroSchema.DELIMITER, "${csv.delimiter}");
    runner.setProperty(InferAvroSchema.ESCAPE_STRING, "${csv.escape}");
    runner.setProperty(InferAvroSchema.QUOTE_STRING, "${csv.quote}");
    runner.setProperty(InferAvroSchema.CHARSET, "${csv.charset}");
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
    runner.assertValid();

    // Plain map population instead of the double-brace anonymous-subclass idiom, which
    // creates a needless inner class (holding a reference to the enclosing test instance)
    // and forced the @SuppressWarnings("serial") annotation.
    Map<String, String> attributes = new HashMap<>();
    attributes.put("csv.delimiter", ",");
    attributes.put("csv.escape", "\\");
    attributes.put("csv.quote", "\"");
    attributes.put("csv.charset", "UTF-8");
    attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");

    runner.enqueue(new File("src/test/resources/Shapes_Header.csv").toPath(), attributes);
    runner.run();

    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    flowFile.assertContentEquals(unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro")));
    flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
}
@Test
// Verifies that RECORD_NAME and NUM_RECORDS_TO_ANALYZE can be supplied via expression
// language and resolved from the incoming FlowFile's attributes for JSON input.
public void specifyJsonParametersInExpressionLanguage() throws Exception {
    runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);
    // Purposely set to True to test that none of the JSON file is read which would cause issues.
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
    runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_ATTRIBUTE);
    runner.setProperty(InferAvroSchema.RECORD_NAME, "${record.name}");
    runner.setProperty(InferAvroSchema.NUM_RECORDS_TO_ANALYZE, "${records.analyze}");
    // Validate only after all properties — including the EL-bearing ones under test — are
    // configured; the original called assertValid() first, before anything was set.
    runner.assertValid();

    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.MIME_TYPE.key(), "application/json");
    attributes.put("record.name", "myrecord");
    attributes.put("records.analyze", "2");
    runner.enqueue(new File("src/test/resources/Shapes.json").toPath(), attributes);
    runner.run();

    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    String avroSchema = data.getAttribute(InferAvroSchema.AVRO_SCHEMA_ATTRIBUTE_NAME);
    Assert.assertTrue(avroSchema.contains("\"name\" : \"myrecord\""));
    // Since the avro schema is written to an attribute, this should be the same as the original.
    data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/json");
}
static byte[] unix2PlatformSpecificLineEndings(final File file) throws IOException {
try (final BufferedInputStream in = new BufferedInputStream(new FileInputStream(file)); final ByteArrayOutputStream out = new ByteArrayOutputStream()) {
byte eol[] = System.lineSeparator().getBytes(StandardCharsets.UTF_8); byte eol[] = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
int justRead; int justRead;
while ((justRead = in.read()) != -1) { while ((justRead = in.read()) != -1) {
if (justRead == '\n'){ if (justRead == '\n') {
out.write(eol); out.write(eol);
} else { } else {
out.write(justRead); out.write(justRead);
@ -238,4 +304,5 @@ public class TestInferAvroSchema {
return out.toByteArray(); return out.toByteArray();
} }
} }
} }