mirror of https://github.com/apache/nifi.git
NIFI-1156
This commit is contained in:
parent
8d37af07b9
commit
8966643d48
|
@ -0,0 +1,234 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.nifi.processors.kite;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
|
import org.apache.nifi.annotation.behavior.InputRequirement;
|
||||||
|
import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
|
||||||
|
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||||
|
import org.apache.nifi.annotation.documentation.SeeAlso;
|
||||||
|
import org.apache.nifi.annotation.documentation.Tags;
|
||||||
|
import org.apache.nifi.components.PropertyDescriptor;
|
||||||
|
import org.apache.nifi.flowfile.FlowFile;
|
||||||
|
import org.apache.nifi.processor.ProcessContext;
|
||||||
|
import org.apache.nifi.processor.ProcessSession;
|
||||||
|
import org.apache.nifi.processor.ProcessorInitializationContext;
|
||||||
|
import org.apache.nifi.processor.Relationship;
|
||||||
|
import org.apache.nifi.processor.exception.ProcessException;
|
||||||
|
import org.apache.nifi.processor.io.InputStreamCallback;
|
||||||
|
import org.apache.nifi.processor.io.OutputStreamCallback;
|
||||||
|
import org.apache.nifi.processor.util.StandardValidators;
|
||||||
|
import org.kitesdk.data.spi.filesystem.CSVProperties;
|
||||||
|
import org.kitesdk.data.spi.filesystem.CSVUtil;
|
||||||
|
import org.kitesdk.shaded.com.google.common.collect.ImmutableSet;
|
||||||
|
|
||||||
|
@Tags({"kite", "csv", "avro", "infer", "schema"})
|
||||||
|
@SeeAlso({InferAvroSchemaFromCSV.class})
|
||||||
|
@InputRequirement(Requirement.INPUT_REQUIRED)
|
||||||
|
@CapabilityDescription("Creates an Avro schema from a CSV file header. The header line definition can either be provided" +
|
||||||
|
"as a property to the processor OR present in the first line of CSV in the incoming FlowFile content. If a header" +
|
||||||
|
" property is specified for this processor no attempt will be made to use the header line that may be present" +
|
||||||
|
" in the incoming CSV FlowFile content.")
|
||||||
|
public class InferAvroSchemaFromCSV
|
||||||
|
extends AbstractKiteProcessor {
|
||||||
|
|
||||||
|
public static final String CSV_DELIMITER = ",";
|
||||||
|
|
||||||
|
public static final PropertyDescriptor HEADER_LINE = new PropertyDescriptor.Builder()
|
||||||
|
.name("CSV Header Line")
|
||||||
|
.description("Comma separated string defining the column names expected in the CSV data. " +
|
||||||
|
"EX: \"fname,lname,zip,address\"")
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor HEADER_LINE_SKIP_COUNT = new PropertyDescriptor.Builder()
|
||||||
|
.name("CSV Header Line Skip Count")
|
||||||
|
.description("Specifies the number of header lines that should be skipped when reading the CSV data. If the " +
|
||||||
|
" first line of the CSV data is a header line and you specify the \"CSV Header Line\" property " +
|
||||||
|
"you need to set this vlaue to 1 otherwise the header line will be treated as actual data.")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("0")
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor ESCAPE_STRING = new PropertyDescriptor.Builder()
|
||||||
|
.name("CSV escape string")
|
||||||
|
.description("String that represents an escape sequence in the CSV FlowFile content data.")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("\\")
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor QUOTE_STRING = new PropertyDescriptor.Builder()
|
||||||
|
.name("CSV quote string")
|
||||||
|
.description("String that represents a literal quote character in the CSV FlowFile content data.")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("'")
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor RECORD_NAME = new PropertyDescriptor.Builder()
|
||||||
|
.name("Avro Record Name")
|
||||||
|
.description("Value to be placed in the Avro record schema \"name\" field.")
|
||||||
|
.required(true)
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor CHARSET = new PropertyDescriptor.Builder()
|
||||||
|
.name("Charset")
|
||||||
|
.description("Character encoding of CSV data.")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("UTF-8")
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor PRETTY_AVRO_OUTPUT = new PropertyDescriptor.Builder()
|
||||||
|
.name("Pretty Avro Output")
|
||||||
|
.description("If true the Avro output will be formatted.")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("true")
|
||||||
|
.allowableValues("true", "false")
|
||||||
|
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
|
||||||
|
public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
|
||||||
|
.description("Successfully created Avro schema for CSV data.").build();
|
||||||
|
|
||||||
|
public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
|
||||||
|
.description("Original incoming FlowFile CSV data").build();
|
||||||
|
|
||||||
|
public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
|
||||||
|
.description("Failed to create Avro schema for CSV data.").build();
|
||||||
|
|
||||||
|
private List<PropertyDescriptor> properties;
|
||||||
|
private Set<Relationship> relationships;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void init(final ProcessorInitializationContext context) {
|
||||||
|
final List<PropertyDescriptor> properties = new ArrayList<>();
|
||||||
|
properties.add(HEADER_LINE);
|
||||||
|
properties.add(HEADER_LINE_SKIP_COUNT);
|
||||||
|
properties.add(ESCAPE_STRING);
|
||||||
|
properties.add(QUOTE_STRING);
|
||||||
|
properties.add(PRETTY_AVRO_OUTPUT);
|
||||||
|
properties.add(RECORD_NAME);
|
||||||
|
properties.add(CHARSET);
|
||||||
|
this.properties = Collections.unmodifiableList(properties);
|
||||||
|
|
||||||
|
final Set<Relationship> relationships = new HashSet<>();
|
||||||
|
relationships.add(REL_SUCCESS);
|
||||||
|
relationships.add(REL_FAILURE);
|
||||||
|
relationships.add(REL_ORIGINAL);
|
||||||
|
this.relationships = Collections.unmodifiableSet(relationships);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Set<Relationship> getRelationships() {
|
||||||
|
return relationships;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onTrigger(final ProcessContext context, ProcessSession session) throws ProcessException {
|
||||||
|
final FlowFile original = session.get();
|
||||||
|
if (original == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
//Determines the header line either from the property input or the first line of the delimited file.
|
||||||
|
final AtomicReference<String> header = new AtomicReference<>();
|
||||||
|
final AtomicReference<Boolean> hasHeader = new AtomicReference<>();
|
||||||
|
|
||||||
|
if (context.getProperty(HEADER_LINE).isSet()) {
|
||||||
|
header.set(context.getProperty(HEADER_LINE).getValue());
|
||||||
|
hasHeader.set(Boolean.FALSE);
|
||||||
|
} else {
|
||||||
|
//Read the first line of the file to get the header value.
|
||||||
|
session.read(original, new InputStreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(InputStream in) throws IOException {
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||||
|
header.set(br.readLine());
|
||||||
|
hasHeader.set(Boolean.TRUE);
|
||||||
|
br.close();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
//Prepares the CSVProperties for kite
|
||||||
|
final CSVProperties props = new CSVProperties.Builder()
|
||||||
|
.delimiter(CSV_DELIMITER)
|
||||||
|
.escape(context.getProperty(ESCAPE_STRING).getValue())
|
||||||
|
.quote(context.getProperty(QUOTE_STRING).getValue())
|
||||||
|
.header(header.get())
|
||||||
|
.hasHeader(hasHeader.get())
|
||||||
|
.linesToSkip(context.getProperty(HEADER_LINE_SKIP_COUNT).asInteger())
|
||||||
|
.charset(context.getProperty(CHARSET).getValue())
|
||||||
|
.build();
|
||||||
|
|
||||||
|
final Set<String> required = ImmutableSet.of();
|
||||||
|
final AtomicReference<String> avroSchema = new AtomicReference<>();
|
||||||
|
|
||||||
|
session.read(original, new InputStreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(InputStream in) throws IOException {
|
||||||
|
avroSchema.set(CSVUtil
|
||||||
|
.inferNullableSchema(
|
||||||
|
context.getProperty(RECORD_NAME).getValue(), in, props, required)
|
||||||
|
.toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
FlowFile avroSchemaFF = session.write(session.create(), new OutputStreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(OutputStream out) throws IOException {
|
||||||
|
out.write(avroSchema.get().getBytes());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
//Transfer the sessions.
|
||||||
|
session.transfer(original, REL_ORIGINAL);
|
||||||
|
session.transfer(avroSchemaFF, REL_SUCCESS);
|
||||||
|
|
||||||
|
} catch (Exception ex) {
|
||||||
|
getLogger().error(ex.getMessage());
|
||||||
|
session.transfer(original, REL_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,164 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.nifi.processors.kite;
|
||||||
|
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.nifi.annotation.behavior.InputRequirement;
|
||||||
|
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||||
|
import org.apache.nifi.annotation.documentation.SeeAlso;
|
||||||
|
import org.apache.nifi.annotation.documentation.Tags;
|
||||||
|
import org.apache.nifi.components.PropertyDescriptor;
|
||||||
|
import org.apache.nifi.flowfile.FlowFile;
|
||||||
|
import org.apache.nifi.processor.ProcessContext;
|
||||||
|
import org.apache.nifi.processor.ProcessSession;
|
||||||
|
import org.apache.nifi.processor.ProcessorInitializationContext;
|
||||||
|
import org.apache.nifi.processor.Relationship;
|
||||||
|
import org.apache.nifi.processor.exception.ProcessException;
|
||||||
|
import org.apache.nifi.processor.io.InputStreamCallback;
|
||||||
|
import org.apache.nifi.processor.io.OutputStreamCallback;
|
||||||
|
import org.apache.nifi.processor.util.StandardValidators;
|
||||||
|
import org.kitesdk.data.spi.JsonUtil;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
|
@Tags({"kite", "json", "avro", "infer", "schema"})
|
||||||
|
@SeeAlso({InferAvroSchemaFromJSON.class})
|
||||||
|
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
|
||||||
|
@CapabilityDescription("Creates an Avro schema from JSON data. The Avro schema is inferred by examining the fields " +
|
||||||
|
"in the JSON input. The Avro schema generated by kite will use the same names present in the incoming JSON payload")
|
||||||
|
public class InferAvroSchemaFromJSON
|
||||||
|
extends AbstractKiteProcessor {
|
||||||
|
|
||||||
|
public static final PropertyDescriptor RECORD_NAME = new PropertyDescriptor.Builder()
|
||||||
|
.name("Avro Record Name")
|
||||||
|
.description("Value to be placed in the Avro record schema \"name\" field.")
|
||||||
|
.required(true)
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor NUM_RECORDS_TO_ANALYZE = new PropertyDescriptor.Builder()
|
||||||
|
.name("Number of records to analyze")
|
||||||
|
.description("Number of records that should be analyzed by kite to infer the Avro schema")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("10")
|
||||||
|
.addValidator(StandardValidators.INTEGER_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor CHARSET = new PropertyDescriptor.Builder()
|
||||||
|
.name("Charset")
|
||||||
|
.description("Character encoding of CSV data.")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("UTF-8")
|
||||||
|
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
public static final PropertyDescriptor PRETTY_AVRO_OUTPUT = new PropertyDescriptor.Builder()
|
||||||
|
.name("Pretty Avro Output")
|
||||||
|
.description("If true the Avro output will be formatted.")
|
||||||
|
.required(true)
|
||||||
|
.defaultValue("true")
|
||||||
|
.allowableValues("true", "false")
|
||||||
|
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
|
||||||
|
public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
|
||||||
|
.description("Successfully created Avro schema for JSON data.").build();
|
||||||
|
|
||||||
|
public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
|
||||||
|
.description("Original incoming FlowFile JSON data").build();
|
||||||
|
|
||||||
|
public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
|
||||||
|
.description("Failed to create Avro schema for JSON data.").build();
|
||||||
|
|
||||||
|
private List<PropertyDescriptor> properties;
|
||||||
|
private Set<Relationship> relationships;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void init(final ProcessorInitializationContext context) {
|
||||||
|
final List<PropertyDescriptor> properties = new ArrayList<>();
|
||||||
|
properties.add(CHARSET);
|
||||||
|
properties.add(PRETTY_AVRO_OUTPUT);
|
||||||
|
properties.add(RECORD_NAME);
|
||||||
|
properties.add(NUM_RECORDS_TO_ANALYZE);
|
||||||
|
this.properties = Collections.unmodifiableList(properties);
|
||||||
|
|
||||||
|
final Set<Relationship> relationships = new HashSet<>();
|
||||||
|
relationships.add(REL_SUCCESS);
|
||||||
|
relationships.add(REL_FAILURE);
|
||||||
|
relationships.add(REL_ORIGINAL);
|
||||||
|
this.relationships = Collections.unmodifiableSet(relationships);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Set<Relationship> getRelationships() {
|
||||||
|
return relationships;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onTrigger(final ProcessContext context, ProcessSession session) throws ProcessException {
|
||||||
|
final FlowFile original = session.get();
|
||||||
|
if (original == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
final AtomicReference<String> avroSchema = new AtomicReference<>();
|
||||||
|
session.read(original, new InputStreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(InputStream in) throws IOException {
|
||||||
|
Schema as = JsonUtil.inferSchema(
|
||||||
|
in, context.getProperty(RECORD_NAME).getValue(), context.getProperty(NUM_RECORDS_TO_ANALYZE).asInteger());
|
||||||
|
avroSchema.set(as.toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
|
||||||
|
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
FlowFile avroSchemaFF = session.write(session.create(), new OutputStreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(OutputStream out) throws IOException {
|
||||||
|
out.write(avroSchema.get().getBytes());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
//Transfer the FlowFiles
|
||||||
|
session.transfer(original, REL_ORIGINAL);
|
||||||
|
session.transfer(avroSchemaFF, REL_SUCCESS);
|
||||||
|
|
||||||
|
} catch (Exception ex) {
|
||||||
|
getLogger().error(ex.getMessage());
|
||||||
|
session.transfer(original, REL_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,3 +16,5 @@ org.apache.nifi.processors.kite.StoreInKiteDataset
|
||||||
org.apache.nifi.processors.kite.ConvertCSVToAvro
|
org.apache.nifi.processors.kite.ConvertCSVToAvro
|
||||||
org.apache.nifi.processors.kite.ConvertJSONToAvro
|
org.apache.nifi.processors.kite.ConvertJSONToAvro
|
||||||
org.apache.nifi.processors.kite.ConvertAvroSchema
|
org.apache.nifi.processors.kite.ConvertAvroSchema
|
||||||
|
org.apache.nifi.processors.kite.InferAvroSchemaFromCSV
|
||||||
|
org.apache.nifi.processors.kite.InferAvroSchemaFromJSON
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.nifi.processors.kite;
|
||||||
|
;
|
||||||
|
import org.apache.nifi.flowfile.FlowFile;
|
||||||
|
import org.apache.nifi.processor.ProcessSession;
|
||||||
|
import org.apache.nifi.processor.io.OutputStreamCallback;
|
||||||
|
import org.apache.nifi.processor.io.StreamCallback;
|
||||||
|
import org.apache.nifi.util.TestRunner;
|
||||||
|
import org.apache.nifi.util.TestRunners;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
|
||||||
|
public class TestInferAvroSchemaFromCSV {
|
||||||
|
|
||||||
|
private final String CSV_HEADER_LINE = "fname,lname,age,zip";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void inferSchemaFromHeaderLineOfCSV() throws Exception {
|
||||||
|
TestRunner runner = TestRunners.newTestRunner(InferAvroSchemaFromCSV.class);
|
||||||
|
|
||||||
|
runner.assertNotValid();
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.HEADER_LINE_SKIP_COUNT, "0");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.ESCAPE_STRING, "\\");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.QUOTE_STRING, "'");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.RECORD_NAME, "contact");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.CHARSET, "UTF-8");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.PRETTY_AVRO_OUTPUT, "true");
|
||||||
|
|
||||||
|
runner.assertValid();
|
||||||
|
|
||||||
|
ProcessSession session = runner.getProcessSessionFactory().createSession();
|
||||||
|
FlowFile ff = session.write(session.create(), new OutputStreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(OutputStream out) throws IOException {
|
||||||
|
out.write((CSV_HEADER_LINE + "\nJeremy,Dyer,29,55555").getBytes());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
//Enqueue the empty FlowFile
|
||||||
|
runner.enqueue(ff);
|
||||||
|
runner.run();
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_FAILURE, 0);
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_ORIGINAL, 1);
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_SUCCESS, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void inferSchemaFormHeaderLinePropertyOfProcessor() throws Exception {
|
||||||
|
TestRunner runner = TestRunners.newTestRunner(InferAvroSchemaFromCSV.class);
|
||||||
|
|
||||||
|
runner.assertNotValid();
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.HEADER_LINE, CSV_HEADER_LINE);
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.HEADER_LINE_SKIP_COUNT, "1");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.ESCAPE_STRING, "\\");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.QUOTE_STRING, "'");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.RECORD_NAME, "contact");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.CHARSET, "UTF-8");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.PRETTY_AVRO_OUTPUT, "true");
|
||||||
|
|
||||||
|
runner.assertValid();
|
||||||
|
|
||||||
|
ProcessSession session = runner.getProcessSessionFactory().createSession();
|
||||||
|
FlowFile ff = session.write(session.create(), new StreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(InputStream in, OutputStream out) throws IOException {
|
||||||
|
out.write((CSV_HEADER_LINE + "\nJeremy,Dyer,29,55555").getBytes());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
//Enqueue the empty FlowFile
|
||||||
|
runner.enqueue(ff);
|
||||||
|
runner.run();
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_FAILURE, 0);
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_ORIGINAL, 1);
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_SUCCESS, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void inferSchemaFromEmptyContent() throws Exception {
|
||||||
|
TestRunner runner = TestRunners.newTestRunner(InferAvroSchemaFromCSV.class);
|
||||||
|
|
||||||
|
runner.assertNotValid();
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.HEADER_LINE, CSV_HEADER_LINE);
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.HEADER_LINE_SKIP_COUNT, "1");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.ESCAPE_STRING, "\\");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.QUOTE_STRING, "'");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.RECORD_NAME, "contact");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.CHARSET, "UTF-8");
|
||||||
|
runner.setProperty(InferAvroSchemaFromCSV.PRETTY_AVRO_OUTPUT, "true");
|
||||||
|
|
||||||
|
runner.assertValid();
|
||||||
|
|
||||||
|
ProcessSession session = runner.getProcessSessionFactory().createSession();
|
||||||
|
FlowFile ff = session.write(session.create(), new StreamCallback() {
|
||||||
|
@Override
|
||||||
|
public void process(InputStream in, OutputStream out) throws IOException {
|
||||||
|
out.write("".getBytes());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
//Enqueue the empty FlowFile
|
||||||
|
runner.enqueue(ff);
|
||||||
|
runner.run();
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_FAILURE, 1);
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_ORIGINAL, 0);
|
||||||
|
runner.assertTransferCount(InferAvroSchemaFromCSV.REL_SUCCESS, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,215 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.nifi.processors.kite;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.avro.SchemaBuilder;
|
||||||
|
import org.apache.avro.file.DataFileStream;
|
||||||
|
import org.apache.avro.generic.GenericData;
|
||||||
|
import org.apache.avro.generic.GenericDatumReader;
|
||||||
|
import org.apache.nifi.util.MockFlowFile;
|
||||||
|
import org.apache.nifi.util.TestRunner;
|
||||||
|
import org.apache.nifi.util.TestRunners;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.apache.nifi.processors.kite.TestUtil.streamFor;
|
||||||
|
|
||||||
|
public class TestInferAvroSchemaFromJSON {
|
||||||
|
|
||||||
|
// NOTE(review): these fixtures (and every test in this class) exercise
// ConvertAvroSchema rather than an InferAvroSchemaFromJSON processor — this
// looks copied from TestConvertAvroSchema; confirm the intent.

// Input record shape for conversion tests: all fields carried as strings.
public static final Schema INPUT_SCHEMA = SchemaBuilder.record("InputTest")
        .fields().requiredString("id").requiredString("primaryColor")
        .optionalString("secondaryColor").optionalString("price")
        .endRecord();

// Expected output shape: "id" becomes long, "price" double, "secondaryColor" is dropped.
public static final Schema OUTPUT_SCHEMA = SchemaBuilder.record("Test")
        .fields().requiredLong("id").requiredString("color")
        .optionalDouble("price").endRecord();

// JSON mapping renaming input field "primaryColor" to output field "color".
public static final String MAPPING = "[{\"source\":\"primaryColor\", \"target\":\"color\"}]";

// Expected "errors" attribute value when the price "free" cannot be parsed as a double.
public static final String FAILURE_SUMMARY = "Cannot convert free to double";
|
||||||
|
|
||||||
|
@Test
public void testBasicConversion() throws IOException {
    // Exercises ConvertAvroSchema: 3 input records, 2 convertible, 1 with a bad price.
    TestRunner runner = TestRunners.newTestRunner(ConvertAvroSchema.class);
    runner.assertNotValid();
    runner.setProperty(ConvertAvroSchema.INPUT_SCHEMA,
            INPUT_SCHEMA.toString());
    runner.setProperty(ConvertAvroSchema.OUTPUT_SCHEMA,
            OUTPUT_SCHEMA.toString());
    // Dynamic property: map input field "primaryColor" onto output field "color".
    runner.setProperty("primaryColor", "color");
    runner.assertValid();

    // Two valid rows, and one invalid because "free" is not a double.
    GenericData.Record goodRecord1 = dataBasic("1", "blue", null, null);
    GenericData.Record goodRecord2 = dataBasic("2", "red", "yellow", "5.5");
    GenericData.Record badRecord = dataBasic("3", "red", "yellow", "free");
    List<GenericData.Record> input = Lists.newArrayList(goodRecord1, goodRecord2,
            badRecord);

    runner.enqueue(streamFor(input));
    runner.run();

    // Processor counters must reflect 2 conversions and 1 rejection.
    long converted = runner.getCounterValue("Converted records");
    long errors = runner.getCounterValue("Conversion errors");
    Assert.assertEquals("Should convert 2 rows", 2, converted);
    Assert.assertEquals("Should reject 1 rows", 1, errors);

    runner.assertTransferCount("success", 1);
    runner.assertTransferCount("failure", 1);

    // The failure FlowFile carries the bad record, still in the INPUT schema.
    MockFlowFile incompatible = runner.getFlowFilesForRelationship(
            "failure").get(0);
    GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>(
            INPUT_SCHEMA);
    DataFileStream<GenericData.Record> stream = new DataFileStream<GenericData.Record>(
            new ByteArrayInputStream(
                    runner.getContentAsByteArray(incompatible)), reader);
    int count = 0;
    for (GenericData.Record r : stream) {
        Assert.assertEquals(badRecord, r);
        count++;
    }
    stream.close();
    Assert.assertEquals(1, count);
    // The conversion error summary is recorded on the "errors" attribute.
    Assert.assertEquals("Should accumulate error messages",
            FAILURE_SUMMARY, incompatible.getAttribute("errors"));

    // The success FlowFile holds the two converted records in the OUTPUT schema,
    // in input order; convertBasic builds the expected converted form.
    GenericDatumReader<GenericData.Record> successReader = new GenericDatumReader<GenericData.Record>(
            OUTPUT_SCHEMA);
    DataFileStream<GenericData.Record> successStream = new DataFileStream<GenericData.Record>(
            new ByteArrayInputStream(runner.getContentAsByteArray(runner
                    .getFlowFilesForRelationship("success").get(0))),
            successReader);
    count = 0;
    for (GenericData.Record r : successStream) {
        if (count == 0) {
            Assert.assertEquals(convertBasic(goodRecord1), r);
        } else {
            Assert.assertEquals(convertBasic(goodRecord2), r);
        }
        count++;
    }
    successStream.close();
    Assert.assertEquals(2, count);
}
|
||||||
|
|
||||||
|
@Test
public void testNestedConversion() throws IOException {
    // Exercises ConvertAvroSchema flattening a nested record into a flat output schema.
    TestRunner runner = TestRunners.newTestRunner(ConvertAvroSchema.class);
    runner.assertNotValid();
    runner.setProperty(ConvertAvroSchema.INPUT_SCHEMA,
            TestAvroRecordConverter.NESTED_RECORD_SCHEMA.toString());
    runner.setProperty(ConvertAvroSchema.OUTPUT_SCHEMA,
            TestAvroRecordConverter.UNNESTED_OUTPUT_SCHEMA.toString());
    // Dynamic property: map the nested field "parent.id" onto flat output "parentId".
    runner.setProperty("parent.id", "parentId");
    runner.assertValid();

    // Two valid rows
    GenericData.Record goodRecord1 = dataNested(1L, "200", null, null);
    GenericData.Record goodRecord2 = dataNested(2L, "300", 5L, "ParentCompany");
    List<GenericData.Record> input = Lists.newArrayList(goodRecord1, goodRecord2);

    runner.enqueue(streamFor(input));
    runner.run();

    // Both rows should convert; nothing should be rejected.
    long converted = runner.getCounterValue("Converted records");
    long errors = runner.getCounterValue("Conversion errors");
    Assert.assertEquals("Should convert 2 rows", 2, converted);
    Assert.assertEquals("Should reject 0 rows", 0, errors);

    runner.assertTransferCount("success", 1);
    runner.assertTransferCount("failure", 0);

    // Read back the converted records in order and compare to the expected
    // flattened form produced by convertNested.
    GenericDatumReader<GenericData.Record> successReader = new GenericDatumReader<GenericData.Record>(
            TestAvroRecordConverter.UNNESTED_OUTPUT_SCHEMA);
    DataFileStream<GenericData.Record> successStream = new DataFileStream<GenericData.Record>(
            new ByteArrayInputStream(runner.getContentAsByteArray(runner
                    .getFlowFilesForRelationship("success").get(0))),
            successReader);
    int count = 0;
    for (GenericData.Record r : successStream) {
        if (count == 0) {
            Assert.assertEquals(convertNested(goodRecord1), r);
        } else {
            Assert.assertEquals(convertNested(goodRecord2), r);
        }
        count++;
    }
    successStream.close();
    Assert.assertEquals(2, count);
}
|
||||||
|
|
||||||
|
private GenericData.Record convertBasic(GenericData.Record inputRecord) {
|
||||||
|
GenericData.Record result = new GenericData.Record(OUTPUT_SCHEMA);
|
||||||
|
result.put("id", Long.parseLong(inputRecord.get("id").toString()));
|
||||||
|
result.put("color", inputRecord.get("primaryColor").toString());
|
||||||
|
if (inputRecord.get("price") == null) {
|
||||||
|
result.put("price", null);
|
||||||
|
} else {
|
||||||
|
result.put("price",
|
||||||
|
Double.parseDouble(inputRecord.get("price").toString()));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private GenericData.Record dataBasic(String id, String primaryColor,
|
||||||
|
String secondaryColor, String price) {
|
||||||
|
GenericData.Record result = new GenericData.Record(INPUT_SCHEMA);
|
||||||
|
result.put("id", id);
|
||||||
|
result.put("primaryColor", primaryColor);
|
||||||
|
result.put("secondaryColor", secondaryColor);
|
||||||
|
result.put("price", price);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private GenericData.Record convertNested(GenericData.Record inputRecord) {
|
||||||
|
GenericData.Record result = new GenericData.Record(
|
||||||
|
TestAvroRecordConverter.UNNESTED_OUTPUT_SCHEMA);
|
||||||
|
result.put("l1", inputRecord.get("l1"));
|
||||||
|
result.put("s1", Long.parseLong(inputRecord.get("s1").toString()));
|
||||||
|
if (inputRecord.get("parent") != null) {
|
||||||
|
// output schema doesn't have parent name.
|
||||||
|
result.put("parentId",
|
||||||
|
((GenericData.Record) inputRecord.get("parent")).get("id"));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private GenericData.Record dataNested(long id, String companyName, Long parentId,
|
||||||
|
String parentName) {
|
||||||
|
GenericData.Record result = new GenericData.Record(TestAvroRecordConverter.NESTED_RECORD_SCHEMA);
|
||||||
|
result.put("l1", id);
|
||||||
|
result.put("s1", companyName);
|
||||||
|
if (parentId != null || parentName != null) {
|
||||||
|
GenericData.Record parent = new GenericData.Record(
|
||||||
|
TestAvroRecordConverter.NESTED_PARENT_SCHEMA);
|
||||||
|
parent.put("id", parentId);
|
||||||
|
parent.put("name", parentName);
|
||||||
|
result.put("parent", parent);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue