mirror of https://github.com/apache/nifi.git
NIFI-912 Initial version of ExtractAvroMetadata processor.
- Adding optional ability to extract record count
- Renaming record.count to item.count for clarity, and updating documentation
- Adding a test case with 0 records
- Removing validators from properties that use allowable values
This commit is contained in:
parent
3d4ce34529
commit
1007999415
|
@ -39,6 +39,10 @@
|
|||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-codec</groupId>
|
||||
<artifactId>commons-codec</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.nifi</groupId>
|
||||
<artifactId>nifi-mock</artifactId>
|
||||
|
@ -64,6 +68,7 @@
|
|||
<configuration>
|
||||
<excludes combine.children="append">
|
||||
<exclude>src/test/resources/user.avsc</exclude>
|
||||
<exclude>src/test/resources/array.avsc</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
|
|
@ -0,0 +1,219 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.nifi.processors.avro;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.SchemaNormalization;
|
||||
import org.apache.avro.file.DataFileStream;
|
||||
import org.apache.avro.generic.GenericDatumReader;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.nifi.annotation.behavior.SideEffectFree;
|
||||
import org.apache.nifi.annotation.behavior.SupportsBatching;
|
||||
import org.apache.nifi.annotation.behavior.WritesAttribute;
|
||||
import org.apache.nifi.annotation.behavior.WritesAttributes;
|
||||
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||
import org.apache.nifi.annotation.documentation.Tags;
|
||||
import org.apache.nifi.components.AllowableValue;
|
||||
import org.apache.nifi.components.PropertyDescriptor;
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.processor.AbstractProcessor;
|
||||
import org.apache.nifi.processor.ProcessContext;
|
||||
import org.apache.nifi.processor.ProcessSession;
|
||||
import org.apache.nifi.processor.ProcessorInitializationContext;
|
||||
import org.apache.nifi.processor.Relationship;
|
||||
import org.apache.nifi.processor.exception.ProcessException;
|
||||
import org.apache.nifi.processor.io.InputStreamCallback;
|
||||
import org.apache.nifi.processor.util.StandardValidators;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
@SideEffectFree
|
||||
@SupportsBatching
|
||||
@Tags({ "avro", "schema", "metadata" })
|
||||
@CapabilityDescription("Extracts metadata from the header of an Avro datafile.")
|
||||
@WritesAttributes({
|
||||
@WritesAttribute(attribute = "schema.type", description = "The type of the schema (i.e. record, enum, etc.)."),
|
||||
@WritesAttribute(attribute = "schema.name", description = "Contains the name when the type is a record, enum or fixed, " +
|
||||
"otherwise contains the name of the primitive type."),
|
||||
@WritesAttribute(attribute = "schema.fingerprint", description = "The result of the Fingerprint Algorithm as a Hex string."),
|
||||
@WritesAttribute(attribute = "item.count", description = "The total number of items in the datafile, only written if Count Items " +
|
||||
"is set to true.")
|
||||
})
|
||||
public class ExtractAvroMetadata extends AbstractProcessor {
|
||||
|
||||
static final AllowableValue CRC_64_AVRO = new AllowableValue("CRC-64-AVRO");
|
||||
static final AllowableValue MD5 = new AllowableValue("MD5");
|
||||
static final AllowableValue SHA_256 = new AllowableValue("SHA-256");
|
||||
|
||||
static final PropertyDescriptor FINGERPRINT_ALGORITHM = new PropertyDescriptor.Builder()
|
||||
.name("Fingerprint Algorithm")
|
||||
.description("The algorithm used to generate the schema fingerprint. Available choices are based on the Avro recommended practices for " +
|
||||
"fingerprint generation.")
|
||||
.allowableValues(CRC_64_AVRO, MD5, SHA_256)
|
||||
.defaultValue(CRC_64_AVRO.getValue())
|
||||
.required(true)
|
||||
.build();
|
||||
|
||||
static final PropertyDescriptor METADATA_KEYS = new PropertyDescriptor.Builder()
|
||||
.name("Metadata Keys")
|
||||
.description("A comma-separated list of keys indicating key/value pairs to extract from the Avro file header. The key 'avro.schema' can " +
|
||||
"be used to extract the full schema in JSON format, and 'avro.codec' can be used to extract the codec name if one exists.")
|
||||
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
|
||||
.required(false)
|
||||
.build();
|
||||
|
||||
static final PropertyDescriptor COUNT_ITEMS = new PropertyDescriptor.Builder()
|
||||
.name("Count Items")
|
||||
.description("If true the number of items in the datafile will be counted and stored in a FlowFile attribute 'item.count'. The counting is done " +
|
||||
"by reading blocks and getting the number of items for each block, thus avoiding de-serializing. The items being counted will be the top-level " +
|
||||
"items in the datafile. For example, with a schema of type record the items will be the records, and for a schema of type Array the items will " +
|
||||
"be the arrays (not the number of entries in each array).")
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("false")
|
||||
.required(true)
|
||||
.build();
|
||||
|
||||
static final Relationship REL_SUCCESS = new Relationship.Builder()
|
||||
.name("success")
|
||||
.description("A FlowFile is routed to this relationship after metadata has been extracted.")
|
||||
.build();
|
||||
static final Relationship REL_FAILURE = new Relationship.Builder()
|
||||
.name("failure")
|
||||
.description("A FlowFile is routed to this relationship if it cannot be parsed as Avro or metadata cannot be extracted for any reason")
|
||||
.build();
|
||||
|
||||
static final String SCHEMA_TYPE_ATTR = "schema.type";
|
||||
static final String SCHEMA_NAME_ATTR = "schema.name";
|
||||
static final String SCHEMA_FINGERPRINT_ATTR = "schema.fingerprint";
|
||||
static final String ITEM_COUNT_ATTR = "item.count";
|
||||
|
||||
private List<PropertyDescriptor> properties;
|
||||
private Set<Relationship> relationships;
|
||||
|
||||
@Override
|
||||
protected void init(ProcessorInitializationContext context) {
|
||||
super.init(context);
|
||||
|
||||
final List<PropertyDescriptor> properties = new ArrayList<>();
|
||||
properties.add(FINGERPRINT_ALGORITHM);
|
||||
properties.add(METADATA_KEYS);
|
||||
properties.add(COUNT_ITEMS);
|
||||
this.properties = Collections.unmodifiableList(properties);
|
||||
|
||||
final Set<Relationship> relationships = new HashSet<>();
|
||||
relationships.add(REL_SUCCESS);
|
||||
relationships.add(REL_FAILURE);
|
||||
this.relationships = Collections.unmodifiableSet(relationships);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Relationship> getRelationships() {
|
||||
return relationships;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
||||
FlowFile flowFile = session.get();
|
||||
if (flowFile == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
final Map<String,String> avroMetadata = new HashMap<>();
|
||||
final Set<String> requestedMetadataKeys = new HashSet<>();
|
||||
|
||||
final boolean countRecords = context.getProperty(COUNT_ITEMS).asBoolean();
|
||||
final String fingerprintAlgorithm = context.getProperty(FINGERPRINT_ALGORITHM).getValue();
|
||||
final String metadataKeysValue = context.getProperty(METADATA_KEYS).getValue();
|
||||
|
||||
if (!StringUtils.isEmpty(metadataKeysValue)) {
|
||||
final String[] keys = metadataKeysValue.split("\\s*,\\s*");
|
||||
for (final String key : keys) {
|
||||
requestedMetadataKeys.add(key.trim());
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
session.read(flowFile, new InputStreamCallback() {
|
||||
@Override
|
||||
public void process(InputStream rawIn) throws IOException {
|
||||
try (final InputStream in = new BufferedInputStream(rawIn);
|
||||
final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
|
||||
|
||||
final Schema schema = reader.getSchema();
|
||||
if (schema == null) {
|
||||
throw new ProcessException("Avro schema was null");
|
||||
}
|
||||
|
||||
for (String key : reader.getMetaKeys()) {
|
||||
if (requestedMetadataKeys.contains(key)) {
|
||||
avroMetadata.put(key, reader.getMetaString(key));
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
final byte[] rawFingerprint = SchemaNormalization.parsingFingerprint(fingerprintAlgorithm, schema);
|
||||
avroMetadata.put(SCHEMA_FINGERPRINT_ATTR, Hex.encodeHexString(rawFingerprint));
|
||||
avroMetadata.put(SCHEMA_TYPE_ATTR, schema.getType().getName());
|
||||
avroMetadata.put(SCHEMA_NAME_ATTR, schema.getName());
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
// shouldn't happen since allowable values are valid algorithms
|
||||
throw new ProcessException(e);
|
||||
}
|
||||
|
||||
if (countRecords) {
|
||||
long recordCount = reader.getBlockCount();
|
||||
try {
|
||||
while (reader.nextBlock() != null) {
|
||||
recordCount += reader.getBlockCount();
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
// happens at end of file
|
||||
}
|
||||
avroMetadata.put(ITEM_COUNT_ATTR, String.valueOf(recordCount));
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
} catch (final ProcessException pe) {
|
||||
getLogger().error("Failed to extract Avro metadata for {} due to {}; transferring to failure", new Object[] {flowFile, pe});
|
||||
session.transfer(flowFile, REL_FAILURE);
|
||||
return;
|
||||
}
|
||||
|
||||
flowFile = session.putAllAttributes(flowFile, avroMetadata);
|
||||
session.transfer(flowFile, REL_SUCCESS);
|
||||
}
|
||||
|
||||
}
|
|
@ -12,4 +12,5 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
org.apache.nifi.processors.avro.ConvertAvroToJSON
|
||||
org.apache.nifi.processors.avro.ConvertAvroToJSON
|
||||
org.apache.nifi.processors.avro.ExtractAvroMetadata
|
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.nifi.processors.avro;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.file.DataFileWriter;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.io.DatumWriter;
|
||||
import org.apache.nifi.stream.io.ByteArrayOutputStream;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class AvroTestUtil {
|
||||
|
||||
public static ByteArrayOutputStream serializeAvroRecord(final Schema schema, final DatumWriter<GenericRecord> datumWriter, final GenericRecord... users) throws IOException {
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||
try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
|
||||
dataFileWriter.create(schema, out);
|
||||
for (final GenericRecord user : users) {
|
||||
dataFileWriter.append(user);
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
}
|
|
@ -16,22 +16,20 @@
|
|||
*/
|
||||
package org.apache.nifi.processors.avro;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.file.DataFileWriter;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericDatumWriter;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.io.DatumWriter;
|
||||
import org.apache.nifi.processors.avro.ConvertAvroToJSON;
|
||||
import org.apache.nifi.stream.io.ByteArrayOutputStream;
|
||||
import org.apache.nifi.util.MockFlowFile;
|
||||
import org.apache.nifi.util.TestRunner;
|
||||
import org.apache.nifi.util.TestRunners;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
public class TestConvertAvroToJSON {
|
||||
|
||||
@Test
|
||||
|
@ -44,7 +42,7 @@ public class TestConvertAvroToJSON {
|
|||
user1.put("favorite_number", 256);
|
||||
|
||||
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
|
||||
final ByteArrayOutputStream out1 = serializeAvroRecord(schema, datumWriter, user1);
|
||||
final ByteArrayOutputStream out1 = AvroTestUtil.serializeAvroRecord(schema, datumWriter, user1);
|
||||
runner.enqueue(out1.toByteArray());
|
||||
|
||||
runner.run();
|
||||
|
@ -69,7 +67,7 @@ public class TestConvertAvroToJSON {
|
|||
user2.put("favorite_color", "red");
|
||||
|
||||
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
|
||||
final ByteArrayOutputStream out1 = serializeAvroRecord(schema, datumWriter, user1, user2);
|
||||
final ByteArrayOutputStream out1 = AvroTestUtil.serializeAvroRecord(schema, datumWriter, user1, user2);
|
||||
runner.enqueue(out1.toByteArray());
|
||||
|
||||
runner.run();
|
||||
|
@ -87,16 +85,4 @@ public class TestConvertAvroToJSON {
|
|||
runner.assertAllFlowFilesTransferred(ConvertAvroToJSON.REL_FAILURE, 1);
|
||||
}
|
||||
|
||||
private ByteArrayOutputStream serializeAvroRecord(final Schema schema, final DatumWriter<GenericRecord> datumWriter, final GenericRecord... users) throws IOException {
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||
DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
|
||||
dataFileWriter.create(schema, out);
|
||||
for (final GenericRecord user : users) {
|
||||
dataFileWriter.append(user);
|
||||
}
|
||||
|
||||
dataFileWriter.close();
|
||||
return out;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,256 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.nifi.processors.avro;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.file.CodecFactory;
|
||||
import org.apache.avro.file.DataFileWriter;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericDatumWriter;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.io.DatumWriter;
|
||||
import org.apache.nifi.stream.io.ByteArrayOutputStream;
|
||||
import org.apache.nifi.util.MockFlowFile;
|
||||
import org.apache.nifi.util.TestRunner;
|
||||
import org.apache.nifi.util.TestRunners;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestExtractAvroMetadata {
|
||||
|
||||
static final String AVRO_SCHEMA_ATTR = "avro.schema";
|
||||
static final String AVRO_CODEC_ATTR = "avro.codec";
|
||||
|
||||
@Test
|
||||
public void testDefaultExtraction() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
|
||||
final ByteArrayOutputStream out = getOutputStreamWithOneUser(schema);
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR, "b2d1d8d3de2833ce");
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.RECORD.getName());
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "User");
|
||||
flowFile.assertAttributeNotExists(AVRO_SCHEMA_ATTR);
|
||||
flowFile.assertAttributeNotExists(ExtractAvroMetadata.ITEM_COUNT_ATTR);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithItemCount() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.COUNT_ITEMS, "true");
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
|
||||
final ByteArrayOutputStream out = getOutputStreamWithMultipleUsers(schema, 6000); // creates 2 blocks
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.ITEM_COUNT_ATTR, "6000");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithZeroUsers() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.COUNT_ITEMS, "true");
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
|
||||
final ByteArrayOutputStream out = getOutputStreamWithMultipleUsers(schema, 0);
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR, "b2d1d8d3de2833ce");
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.RECORD.getName());
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "User");
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.ITEM_COUNT_ATTR, "0");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithMD5() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.FINGERPRINT_ALGORITHM, ExtractAvroMetadata.MD5);
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
|
||||
final ByteArrayOutputStream out = getOutputStreamWithOneUser(schema);
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR, "3c6a7bee8994be20314dd28c6a3af4f2");
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.RECORD.getName());
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "User");
|
||||
flowFile.assertAttributeNotExists(AVRO_SCHEMA_ATTR);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithSHA256() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.FINGERPRINT_ALGORITHM, ExtractAvroMetadata.SHA_256);
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
|
||||
final ByteArrayOutputStream out = getOutputStreamWithOneUser(schema);
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR, "683f8f51ecd208038f4f0d39820ee9dd0ef3e32a3bee9371de0a2016d501b113");
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.RECORD.getName());
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "User");
|
||||
flowFile.assertAttributeNotExists(AVRO_SCHEMA_ATTR);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithMetadataKey() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_SCHEMA_ATTR); // test dynamic attribute avro.schema
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
|
||||
final ByteArrayOutputStream out = getOutputStreamWithOneUser(schema);
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeExists(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.RECORD.getName());
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "User");
|
||||
flowFile.assertAttributeEquals(AVRO_SCHEMA_ATTR, schema.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithMetadataKeysWhitespace() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, "foo, bar, " + AVRO_SCHEMA_ATTR); // test dynamic attribute avro.schema
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
|
||||
final ByteArrayOutputStream out = getOutputStreamWithOneUser(schema);
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeExists(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.RECORD.getName());
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "User");
|
||||
flowFile.assertAttributeEquals(AVRO_SCHEMA_ATTR, schema.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithNonRecordSchema() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.COUNT_ITEMS, "true");
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc"));
|
||||
|
||||
final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three"));
|
||||
final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema);
|
||||
|
||||
final ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||
final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter);
|
||||
dataFileWriter.create(schema, out);
|
||||
dataFileWriter.append(data);
|
||||
dataFileWriter.append(data);
|
||||
dataFileWriter.close();
|
||||
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeExists(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR);
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.ARRAY.getName());
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "array");
|
||||
flowFile.assertAttributeEquals(ExtractAvroMetadata.ITEM_COUNT_ATTR, "2"); // number of arrays, not elements
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithCodec() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_CODEC_ATTR); // test dynamic attribute avro.codec
|
||||
|
||||
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc"));
|
||||
|
||||
final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three"));
|
||||
final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema);
|
||||
|
||||
final ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||
final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter);
|
||||
dataFileWriter.setCodec(CodecFactory.deflateCodec(1));
|
||||
dataFileWriter.create(schema, out);
|
||||
dataFileWriter.append(data);
|
||||
dataFileWriter.close();
|
||||
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);
|
||||
|
||||
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
|
||||
flowFile.assertAttributeEquals("avro.codec", "deflate");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractionWithBadInput() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
|
||||
|
||||
final ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||
out.write("not avro".getBytes("UTF-8"));
|
||||
out.flush();
|
||||
|
||||
runner.enqueue(out.toByteArray());
|
||||
runner.run();
|
||||
runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_FAILURE, 1);
|
||||
}
|
||||
|
||||
private ByteArrayOutputStream getOutputStreamWithOneUser(Schema schema) throws IOException {
|
||||
final GenericRecord user = new GenericData.Record(schema);
|
||||
user.put("name", "Alyssa");
|
||||
user.put("favorite_number", 256);
|
||||
|
||||
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
|
||||
return AvroTestUtil.serializeAvroRecord(schema, datumWriter, user);
|
||||
}
|
||||
|
||||
private ByteArrayOutputStream getOutputStreamWithMultipleUsers(Schema schema, int numUsers) throws IOException {
|
||||
final GenericRecord[] users = new GenericRecord[numUsers];
|
||||
for (int i=0; i < numUsers; i++) {
|
||||
final GenericRecord user = new GenericData.Record(schema);
|
||||
user.put("name", "user" + i);
|
||||
user.put("favorite_number", i);
|
||||
users[i] = user;
|
||||
}
|
||||
|
||||
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
|
||||
return AvroTestUtil.serializeAvroRecord(schema, datumWriter, users);
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
{"type": "array", "items": "string"}
|
Loading…
Reference in New Issue