mirror of https://github.com/apache/nifi.git
NIFI-1868: Incorporate PutHiveStreaming review comments
This closes #706. Signed-off-by: Bryan Bende <bbende@apache.org>
This commit is contained in:
parent 59659232c7
commit 3943d72e95
PutHiveStreaming.java

@@ -18,8 +18,12 @@ package org.apache.nifi.processors.hive;
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import org.apache.avro.Schema;
+import org.apache.avro.file.CodecFactory;
+import org.apache.avro.file.DataFileConstants;
 import org.apache.avro.file.DataFileStream;
+import org.apache.avro.file.DataFileWriter;
 import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericDatumWriter;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.security.UserGroupInformation;
@@ -27,6 +31,7 @@ import org.apache.hive.hcatalog.streaming.ConnectionError;
 import org.apache.hive.hcatalog.streaming.HiveEndPoint;
 import org.apache.hive.hcatalog.streaming.SerializationError;
 import org.apache.hive.hcatalog.streaming.StreamingException;
+import org.apache.nifi.annotation.behavior.TriggerSerially;
 import org.apache.nifi.annotation.behavior.WritesAttribute;
 import org.apache.nifi.annotation.behavior.WritesAttributes;
 import org.apache.nifi.annotation.documentation.CapabilityDescription;
@@ -60,6 +65,7 @@ import org.json.JSONObject;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.LinkedList;
@@ -73,17 +79,21 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.regex.Pattern;

 /**
  * This processor utilizes the Hive Streaming capability to insert data from the flow into a Hive database table.
  */
+@TriggerSerially
 @Tags({"hive", "streaming", "put", "database", "store"})
 @CapabilityDescription("This processor uses Hive Streaming to send flow file data to an Apache Hive table. The incoming flow file is expected to be in "
         + "Avro format and the table must exist in Hive. Please see the Hive documentation for requirements on the Hive table (format, partitions, etc.). "
-        + "The partition values are extracted from the Avro record based on the names of the partition columns as specified in the processor. ")
+        + "The partition values are extracted from the Avro record based on the names of the partition columns as specified in the processor.")
 @WritesAttributes({
-        @WritesAttribute(attribute = "hivestreaming.record.count", description = "The number of records from this flow file written using Hive Streaming.")
+        @WritesAttribute(attribute = "hivestreaming.record.count", description = "This attribute is written on the flow files routed to the 'success' "
+                + "and 'failure' relationships, and contains the number of records from the incoming flow file written successfully and unsuccessfully, respectively.")
 })
 public class PutHiveStreaming extends AbstractProcessor {

@@ -110,6 +120,17 @@ public class PutHiveStreaming extends AbstractProcessor {
         return new ValidationResult.Builder().subject(subject).input(value).explanation(reason).valid(reason == null).build();
     };

+    // Metadata keys that are not transferred to split files when output strategy is datafile
+    // Avro will write this key/values pairs on its own
+    private static final Set<String> RESERVED_METADATA;
+
+    static {
+        Set<String> reservedMetadata = new HashSet<>();
+        reservedMetadata.add("avro.schema");
+        reservedMetadata.add("avro.codec");
+        RESERVED_METADATA = Collections.unmodifiableSet(reservedMetadata);
+    }
+
     // Properties
     public static final PropertyDescriptor METASTORE_URI = new PropertyDescriptor.Builder()
             .name("hive-stream-metastore-uri")
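An aside on the two keys reserved above: Avro's own container-file writer owns the avro.* metadata namespace, so those entries are not copied into the split files, and as far as I can tell attempting to set them is rejected outright. A minimal sketch, not part of this commit, with an invented application metadata key:

    import org.apache.avro.file.DataFileWriter;
    import org.apache.avro.generic.GenericDatumWriter;
    import org.apache.avro.generic.GenericRecord;

    public class ReservedMetadataSketch {
        public static void main(String[] args) {
            // Hypothetical writer, independent of the processor code above; it is never opened here.
            DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());

            // Application metadata copies through cleanly and survives a split.
            writer.setMeta("puthivestreaming.source", "example");

            // Keys in the reserved "avro." namespace are managed by the writer itself
            // (avro.codec via setCodec, avro.schema via create); setting them directly is
            // rejected with a runtime exception, which is why RESERVED_METADATA filters them out.
            try {
                writer.setMeta("avro.codec", "snappy");
            } catch (RuntimeException e) {
                System.out.println("Rejected reserved key: " + e.getMessage());
            }
        }
    }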
@@ -202,15 +223,20 @@ public class PutHiveStreaming extends AbstractProcessor {
     // Relationships
     public static final Relationship REL_SUCCESS = new Relationship.Builder()
             .name("success")
-            .description("A FlowFile is routed to this relationship after the database is successfully updated")
-            .build();
-    public static final Relationship REL_RETRY = new Relationship.Builder()
-            .name("retry")
-            .description("A FlowFile is routed to this relationship if the database cannot be updated but attempting the operation again may succeed")
+            .description("A FlowFile containing the JSON contents of a record is routed to this relationship after the record has been successfully transmitted to Hive.")
             .build();

     public static final Relationship REL_FAILURE = new Relationship.Builder()
             .name("failure")
-            .description("A FlowFile is routed to this relationship if the database cannot be updated and retrying the operation will also fail.")
+            .description("A FlowFile containing the JSON contents of a record is routed to this relationship if the record could not be transmitted to Hive.")
+            .build();
+
+    public static final Relationship REL_RETRY = new Relationship.Builder()
+            .name("retry")
+            .description("The incoming FlowFile is routed to this relationship if its records cannot be transmitted to Hive. Note that "
+                    + "some records may have been processed successfully, they will be routed (as JSON flow files) to the success relationship. "
+                    + "The combination of the retry, success, and failure relationships indicate how many records succeeded and/or failed. This "
+                    + "can be used to provide a retry capability since full rollback is not possible.")
             .build();

     private final static List<PropertyDescriptor> propertyDescriptors;
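A worked example of the routing contract these descriptions spell out, with invented numbers: an incoming file of ten records where seven reach Hive yields one Avro flow file on success and one on failure, each tagged with hivestreaming.record.count. The sketch below only illustrates the bookkeeping and is not part of the commit:

    import java.util.HashMap;
    import java.util.Map;

    public class RecordCountContractSketch {
        public static void main(String[] args) {
            // Attributes as PutHiveStreaming would write them on its two output flow files
            // (the values are illustrative, not produced by running the processor here).
            Map<String, String> successAttributes = new HashMap<>();
            successAttributes.put("hivestreaming.record.count", "7");

            Map<String, String> failureAttributes = new HashMap<>();
            failureAttributes.put("hivestreaming.record.count", "3");

            int succeeded = Integer.parseInt(successAttributes.get("hivestreaming.record.count"));
            int failed = Integer.parseInt(failureAttributes.get("hivestreaming.record.count"));

            // Together the two counts account for every record in the original flow file,
            // which is what makes a record-level retry of only the failed portion possible.
            System.out.println("original records = " + (succeeded + failed));   // 10
        }
    }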
@@ -333,105 +359,280 @@ public class PutHiveStreaming extends AbstractProcessor {
         }

         final ComponentLog log = getLogger();
-        try {
-            final List<String> partitionColumnList;
-            String partitionColumns = context.getProperty(PARTITION_COLUMNS).getValue();
-            if (StringUtils.isEmpty(partitionColumns)) {
-                partitionColumnList = Collections.emptyList();
-            } else {
-                String[] partitionCols = partitionColumns.split(",");
-                partitionColumnList = new ArrayList<>(partitionCols.length);
-                for (String col : partitionCols) {
-                    partitionColumnList.add(col.trim());
-                }
-            }
-
-            // Store the original class loader, then explicitly set it to this class's classloader (for use by the Hive Metastore)
-            ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
-            Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
-
-            int recordCount = 0;
-            final List<HiveStreamingRecord> records = new LinkedList<>();
-
-            session.read(flowFile, in -> {
-
-                try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
-
-                    GenericRecord currRecord;
-                    while (reader.hasNext()) {
-                        currRecord = reader.next();
-                        List<String> partitionValues = new ArrayList<>();
-
-                        for (String partition : partitionColumnList) {
-                            Object partitionValue = currRecord.get(partition);
-                            if (partitionValue == null) {
-                                throw new IOException("Partition column '" + partition + "' not found in Avro record");
-                            }
-                            partitionValues.add(partitionValue.toString());
-                        }
-
-                        List<Schema.Field> fields = currRecord.getSchema().getFields();
-                        if (fields != null) {
-                            JSONObject obj = new JSONObject();
-                            for (Schema.Field field : fields) {
-                                String fieldName = field.name();
-                                // Skip fields that are partition columns, we extracted those values above to create an EndPoint
-                                if (!partitionColumnList.contains(fieldName)) {
-                                    Object value = currRecord.get(fieldName);
-                                    try {
-                                        obj.put(fieldName, value);
-                                    } catch (JSONException je) {
-                                        throw new IOException(je);
-                                    }
-                                }
-                            }
-                            records.add(new HiveStreamingRecord(partitionValues, obj));
-                        }
-                    }
-                }
-            });
-
-            // Write all records to Hive Streaming
-            for (HiveStreamingRecord record : records) {
-                HiveEndPoint endPoint = makeHiveEndPoint(record.getPartitionValues(), options);
-                HiveWriter writer = getOrCreateWriter(endPoint);
-                writer.write(record.getRecord().toString().getBytes(StandardCharsets.UTF_8));
-                recordCount++;
-            }
-
-            flowFile = session.putAttribute(flowFile, HIVE_STREAMING_RECORD_COUNT_ATTR, Integer.toString(recordCount));
-            flushAllWriters(true);
-
-            session.getProvenanceReporter().send(flowFile, options.getMetaStoreURI());
-            session.transfer(flowFile, REL_SUCCESS);
-
-            // Restore original class loader, might not be necessary but is good practice since the processor task changed it
-            Thread.currentThread().setContextClassLoader(originalClassloader);
-
-        } catch (HiveWriter.CommitFailure commitFailure) {
-            log.error("Error committing to Hive", commitFailure);
-            session.transfer(flowFile, REL_FAILURE);
-        } catch (HiveWriter.TxnBatchFailure | HiveWriter.TxnFailure txnFailure) {
-            log.error("Hive Streaming Transaction Failure", txnFailure);
-            session.transfer(flowFile, REL_FAILURE);
-        } catch (InterruptedException e) {
-            log.error("Hive Streaming Interrupted, flow file will be penalized and routed to retry", e);
-            flowFile = session.penalize(flowFile);
-            session.transfer(flowFile, REL_RETRY);
-        } catch (ConnectionError | HiveWriter.ConnectFailure ce) {
-            log.error("Error while connecting via Hive Streaming, flow file will be penalized and routed to retry", ce);
-            flowFile = session.penalize(flowFile);
-            session.transfer(flowFile, REL_RETRY);
-        } catch (SerializationError se) {
-            log.error("Serialization exception occurred, record not written to Hive.", se);
-            session.transfer(flowFile, REL_FAILURE);
-        } catch (HiveWriter.WriteFailure wf) {
-            log.error("Error while writing record to Hive Streaming", wf);
-            abortAndCloseWriters();
-            session.transfer(flowFile, REL_FAILURE);
-        }
-    }
+        final Integer txnsPerBatch = context.getProperty(TXNS_PER_BATCH).asInteger();
+
+        // Store the original class loader, then explicitly set it to this class's classloader (for use by the Hive Metastore)
+        ClassLoader originalClassloader = Thread.currentThread().getContextClassLoader();
+        Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
+
+        final List<String> partitionColumnList;
+        final String partitionColumns = context.getProperty(PARTITION_COLUMNS).getValue();
+        if (StringUtils.isEmpty(partitionColumns)) {
+            partitionColumnList = Collections.emptyList();
+        } else {
+            String[] partitionCols = partitionColumns.split(",");
+            partitionColumnList = new ArrayList<>(partitionCols.length);
+            for (String col : partitionCols) {
+                partitionColumnList.add(col.trim());
+            }
+        }
+
+        final AtomicInteger recordCount = new AtomicInteger(0);
+        final AtomicInteger successfulRecordCount = new AtomicInteger(0);
+        List<HiveStreamingRecord> successfulRecords = new LinkedList<>();
+        final FlowFile inputFlowFile = flowFile;
+        final AtomicBoolean incomingFlowFileTransferred = new AtomicBoolean(false);
+
+        // Create output flow files and their Avro writers
+        AtomicReference<FlowFile> successFlowFile = new AtomicReference<>(session.create(inputFlowFile));
+        final DataFileWriter<GenericRecord> successAvroWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());
+        AtomicReference<FlowFile> failureFlowFile = new AtomicReference<>(session.create(inputFlowFile));
+        final DataFileWriter<GenericRecord> failureAvroWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());
+
+        try {
+            session.read(inputFlowFile, in -> {
+
+                try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
+                    GenericRecord currRecord = null;
+
+                    // Copy codec and schema information to all writers
+                    final String codec = reader.getMetaString(DataFileConstants.CODEC) == null
+                            ? DataFileConstants.NULL_CODEC
+                            : reader.getMetaString(DataFileConstants.CODEC);
+
+                    Arrays.asList(successAvroWriter, failureAvroWriter)
+                            .forEach((writer) -> {
+                                writer.setCodec(CodecFactory.fromString(codec));
+                                // Transfer metadata (this is a subset of the incoming file)
+                                for (String metaKey : reader.getMetaKeys()) {
+                                    if (!RESERVED_METADATA.contains(metaKey)) {
+                                        writer.setMeta(metaKey, reader.getMeta(metaKey));
+                                    }
+                                }
+                            });
+
+                    while (reader.hasNext()) {
+                        currRecord = reader.next(currRecord);
+                        recordCount.incrementAndGet();
+
+                        // Extract the partition values (they must be put separately into the Hive Streaming API)
+                        List<String> partitionValues = new ArrayList<>();
+                        try {
+                            for (String partition : partitionColumnList) {
+                                Object partitionValue = currRecord.get(partition);
+                                if (partitionValue == null) {
+                                    throw new IOException("Partition column '" + partition + "' not found in Avro record");
+                                }
+                                partitionValues.add(partitionValue.toString());
+                            }
+                        } catch (IOException ioe) {
+                            // Add the failed record to the failure flow file
+                            log.error("Error writing record to Hive Streaming transaction", ioe);
+                            appendRecordsToFlowFile(session, Collections.singletonList(new HiveStreamingRecord(null, currRecord)),
+                                    failureFlowFile, failureAvroWriter, reader);
+                            continue;
+                        }
+
+                        List<Schema.Field> fields = currRecord.getSchema().getFields();
+                        if (fields != null) {
+                            JSONObject obj = new JSONObject();
+                            try {
+                                for (Schema.Field field : fields) {
+                                    String fieldName = field.name();
+                                    // Skip fields that are partition columns, we extracted those values above to create an EndPoint
+                                    if (!partitionColumnList.contains(fieldName)) {
+                                        Object value = currRecord.get(fieldName);
+                                        try {
+                                            obj.put(fieldName, value);
+                                        } catch (JSONException je) {
+                                            throw new IOException(je);
+                                        }
+                                    }
+                                }
+                            } catch (IOException ioe) {
+                                // This really shouldn't happen since we are iterating over the schema fields, but just in case,
+                                // add the failed record to the failure flow file.
+                                log.error("Error writing record to Hive Streaming transaction", ioe);
+                                appendRecordsToFlowFile(session, Collections.singletonList(new HiveStreamingRecord(null, currRecord)),
+                                        failureFlowFile, failureAvroWriter, reader);
+                                continue;
+                            }
+                            final HiveStreamingRecord record = new HiveStreamingRecord(partitionValues, currRecord);
+                            HiveEndPoint endPoint = null;
+                            HiveWriter hiveWriter = null;
+                            try {
+                                endPoint = makeHiveEndPoint(record.getPartitionValues(), options);
+                                hiveWriter = getOrCreateWriter(endPoint);
+                            } catch (ConnectionError
+                                    | HiveWriter.ConnectFailure
+                                    | InterruptedException connectionError) {
+                                // Can't connect to Hive endpoint.
+                                log.error("Error connecting to Hive endpoint: table {} at {}",
+                                        new Object[]{options.getTableName(), options.getMetaStoreURI()});
+                                // If we can't connect to the endpoint, exit the loop and let the outer exception handler route the original flow file to retry
+                                abortAndCloseWriters();
+                                throw new ProcessException(connectionError);
+                            }
+                            try {
+                                try {
+                                    hiveWriter.write(record.getRecord().toString().getBytes(StandardCharsets.UTF_8));
+                                    successfulRecords.add(record);
+                                } catch (InterruptedException | HiveWriter.WriteFailure wf) {
+                                    // Add the failed record to the failure flow file
+                                    log.error("Error writing record to Hive Streaming transaction", wf);
+                                    appendRecordsToFlowFile(session, Collections.singletonList(record), failureFlowFile, failureAvroWriter, reader);
+                                }
+
+                                // If we've reached the transactions-per-batch limit, flush the Hive Writer and update the Avro Writer for successful records
+                                if (hiveWriter.getTotalRecords() >= txnsPerBatch) {
+                                    hiveWriter.flush(true);
+                                    // Now send the records to the success relationship and update the success count
+                                    try {
+                                        appendRecordsToFlowFile(session, successfulRecords, successFlowFile, successAvroWriter, reader);
+                                        successfulRecordCount.accumulateAndGet(successfulRecords.size(), (current, incr) -> current + incr);
+
+                                        // Clear the list of successful records, we'll use it at the end when we flush whatever records are left
+                                        successfulRecords.clear();
+
+                                    } catch (IOException ioe) {
+                                        // The records were put to Hive Streaming successfully, but there was an error while writing the
+                                        // Avro records to the flow file. Log as an error and move on.
+                                        getLogger().error("Error writing Avro records (which were sent successfully to Hive Streaming) to the flow file", ioe);
+                                    }
+                                }
+
+                            } catch (InterruptedException
+                                    | HiveWriter.CommitFailure
+                                    | HiveWriter.TxnBatchFailure
+                                    | HiveWriter.TxnFailure
+                                    | SerializationError writeException) {
+
+                                log.error("Error writing record to Hive Streaming transaction", writeException);
+                                // Add the failed record to the failure flow file
+                                appendRecordsToFlowFile(session, Collections.singletonList(record), failureFlowFile, failureAvroWriter, reader);
+
+                                if (!(writeException instanceof SerializationError)) {
+                                    try {
+                                        hiveWriter.abort();
+                                    } catch (Exception e) {
+                                        // Can't even abort properly, throw a process exception
+                                        throw new ProcessException(e);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    try {
+                        // Finish any transactions
+                        flushAllWriters(true);
+                        closeAllWriters();
+
+                        // Now send any remaining records to the success relationship and update the count
+                        appendRecordsToFlowFile(session, successfulRecords, successFlowFile, successAvroWriter, reader);
+                        successfulRecordCount.accumulateAndGet(successfulRecords.size(), (current, incr) -> current + incr);
+                        successfulRecords.clear();
+
+                    } catch (HiveWriter.CommitFailure
+                            | HiveWriter.TxnBatchFailure
+                            | HiveWriter.TxnFailure
+                            | InterruptedException e) {
+
+                        // If any records are in the successfulRecords list but ended up here, then they actually weren't transferred successfully, so
+                        // route them to failure instead
+                        appendRecordsToFlowFile(session, successfulRecords, failureFlowFile, failureAvroWriter, reader);
+                    }
+                } catch (IOException ioe) {
+                    // The Avro file is invalid (or may not be an Avro file at all), send it to failure
+                    log.error("The incoming flow file can not be read as an Avro file, routing to failure", ioe);
+                    session.transfer(inputFlowFile, REL_FAILURE);
+                    incomingFlowFileTransferred.set(true);
+                }
+            });
+
+            if (recordCount.get() > 0) {
+                if (successfulRecordCount.get() > 0) {
+                    // Transfer the flow file with successful records
+                    successFlowFile.set(
+                            session.putAttribute(successFlowFile.get(), HIVE_STREAMING_RECORD_COUNT_ATTR, Integer.toString(recordCount.get())));
+                    session.getProvenanceReporter().send(successFlowFile.get(), options.getMetaStoreURI());
+                    session.transfer(successFlowFile.get(), REL_SUCCESS);
+                } else {
+                    session.remove(successFlowFile.get());
+                }
+
+                if (recordCount.get() != successfulRecordCount.get()) {
+                    // There were some failed records, so transfer that flow file to failure
+                    failureFlowFile.set(
+                            session.putAttribute(failureFlowFile.get(), HIVE_STREAMING_RECORD_COUNT_ATTR,
+                                    Integer.toString(recordCount.get() - successfulRecordCount.get())));
+                    session.transfer(failureFlowFile.get(), REL_FAILURE);
+                } else {
+                    session.remove(failureFlowFile.get());
+                }
+            } else {
+                // No records were processed, so remove the output flow files
+                session.remove(successFlowFile.get());
+                session.remove(failureFlowFile.get());
+            }
+            successFlowFile.set(null);
+            failureFlowFile.set(null);
+
+            // If we got here, we've processed the outgoing flow files correctly, so remove the incoming one if necessary
+            if (!incomingFlowFileTransferred.get()) {
+                session.remove(flowFile);
+            }
+
+        } catch (ProcessException pe) {
+            abortAndCloseWriters();
+            Throwable t = pe.getCause();
+            if (t != null) {
+                if (t instanceof ConnectionError
+                        || t instanceof HiveWriter.ConnectFailure
+                        || t instanceof HiveWriter.CommitFailure
+                        || t instanceof HiveWriter.TxnBatchFailure
+                        || t instanceof HiveWriter.TxnFailure
+                        || t instanceof InterruptedException) {
+                    log.error("Hive Streaming connect/write error, flow file will be penalized and routed to retry", t);
+                    flowFile = session.penalize(flowFile);
+                    session.transfer(flowFile, REL_RETRY);
+                    // Remove the ones we created
+                    if (successFlowFile.get() != null) {
+                        session.remove(successFlowFile.get());
+                    }
+                    if (failureFlowFile.get() != null) {
+                        session.remove(failureFlowFile.get());
+                    }
+                } else {
+                    throw pe;
+                }
+            } else {
+                throw pe;
+            }
+        } finally {
+            // Restore original class loader, might not be necessary but is good practice since the processor task changed it
+            Thread.currentThread().setContextClassLoader(originalClassloader);
+        }
+    }
+
+    private void appendRecordsToFlowFile(ProcessSession session,
+                                         List<HiveStreamingRecord> records,
+                                         AtomicReference<FlowFile> appendFlowFile,
+                                         DataFileWriter<GenericRecord> avroWriter,
+                                         DataFileStream<GenericRecord> reader) throws IOException {
+
+        appendFlowFile.set(session.append(appendFlowFile.get(), (out) -> {
+
+            try (DataFileWriter<GenericRecord> writer = avroWriter.create(reader.getSchema(), out)) {
+                for (HiveStreamingRecord sRecord : records) {
+                    writer.append(sRecord.getRecord());
+                }
+                writer.flush();
+            }
+        }));
+    }

     @OnStopped
     public void cleanup() {
         ComponentLog log = getLogger();
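The mechanism that makes the per-record routing above possible is the Avro reader/writer hand-off in appendRecordsToFlowFile: records are read once from the incoming container and re-serialized into new containers that reuse the reader's schema. Below is a standalone sketch of that split pattern using plain java.io streams instead of a ProcessSession; the method name and the "keep this record" test are invented for illustration:

    import org.apache.avro.file.DataFileStream;
    import org.apache.avro.file.DataFileWriter;
    import org.apache.avro.generic.GenericDatumReader;
    import org.apache.avro.generic.GenericDatumWriter;
    import org.apache.avro.generic.GenericRecord;

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class AvroSplitSketch {

        /**
         * Copies every record whose partition column is populated into a new Avro container.
         * The output schema comes from the reader, so the result stays compatible with the
         * input -- the same hand-off the processor performs, minus the session and Hive calls.
         */
        static byte[] copyMatchingRecords(InputStream in, String partitionColumn) throws IOException {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            try (DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>());
                 DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())
                         .create(reader.getSchema(), out)) {
                GenericRecord record = null;
                while (reader.hasNext()) {
                    record = reader.next(record);
                    if (record.get(partitionColumn) != null) {   // crude stand-in for "this record succeeded"
                        writer.append(record);
                    }
                }
            }
            return out.toByteArray();
        }
    }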
@@ -637,9 +838,9 @@ public class PutHiveStreaming extends AbstractProcessor {
     protected class HiveStreamingRecord {

         private List<String> partitionValues;
-        private JSONObject record;
+        private GenericRecord record;

-        public HiveStreamingRecord(List<String> partitionValues, JSONObject record) {
+        public HiveStreamingRecord(List<String> partitionValues, GenericRecord record) {
             this.partitionValues = partitionValues;
             this.record = record;
         }
@@ -648,7 +849,7 @@ public class PutHiveStreaming extends AbstractProcessor {
             return partitionValues;
         }

-        public JSONObject getRecord() {
+        public GenericRecord getRecord() {
             return record;
         }

HiveUtils.java

@@ -36,9 +36,6 @@ public class HiveUtils {
     private static final Logger LOG = LoggerFactory.getLogger(HiveUtils.class);

     public static HiveEndPoint makeEndPoint(List<String> partitionVals, HiveOptions options) throws ConnectionError {
-        if(partitionVals==null) {
-            return new HiveEndPoint(options.getMetaStoreURI(), options.getDatabaseName(), options.getTableName(), null);
-        }
         return new HiveEndPoint(options.getMetaStoreURI(), options.getDatabaseName(), options.getTableName(), partitionVals);
     }

HiveWriter.java

@@ -74,7 +74,7 @@ public class HiveWriter {
             this.txnBatch = nextTxnBatch(recordWriter);
             this.closed = false;
             this.lastUsed = System.currentTimeMillis();
-        } catch (InterruptedException | RuntimeException e) {
+        } catch (InterruptedException | RuntimeException | ConnectFailure e) {
             throw e;
         } catch (Exception e) {
             throw new ConnectFailure(endPoint, e);
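The single added alternative matters because the generic catch right below it wraps whatever it sees in a new ConnectFailure; presumably nextTxnBatch (or another call in the try block) can already throw ConnectFailure, and listing it in the multi-catch rethrows it once instead of nesting a ConnectFailure inside another one. A generic sketch of that rethrow-versus-wrap pattern, with stand-in exception types rather than the HiveWriter classes:

    public class RethrowSketch {

        static class DomainFailure extends Exception {
            DomainFailure(String message, Throwable cause) {
                super(message, cause);
            }
        }

        static void connect(boolean failDirectly) throws DomainFailure, InterruptedException {
            try {
                Thread.sleep(1);                 // stands in for blocking connection work (may throw InterruptedException)
                if (failDirectly) {
                    // A lower-level step that already reports the failure in domain terms.
                    throw new DomainFailure("endpoint refused", null);
                }
                legacyDriverCall();              // third-party code declaring a broad checked exception
            } catch (InterruptedException | RuntimeException | DomainFailure e) {
                throw e;                         // already meaningful: rethrow as-is, do not double-wrap
            } catch (Exception e) {
                throw new DomainFailure("endpoint refused", e);   // wrap everything else exactly once
            }
        }

        static void legacyDriverCall() throws Exception {
            // placeholder for a call whose signature forces the broad catch above
        }

        public static void main(String[] args) throws Exception {
            try {
                connect(true);
            } catch (DomainFailure expected) {
                System.out.println("caught once, not double-wrapped: " + expected.getMessage());
            }
        }
    }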
TestPutHiveStreaming.java

@@ -17,8 +17,10 @@
 package org.apache.nifi.processors.hive;

 import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileStream;
 import org.apache.avro.file.DataFileWriter;
 import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumReader;
 import org.apache.avro.generic.GenericDatumWriter;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.io.DatumWriter;
@@ -31,17 +33,19 @@ import org.apache.hive.hcatalog.streaming.StreamingException;
 import org.apache.hive.hcatalog.streaming.TransactionBatch;
 import org.apache.nifi.hadoop.KerberosProperties;
 import org.apache.nifi.stream.io.ByteArrayOutputStream;
+import org.apache.nifi.util.MockFlowFile;
 import org.apache.nifi.util.NiFiProperties;
 import org.apache.nifi.util.TestRunner;
 import org.apache.nifi.util.TestRunners;
 import org.apache.nifi.util.hive.HiveOptions;
 import org.apache.nifi.util.hive.HiveWriter;
-import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;

+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedList;
@@ -49,6 +53,12 @@ import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;

+import static org.apache.nifi.processors.hive.PutHiveStreaming.HIVE_STREAMING_RECORD_COUNT_ATTR;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
@@ -57,8 +67,8 @@ import static org.mockito.Mockito.when;
  */
 public class TestPutHiveStreaming {

-    TestRunner runner;
-    MockPutHiveStreaming processor;
+    private TestRunner runner;
+    private MockPutHiveStreaming processor;

     private KerberosProperties kerberosPropsWithFile;
     private KerberosProperties kerberosPropsWithoutFile;
@@ -84,12 +94,6 @@ public class TestPutHiveStreaming {
         runner = TestRunners.newTestRunner(processor);
     }

-    @After
-    public void tearDown() throws Exception {
-
-    }
-
-
     @Test
     public void testSetup() throws Exception {
         runner.setValidateExpressionUsage(false);
@@ -126,6 +130,17 @@ public class TestPutHiveStreaming {
         runner.run();
     }

+    @Test
+    public void testSingleBatchInvalid() throws Exception {
+        runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
+        runner.setProperty(PutHiveStreaming.DB_NAME, "default");
+        runner.setProperty(PutHiveStreaming.TABLE_NAME, "users");
+        runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "2");
+        runner.assertValid();
+        runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "1");
+        runner.assertNotValid();
+    }
+
     @Test
     public void onTrigger() throws Exception {
         runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
@@ -142,7 +157,76 @@ public class TestPutHiveStreaming {
         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_SUCCESS);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 1);
+        assertEquals("1", runner.getFlowFilesForRelationship(PutHiveStreaming.REL_SUCCESS).get(0).getAttribute(HIVE_STREAMING_RECORD_COUNT_ATTR));
+    }
+
+    @Test
+    public void onTriggerBadInput() throws Exception {
+        runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
+        runner.setProperty(PutHiveStreaming.DB_NAME, "default");
+        runner.setProperty(PutHiveStreaming.TABLE_NAME, "users");
+        runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "100");
+        runner.setValidateExpressionUsage(false);
+        runner.enqueue("I am not an Avro record".getBytes());
+        runner.run();
+
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 1);
+    }
+
+    @Test
+    public void onTriggerMultipleRecords() throws Exception {
+        runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
+        runner.setProperty(PutHiveStreaming.DB_NAME, "default");
+        runner.setProperty(PutHiveStreaming.TABLE_NAME, "users");
+        runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "2");
+        runner.setValidateExpressionUsage(false);
+        Map<String, Object> user1 = new HashMap<String, Object>() {
+            {
+                put("name", "Joe");
+                put("favorite_number", 146);
+            }
+        };
+        Map<String, Object> user2 = new HashMap<String, Object>() {
+            {
+                put("name", "Mary");
+                put("favorite_number", 42);
+            }
+        };
+        Map<String, Object> user3 = new HashMap<String, Object>() {
+            {
+                put("name", "Matt");
+                put("favorite_number", 3);
+            }
+        };
+        runner.enqueue(createAvroRecord(Arrays.asList(user1, user2, user3)));
+        runner.run();
+
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 1);
+        MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(PutHiveStreaming.REL_SUCCESS).get(0);
+        assertNotNull(resultFlowFile);
+        assertEquals("3", resultFlowFile.getAttribute(PutHiveStreaming.HIVE_STREAMING_RECORD_COUNT_ATTR));
+        final DataFileStream<GenericRecord> reader = new DataFileStream<>(
+                new ByteArrayInputStream(resultFlowFile.toByteArray()),
+                new GenericDatumReader<GenericRecord>());
+
+        Schema schema = reader.getSchema();
+
+        // Verify that the schema is preserved
+        assertTrue(schema.equals(new Schema.Parser().parse(new File("src/test/resources/user.avsc"))));
+
+        // Verify the records are intact. We can't guarantee order so check the total number and non-null fields
+        assertTrue(reader.hasNext());
+        GenericRecord record = reader.next(null);
+        assertNotNull(record.get("name"));
+        assertNotNull(record.get("favorite_number"));
+        assertNull(record.get("favorite_color"));
+        assertNull(record.get("scale"));
+        assertTrue(reader.hasNext());
+        record = reader.next(record);
+        assertTrue(reader.hasNext());
+        reader.next(record);
+        assertFalse(reader.hasNext());
     }

     @Test
@@ -165,7 +249,35 @@ public class TestPutHiveStreaming {
         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_SUCCESS);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 1);
+        assertEquals("1", runner.getFlowFilesForRelationship(PutHiveStreaming.REL_SUCCESS).get(0).getAttribute(HIVE_STREAMING_RECORD_COUNT_ATTR));
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 0);
+    }
+
+    @Test
+    public void onTriggerWithPartitionColumnsNotInRecord() throws Exception {
+        runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
+        runner.setProperty(PutHiveStreaming.DB_NAME, "default");
+        runner.setProperty(PutHiveStreaming.TABLE_NAME, "users");
+        runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "100");
+        runner.setProperty(PutHiveStreaming.PARTITION_COLUMNS, "favorite_food");
+        runner.setProperty(PutHiveStreaming.AUTOCREATE_PARTITIONS, "false");
+        runner.setValidateExpressionUsage(false);
+        Map<String, Object> user1 = new HashMap<String, Object>() {
+            {
+                put("name", "Joe");
+                put("favorite_number", 146);
+                put("favorite_color", "blue");
+            }
+        };
+
+        runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
+        runner.run();
+
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 1);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 0);
     }

     @Test
@@ -186,7 +298,9 @@ public class TestPutHiveStreaming {
         }
         runner.run(10);

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_SUCCESS);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 10);
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 0);
     }

     @Test
@@ -210,7 +324,9 @@ public class TestPutHiveStreaming {

         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run(1, true);
-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_SUCCESS);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 2);
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 0);
     }

     @Test
@@ -230,7 +346,9 @@ public class TestPutHiveStreaming {
         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_RETRY);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 1);
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 0);
     }

     @Test
@@ -250,7 +368,7 @@ public class TestPutHiveStreaming {
         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_RETRY);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 1);
     }

     @Test
@@ -267,10 +385,17 @@ public class TestPutHiveStreaming {
                 put("favorite_number", 146);
             }
         };
-        runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
+        Map<String, Object> user2 = new HashMap<String, Object>() {
+            {
+                put("name", "Mary");
+                put("favorite_number", 42);
+            }
+        };
+        runner.enqueue(createAvroRecord(Arrays.asList(user1, user2)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_FAILURE);
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 1);
+        assertEquals("2", runner.getFlowFilesForRelationship(PutHiveStreaming.REL_FAILURE).get(0).getAttribute(HIVE_STREAMING_RECORD_COUNT_ATTR));
     }

     @Test
@@ -290,7 +415,8 @@ public class TestPutHiveStreaming {
         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_FAILURE);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 1);
     }

     @Test
@@ -310,7 +436,9 @@ public class TestPutHiveStreaming {
         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_FAILURE);
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 1);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 0);
     }

     @Test
@@ -330,7 +458,9 @@ public class TestPutHiveStreaming {
         runner.enqueue(createAvroRecord(Collections.singletonList(user1)));
         runner.run();

-        runner.assertAllFlowFilesTransferred(PutHiveStreaming.REL_FAILURE);
+        runner.assertTransferCount(PutHiveStreaming.REL_FAILURE, 1);
+        runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 0);
+        runner.assertTransferCount(PutHiveStreaming.REL_RETRY, 0);
     }

     @Test
@@ -377,7 +507,6 @@ public class TestPutHiveStreaming {
             user.put("favorite_color", record.get("favorite_color"));
             users.add(user);
         }
-
         final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
         ByteArrayOutputStream out = new ByteArrayOutputStream();
         try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
@@ -387,6 +516,7 @@ public class TestPutHiveStreaming {
             }
         }
         return out.toByteArray();
+
     }

     private class MockPutHiveStreaming extends PutHiveStreaming {
src/test/resources/user.avsc

@@ -20,6 +20,7 @@
   "fields": [
     {"name": "name", "type": "string"},
     {"name": "favorite_number", "type": ["int", "null"]},
-    {"name": "favorite_color", "type": ["string", "null"]}
+    {"name": "favorite_color", "type": ["string", "null"]},
+    {"name": "scale", "type": ["double", "null"]}
   ]
 }
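The extra nullable scale column keeps the test schema aligned with the new onTriggerMultipleRecords assertions. A hedged sketch of building a record against this schema with the Avro generic API; the record name and namespace below are placeholders, since only the fields block is visible in this hunk:

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;

    public class UserRecordSketch {
        // Field list copied from user.avsc above; "example.user" is a placeholder record name.
        private static final String USER_SCHEMA_JSON =
                "{\"type\": \"record\", \"name\": \"user\", \"namespace\": \"example\", \"fields\": ["
                        + "{\"name\": \"name\", \"type\": \"string\"},"
                        + "{\"name\": \"favorite_number\", \"type\": [\"int\", \"null\"]},"
                        + "{\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]},"
                        + "{\"name\": \"scale\", \"type\": [\"double\", \"null\"]}"
                        + "]}";

        public static void main(String[] args) {
            Schema schema = new Schema.Parser().parse(USER_SCHEMA_JSON);

            GenericRecord user = new GenericData.Record(schema);
            user.put("name", "Joe");
            user.put("favorite_number", 146);
            user.put("scale", 3.5d);
            // "favorite_color" is a ["string", "null"] union and is simply left null here.

            System.out.println(user);   // prints something like {"name": "Joe", "favorite_number": 146, "favorite_color": null, "scale": 3.5}
        }
    }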