From f6a14bc475b80a8382fb1731ae0d9a3bc69906f3 Mon Sep 17 00:00:00 2001 From: Matt Burgess Date: Sun, 16 Jul 2023 13:45:42 -0400 Subject: [PATCH] NIFI-11807 Added ExtractRecordSchema Processor This closes #7482 Signed-off-by: David Handermann --- .../nifi-standard-processors/pom.xml | 1 + .../standard/ExtractRecordSchema.java | 149 ++++++++++++++++++ .../standard/HandleHttpRequest.java | 35 ++++ .../org.apache.nifi.processor.Processor | 1 + .../standard/TestExtractRecordSchema.java | 87 ++++++++++ .../name_age_schema.avsc | 1 + 6 files changed, 274 insertions(+) create mode 100644 nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractRecordSchema.java create mode 100644 nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractRecordSchema.java create mode 100644 nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestExtractRecordSchema/name_age_schema.avsc diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml index 8c00f9f64a..d7c206d37b 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml @@ -559,6 +559,7 @@ src/test/resources/TestExtractGrok/apache.log src/test/resources/TestExtractGrok/patterns src/test/resources/TestExtractGrok/simple_text.log + src/test/resources/TestExtractRecordSchema/name_age_schema.avsc src/test/resources/TestForkRecord/input/complex-input-json.json src/test/resources/TestForkRecord/output/extract-transactions.json src/test/resources/TestForkRecord/output/split-address.json diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractRecordSchema.java 
b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractRecordSchema.java new file mode 100644 index 0000000000..8c7c670816 --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractRecordSchema.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nifi.processors.standard; + +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import org.apache.nifi.annotation.behavior.InputRequirement; +import org.apache.nifi.annotation.behavior.InputRequirement.Requirement; +import org.apache.nifi.annotation.behavior.SideEffectFree; +import org.apache.nifi.annotation.behavior.SupportsBatching; +import org.apache.nifi.annotation.behavior.WritesAttribute; +import org.apache.nifi.annotation.behavior.WritesAttributes; +import org.apache.nifi.annotation.documentation.CapabilityDescription; +import org.apache.nifi.annotation.documentation.Tags; +import org.apache.nifi.annotation.lifecycle.OnScheduled; +import org.apache.nifi.avro.AvroTypeUtil; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.processor.AbstractProcessor; +import org.apache.nifi.processor.ProcessContext; +import org.apache.nifi.processor.ProcessSession; +import org.apache.nifi.processor.Relationship; +import org.apache.nifi.processor.exception.ProcessException; +import org.apache.nifi.processor.util.StandardValidators; +import org.apache.nifi.serialization.RecordReader; +import org.apache.nifi.serialization.RecordReaderFactory; +import org.apache.nifi.serialization.record.RecordSchema; + +import java.io.InputStream; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +@SupportsBatching +@InputRequirement(Requirement.INPUT_REQUIRED) +@SideEffectFree +@Tags({"record", "generic", "schema", "json", "csv", "avro", "freeform", "text", "xml"}) +@WritesAttributes({ + @WritesAttribute(attribute = "record.error.message", description = "This attribute provides on failure the error message encountered by the Reader."), + @WritesAttribute(attribute = "avro.schema", description = "This attribute provides the schema extracted from the input FlowFile 
using the provided RecordReader."), +}) +@CapabilityDescription("Extracts the record schema from the FlowFile using the supplied Record Reader and writes it to the `avro.schema` attribute.") +public class ExtractRecordSchema extends AbstractProcessor { + + public static final String SCHEMA_ATTRIBUTE_NAME = "avro.schema"; + + static final PropertyDescriptor RECORD_READER = new PropertyDescriptor.Builder() + .name("record-reader") + .displayName("Record Reader") + .description("Specifies the Controller Service to use for reading incoming data") + .identifiesControllerService(RecordReaderFactory.class) + .required(true) + .build(); + + static final PropertyDescriptor SCHEMA_CACHE_SIZE = new PropertyDescriptor.Builder() + .name("cache-size") + .displayName("Schema Cache Size") + .description("Specifies the number of schemas to cache. This value should reflect the expected number of different schemas " + + "that may be in the incoming FlowFiles. This ensures more efficient retrieval of the schemas and thus the processor performance.") + .defaultValue("10") + .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) + .required(true) + .build(); + + static final Relationship REL_SUCCESS = new Relationship.Builder() + .name("success") + .description("FlowFiles whose record schemas are successfully extracted will be routed to this relationship") + .build(); + static final Relationship REL_FAILURE = new Relationship.Builder() + .name("failure") + .description("If a FlowFile's record schema cannot be extracted from the configured input format, " + + "the FlowFile will be routed to this relationship") + .build(); + + static final List<PropertyDescriptor> properties = Arrays.asList(RECORD_READER, SCHEMA_CACHE_SIZE); + + private LoadingCache<RecordSchema, String> avroSchemaTextCache; + + @Override + protected List<PropertyDescriptor> getSupportedPropertyDescriptors() { + return properties; + } + + @Override + public Set<Relationship> getRelationships() { + final Set<Relationship> relationships = new HashSet<>(); + relationships.add(REL_SUCCESS); + 
relationships.add(REL_FAILURE); + return relationships; + } + + @OnScheduled + public void setup(ProcessContext context) { + final int cacheSize = context.getProperty(SCHEMA_CACHE_SIZE).asInteger(); + avroSchemaTextCache = Caffeine.newBuilder() + .maximumSize(cacheSize) + .build(schema -> AvroTypeUtil.extractAvroSchema(schema).toString()); + } + + @Override + public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException { + FlowFile flowFile = session.get(); + if (flowFile == null) { + return; + } + + final RecordReaderFactory readerFactory = context.getProperty(RECORD_READER).asControllerService(RecordReaderFactory.class); + final Map<String, String> originalAttributes = flowFile.getAttributes(); + final RecordSchema recordSchema; + try (final InputStream inputStream = session.read(flowFile); + final RecordReader reader = readerFactory.createRecordReader(originalAttributes, inputStream, flowFile.getSize(), getLogger())) { + recordSchema = reader.getSchema(); + + } catch (final Exception e) { + getLogger().error("Failed to process {}; will route to failure", flowFile, e); + // Since we are wrapping the exceptions above there should always be a cause + // but it's possible it might not have a message. This handles that by logging + // the name of the class thrown. + Throwable c = e.getCause(); + if (c == null) { + session.putAttribute(flowFile, "record.error.message", e.getClass().getCanonicalName() + " Thrown"); + } else { + session.putAttribute(flowFile, "record.error.message", (c.getLocalizedMessage() != null) ? 
c.getLocalizedMessage() : c.getClass().getCanonicalName() + " Thrown"); + } + session.transfer(flowFile, REL_FAILURE); + return; + } + + session.putAttribute(flowFile, SCHEMA_ATTRIBUTE_NAME, avroSchemaTextCache.get(recordSchema)); + session.transfer(flowFile, REL_SUCCESS); + } +} diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/HandleHttpRequest.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/HandleHttpRequest.java index 39e598f880..e3dda03afe 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/HandleHttpRequest.java +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/HandleHttpRequest.java @@ -69,8 +69,11 @@ import java.io.UnsupportedEncodingException; import java.net.URI; import java.net.URLDecoder; import java.security.Principal; +import java.security.cert.CertificateParsingException; import java.security.cert.X509Certificate; import java.util.ArrayList; +import java.util.Base64; +import java.util.Collection; import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; @@ -769,6 +772,38 @@ public class HandleHttpRequest extends AbstractProcessor { return session.putAllAttributes(flowFile, attributes); } + private void putCertificateAttributes(final X509Certificate certificate, final Map<String, String> attributes) { + putAttribute(attributes, "http.subject.dn", certificate.getSubjectX500Principal().getName()); + putAttribute(attributes, "http.issuer.dn", certificate.getIssuerX500Principal().getName()); + + try { + final Collection<List<?>> subjectAlternativeNames = certificate.getSubjectAlternativeNames(); + + if (subjectAlternativeNames != null) { + int subjectAlternativeNameIndex = 0; + for (final List<?> subjectAlternativeTypeName : subjectAlternativeNames) { + final String 
nameTypeAttributeKey = String.format("http.client.certificate.sans.%d.nameType", subjectAlternativeNameIndex); + final String nameType = subjectAlternativeTypeName.get(0).toString(); + putAttribute(attributes, nameTypeAttributeKey, nameType); + + final String nameAttributeKey = String.format("http.client.certificate.sans.%d.name", subjectAlternativeNameIndex); + final Object name = subjectAlternativeTypeName.get(1); + + final String serializedName; + if (name instanceof byte[]) { + final byte[] encodedName = (byte[]) name; + serializedName = Base64.getEncoder().encodeToString(encodedName); + } else { + serializedName = name.toString(); + } + putAttribute(attributes, nameAttributeKey, serializedName); subjectAlternativeNameIndex++; + } + } + } catch (final CertificateParsingException e) { + getLogger().info("Read X.509 Client Certificate Subject Alternative Names failed", e); + } + } + private void forwardFlowFile(final ProcessSession session, final long start, final HttpServletRequest request, final FlowFile flowFile) { final long receiveMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start); final String subjectDn = flowFile.getAttribute(HTTPUtils.HTTP_SSL_CERT); diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor index eb01b54c9f..38ef7d0267 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/META-INF/services/org.apache.nifi.processor.Processor @@ -39,6 +39,7 @@ org.apache.nifi.processors.standard.ExecuteSQL org.apache.nifi.processors.standard.ExecuteSQLRecord org.apache.nifi.processors.standard.ExecuteStreamCommand org.apache.nifi.processors.standard.ExtractGrok 
+org.apache.nifi.processors.standard.ExtractRecordSchema org.apache.nifi.processors.standard.ExtractText org.apache.nifi.processors.standard.FetchDistributedMapCache org.apache.nifi.processors.standard.FetchFile diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractRecordSchema.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractRecordSchema.java new file mode 100644 index 0000000000..9e2e64db72 --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractRecordSchema.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nifi.processors.standard; + +import org.apache.nifi.logging.ComponentLog; +import org.apache.nifi.schema.access.SchemaNotFoundException; +import org.apache.nifi.serialization.RecordReader; +import org.apache.nifi.serialization.record.MockRecordParser; +import org.apache.nifi.serialization.record.RecordFieldType; +import org.apache.nifi.util.MockFlowFile; +import org.apache.nifi.util.TestRunner; +import org.apache.nifi.util.TestRunners; +import org.junit.jupiter.api.Test; + +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; + +public class TestExtractRecordSchema { + + final static Path NAME_AGE_SCHEMA_PATH = Paths.get("src/test/resources/TestExtractRecordSchema/name_age_schema.avsc"); + + @Test + public void testSuccessfulExtraction() throws Exception { + final MockRecordParser readerService = new MockRecordParser(); + final TestRunner runner = TestRunners.newTestRunner(ExtractRecordSchema.class); + runner.addControllerService("reader", readerService); + runner.enableControllerService(readerService); + runner.setProperty(ExtractRecordSchema.RECORD_READER, "reader"); + + readerService.addSchemaField("name", RecordFieldType.STRING); + readerService.addSchemaField("age", RecordFieldType.INT); + + readerService.addRecord("John Doe", 48); + readerService.addRecord("Jane Doe", 47); + readerService.addRecord("Jimmy Doe", 14); + + runner.enqueue(""); + runner.run(); + + runner.assertAllFlowFilesTransferred(ExtractRecordSchema.REL_SUCCESS, 1); + final MockFlowFile out = runner.getFlowFilesForRelationship(ExtractRecordSchema.REL_SUCCESS).get(0); + + final String expectedAttributeValue = new String(Files.readAllBytes(NAME_AGE_SCHEMA_PATH)); + out.assertAttributeEquals(ExtractRecordSchema.SCHEMA_ATTRIBUTE_NAME, expectedAttributeValue); + } + + @Test + public void testNoSchema() throws Exception { + final MockRecordParser readerService = new 
MockRecordParserSchemaNotFound(); + final TestRunner runner = TestRunners.newTestRunner(ExtractRecordSchema.class); + runner.addControllerService("reader", readerService); + runner.enableControllerService(readerService); + runner.setProperty(ExtractRecordSchema.RECORD_READER, "reader"); + + runner.enqueue(""); + runner.run(); + + runner.assertAllFlowFilesTransferred(ExtractRecordSchema.REL_FAILURE, 1); + final MockFlowFile out = runner.getFlowFilesForRelationship(ExtractRecordSchema.REL_FAILURE).get(0); + + out.assertAttributeEquals("record.error.message", "org.apache.nifi.schema.access.SchemaNotFoundException Thrown"); + } + + private static class MockRecordParserSchemaNotFound extends MockRecordParser { + @Override + public RecordReader createRecordReader(Map<String, String> variables, InputStream in, long inputLength, ComponentLog logger) throws SchemaNotFoundException { + throw new SchemaNotFoundException("test"); + } + } +} \ No newline at end of file diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestExtractRecordSchema/name_age_schema.avsc b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestExtractRecordSchema/name_age_schema.avsc new file mode 100644 index 0000000000..c349193e84 --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestExtractRecordSchema/name_age_schema.avsc @@ -0,0 +1 @@ +{"type":"record","name":"nifiRecord","namespace":"org.apache.nifi","fields":[{"name":"name","type":["string","null"]},{"name":"age","type":["int","null"]}]} \ No newline at end of file