From f738e19a757675228f8e594ef1461ce5084eeb56 Mon Sep 17 00:00:00 2001 From: John Highcock Date: Fri, 24 Jan 2020 16:22:27 -0500 Subject: [PATCH] NIFI-2537: Add custom MIME type configuration support to IdentifyMimeType Add two new properties to IdentifyMimeType (Config File and Config Body). Specifying one of these properties will override the default NiFi MIME type configuration and use the configured property's MIME config instead. Add additional runtime documentation for IdentifyMimeType's usage. The default behavior of IdentifyMimeType is unchanged. Signed-off-by: Pierre Villard This closes #4016. --- .../nifi-standard-processors/pom.xml | 2 + .../processors/standard/IdentifyMimeType.java | 98 ++++++++- .../additionalDetails.html | 25 ++- .../standard/TestIdentifyMimeType.java | 200 ++++++++++++++++++ .../TestIdentifyMimeType/.customConfig.xml | 34 +++ .../resources/TestIdentifyMimeType/2.custom | 1 + 6 files changed, 350 insertions(+), 10 deletions(-) create mode 100644 nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/.customConfig.xml create mode 100644 nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/2.custom diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml index 67cf93dec6..568b4d50fc 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml @@ -526,6 +526,8 @@ src/test/resources/TestIdentifyMimeType/1.txt.gz src/test/resources/TestIdentifyMimeType/1.zip src/test/resources/TestIdentifyMimeType/flowfilev1.tar + src/test/resources/TestIdentifyMimeType/2.custom + src/test/resources/TestIdentifyMimeType/.customConfig.xml src/test/resources/TestUnpackContent/data.tar src/test/resources/TestUnpackContent/data.zip 
src/test/resources/TestUnpackContent/invalid_data.zip diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java index c259e881d5..d6ebd39e50 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java @@ -17,6 +17,9 @@ package org.apache.nifi.processors.standard; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.util.Collection; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -32,9 +35,15 @@ import org.apache.nifi.annotation.behavior.InputRequirement.Requirement; import org.apache.nifi.annotation.behavior.SideEffectFree; import org.apache.nifi.annotation.behavior.SupportsBatching; import org.apache.nifi.annotation.behavior.WritesAttribute; +import org.apache.nifi.annotation.behavior.WritesAttributes; +import org.apache.nifi.annotation.lifecycle.OnScheduled; import org.apache.nifi.annotation.documentation.CapabilityDescription; import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.components.Validator; +import org.apache.nifi.components.ValidationContext; +import org.apache.nifi.components.ValidationResult; +import org.apache.nifi.expression.ExpressionLanguageScope; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.flowfile.attributes.CoreAttributes; import org.apache.nifi.logging.ComponentLog; @@ -44,6 +53,8 @@ import org.apache.nifi.processor.ProcessSession; import 
org.apache.nifi.processor.ProcessorInitializationContext; import org.apache.nifi.processor.Relationship; import org.apache.nifi.processor.io.InputStreamCallback; +import org.apache.nifi.processor.exception.ProcessException; +import org.apache.nifi.processor.util.StandardValidators; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; @@ -51,8 +62,11 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.mime.MimeTypesFactory; import org.apache.tika.mime.MimeTypeException; + /** *

* Attempts to detect the MIME Type of a FlowFile by examining its contents. If the MIME Type is determined, it is added @@ -76,9 +90,16 @@ import org.apache.tika.mime.MimeTypeException; @CapabilityDescription("Attempts to identify the MIME Type used for a FlowFile. If the MIME Type can be identified, " + "an attribute with the name 'mime.type' is added with the value being the MIME Type. If the MIME Type cannot be determined, " + "the value will be set to 'application/octet-stream'. In addition, the attribute mime.extension will be set if a common file " - + "extension for the MIME Type is known.") + + "extension for the MIME Type is known. If both Config File and Config Body are not set, the default NiFi MIME Types will " + + "be used.") +@WritesAttributes({ @WritesAttribute(attribute = "mime.type", description = "This Processor sets the FlowFile's mime.type attribute to the detected MIME Type. " - + "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream") + + "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream"), +@WritesAttribute(attribute = "mime.extension", description = "This Processor sets the FlowFile's mime.extension attribute to the file " + + "extension associated with the detected MIME Type. " + + "If there is no correlated extension, the attribute's value will be empty") +} +) public class IdentifyMimeType extends AbstractProcessor { public static final PropertyDescriptor USE_FILENAME_IN_DETECTION = new PropertyDescriptor.Builder() @@ -90,6 +111,24 @@ public class IdentifyMimeType extends AbstractProcessor { .defaultValue("true") .build(); + public static final PropertyDescriptor MIME_CONFIG_FILE = new PropertyDescriptor.Builder() + .displayName("Config File") + .name("config-file") + .required(false) + .description("Path to MIME type config file. 
Only one of Config File or Config Body may be used.") + .addValidator(new StandardValidators.FileExistsValidator(true)) + .expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY) + .build(); + + public static final PropertyDescriptor MIME_CONFIG_BODY = new PropertyDescriptor.Builder() + .displayName("Config Body") + .name("config-body") + .required(false) + .description("Body of MIME type config file. Only one of Config File or Config Body may be used.") + .addValidator(Validator.VALID) + .expressionLanguageSupported(ExpressionLanguageScope.NONE) + .build(); + public static final Relationship REL_SUCCESS = new Relationship.Builder() .name("success") .description("All FlowFiles are routed to success") @@ -99,12 +138,11 @@ public class IdentifyMimeType extends AbstractProcessor { private List properties; private final TikaConfig config; - private final Detector detector; + private Detector detector; + private MimeTypes mimeTypes; public IdentifyMimeType() { - // Setup Tika this.config = TikaConfig.getDefaultConfig(); - this.detector = config.getDetector(); } @Override @@ -112,6 +150,8 @@ public class IdentifyMimeType extends AbstractProcessor { final List properties = new ArrayList<>(); properties.add(USE_FILENAME_IN_DETECTION); + properties.add(MIME_CONFIG_BODY); + properties.add(MIME_CONFIG_FILE); this.properties = Collections.unmodifiableList(properties); final Set rels = new HashSet<>(); @@ -119,6 +159,35 @@ public class IdentifyMimeType extends AbstractProcessor { this.relationships = Collections.unmodifiableSet(rels); } + @OnScheduled + public void setup(final ProcessContext context) { + String configBody = context.getProperty(MIME_CONFIG_BODY).getValue(); + String configFile = context.getProperty(MIME_CONFIG_FILE).evaluateAttributeExpressions().getValue(); + + if (configBody == null && configFile == null){ + this.detector = config.getDetector(); + this.mimeTypes = config.getMimeRepository(); + } else if (configBody != null) { + try { + 
this.detector = MimeTypesFactory.create(new ByteArrayInputStream(configBody.getBytes(java.nio.charset.StandardCharsets.UTF_8))); /* explicit charset so the bytes handed to Tika do not depend on the JVM default; fully qualified so the import hunk stays untouched */ + this.mimeTypes = (MimeTypes)this.detector; + } catch (Exception e) { + context.yield(); + throw new ProcessException("Failed to load config body", e); + } + + } else { + try (final InputStream configStream = new FileInputStream(configFile)) { /* try-with-resources: the config file handle is closed when this block exits, on success or failure */ + this.detector = MimeTypesFactory.create(configStream); + this.mimeTypes = (MimeTypes)this.detector; + } catch (Exception e) { + context.yield(); + throw new ProcessException("Failed to load config file", e); + } + } + } + + @Override public Set getRelationships() { return relationships; @@ -161,7 +230,7 @@ public class IdentifyMimeType extends AbstractProcessor { String extension = ""; try { MimeType mimetype; - mimetype = config.getMimeRepository().forName(mimeType); + mimetype = mimeTypes.forName(mimeType); extension = mimetype.getExtension(); } catch (MimeTypeException ex) { logger.warn("MIME type extension lookup failed: {}", new Object[]{ex}); @@ -185,4 +254,21 @@ public class IdentifyMimeType extends AbstractProcessor { session.getProvenanceReporter().modifyAttributes(flowFile); session.transfer(flowFile, REL_SUCCESS); } + + @Override + protected Collection<ValidationResult> customValidate(ValidationContext validationContext) { /* adds one invalid result when both Config Body and Config File are set; otherwise returns an empty set */ + Set<ValidationResult> results = new HashSet<>(); + String body = validationContext.getProperty(MIME_CONFIG_BODY).getValue(); + String file = validationContext.getProperty(MIME_CONFIG_FILE).getValue(); + if(body != null && file != null) { + results.add(new ValidationResult.Builder() + .subject(MIME_CONFIG_FILE.getDisplayName()) /* subject names the property being validated */ + .input(file) /* input carries the offending value */ + .valid(false) + .explanation("Can only specify Config Body or Config File. 
Not both.") + .build()); + } + return results; + } + } diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html index bc331f6f09..015d6a0bdc 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html @@ -22,7 +22,7 @@ -

The following is a non-exhaustive list of MIME Types detected: +

The following is a non-exhaustive list of MIME Types detected by default in NiFi:

-

For a complete list, please refer to - - Apache Tika's source code + +

An example value for the Config Body property that will identify a file whose contents start with "abcd" as MIME Type "custom/abcd" + and with extension ".abcd" would look like the following: +

+ +

For a more complete list of Tika's default types (and additional details regarding customization of + the value for the Config Body property), please refer to + + + Apache Tika's documentation

diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java index dc611135b9..71ede0b2db 100644 --- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java @@ -74,6 +74,7 @@ public class TestIdentifyMimeType { expectedMimeTypes.put("flowfilev3", "application/flowfile-v3"); expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1"); expectedMimeTypes.put("fake.csv", "text/csv"); + expectedMimeTypes.put("2.custom", "text/plain"); final Map expectedExtensions = new HashMap<>(); expectedExtensions.put("1.7z", ".7z"); @@ -94,6 +95,7 @@ public class TestIdentifyMimeType { expectedExtensions.put("flowfilev3", ""); expectedExtensions.put("flowfilev1.tar", ""); expectedExtensions.put("fake.csv", ".csv"); + expectedExtensions.put("2.custom", ".txt"); final List filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS); for (final MockFlowFile file : filesOut) { @@ -122,4 +124,202 @@ public class TestIdentifyMimeType { flowFile.assertAttributeEquals("mime.extension", ".txt"); flowFile.assertAttributeEquals("mime.type", "text/plain"); } + + @Test + public void testConfigBody() throws IOException { /* enqueues every fixture file and checks detection results when an inline Config Body is set */ + final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType()); + + + final File dir = new File("src/test/resources/TestIdentifyMimeType"); + final File[] files = dir.listFiles((ldir,name)-> name != null && !name.startsWith(".")); /* names starting with "." are filtered out, so .customConfig.xml is never enqueued */ + int fileCount = 0; + for (final File file : files) { + if (file.isDirectory()) { + continue; + } + + runner.enqueue(file.toPath()); + fileCount++; + } + + + 
String configBody = "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " PNG\n" + + " <_comment>Portable Network Graphics\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + ""; + runner.setProperty(IdentifyMimeType.MIME_CONFIG_BODY, configBody); /* only Config Body is set; Config File stays unset */ + + runner.setThreadCount(1); + runner.run(fileCount); + + + runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount); + + final Map expectedMimeTypes = new HashMap<>(); /* expected mime.type per fixture filename under the custom body */ + expectedMimeTypes.put("1.7z", "application/octet-stream"); + expectedMimeTypes.put("1.mdb", "application/octet-stream"); + expectedMimeTypes.put("1.txt", "text/plain"); + expectedMimeTypes.put("1.csv", "text/plain"); + expectedMimeTypes.put("1.txt.bz2", "application/octet-stream"); + expectedMimeTypes.put("1.txt.gz", "application/octet-stream"); + expectedMimeTypes.put("1.zip", "application/octet-stream"); + expectedMimeTypes.put("bgBannerFoot.png", "image/png"); + expectedMimeTypes.put("blueBtnBg.jpg", "application/octet-stream"); + expectedMimeTypes.put("1.pdf", "application/octet-stream"); + expectedMimeTypes.put("grid.gif", "application/octet-stream"); + expectedMimeTypes.put("1.tar", "application/octet-stream"); + expectedMimeTypes.put("1.tar.gz", "application/octet-stream"); + expectedMimeTypes.put("1.jar", "application/octet-stream"); + expectedMimeTypes.put("1.xml", "text/plain"); + expectedMimeTypes.put("flowfilev3", "application/octet-stream"); + expectedMimeTypes.put("flowfilev1.tar", "application/octet-stream"); + expectedMimeTypes.put("fake.csv", "text/plain"); + expectedMimeTypes.put("2.custom", "custom/abcd"); + + final Map expectedExtensions = new HashMap<>(); /* expected mime.extension per fixture filename; empty string where none is expected */ + expectedExtensions.put("1.7z", ""); + expectedExtensions.put("1.mdb", ""); + expectedExtensions.put("1.txt", ""); + expectedExtensions.put("1.csv", ""); + expectedExtensions.put("1.txt.bz2", ""); + expectedExtensions.put("1.txt.gz", ""); + expectedExtensions.put("1.zip", ""); + 
expectedExtensions.put("bgBannerFoot.png", ".customPng"); + expectedExtensions.put("blueBtnBg.jpg", ""); + expectedExtensions.put("1.pdf", ""); + expectedExtensions.put("grid.gif", ""); + expectedExtensions.put("1.tar", ""); + expectedExtensions.put("1.tar.gz", ""); + expectedExtensions.put("1.jar", ""); + expectedExtensions.put("1.xml", ""); + expectedExtensions.put("flowfilev3", ""); + expectedExtensions.put("flowfilev1.tar", ""); + expectedExtensions.put("fake.csv", ""); + expectedExtensions.put("2.custom", ".abcd"); + + final List filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS); + for (final MockFlowFile file : filesOut) { /* compare each output FlowFile's attributes with the tables above, keyed by its filename attribute */ + final String filename = file.getAttribute(CoreAttributes.FILENAME.key()); + final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key()); + final String expected = expectedMimeTypes.get(filename); + + final String extension = file.getAttribute("mime.extension"); + final String expectedExtension = expectedExtensions.get(filename); + + assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType); + assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension); + } + } + + @Test + public void testConfigFile() throws IOException { /* same flow as testConfigBody, but the custom types come from the Config File property */ + final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType()); + + + final File dir = new File("src/test/resources/TestIdentifyMimeType"); + final File[] files = dir.listFiles((ldir,name)-> name != null && !name.startsWith(".")); /* names starting with "." are filtered out, so the config file itself is not enqueued */ + int fileCount = 0; + for (final File file : files) { + if (file.isDirectory()) { + continue; + } + + runner.enqueue(file.toPath()); + fileCount++; + } + + + String configFile = "src/test/resources/TestIdentifyMimeType/.customConfig.xml"; + runner.setProperty(IdentifyMimeType.MIME_CONFIG_FILE, configFile); /* only Config File is set; Config Body stays unset */ + + runner.setThreadCount(1); + runner.run(fileCount); + + + 
runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount); + + final Map expectedMimeTypes = new HashMap<>(); /* expected mime.type per fixture filename under the custom config file */ + expectedMimeTypes.put("1.7z", "application/octet-stream"); + expectedMimeTypes.put("1.mdb", "application/octet-stream"); + expectedMimeTypes.put("1.txt", "text/plain"); + expectedMimeTypes.put("1.csv", "text/plain"); + expectedMimeTypes.put("1.txt.bz2", "application/octet-stream"); + expectedMimeTypes.put("1.txt.gz", "application/octet-stream"); + expectedMimeTypes.put("1.zip", "application/octet-stream"); + expectedMimeTypes.put("bgBannerFoot.png", "my/png"); + expectedMimeTypes.put("blueBtnBg.jpg", "my/jpeg"); + expectedMimeTypes.put("1.pdf", "application/octet-stream"); + expectedMimeTypes.put("grid.gif", "my/gif"); + expectedMimeTypes.put("1.tar", "application/octet-stream"); + expectedMimeTypes.put("1.tar.gz", "application/octet-stream"); + expectedMimeTypes.put("1.jar", "application/octet-stream"); + expectedMimeTypes.put("1.xml", "text/plain"); + expectedMimeTypes.put("flowfilev3", "application/octet-stream"); + expectedMimeTypes.put("flowfilev1.tar", "application/octet-stream"); + expectedMimeTypes.put("fake.csv", "text/plain"); + expectedMimeTypes.put("2.custom", "text/plain"); + + final Map expectedExtensions = new HashMap<>(); /* expected mime.extension per fixture filename; empty string where none is expected */ + expectedExtensions.put("1.7z", ""); + expectedExtensions.put("1.mdb", ""); + expectedExtensions.put("1.txt", ""); + expectedExtensions.put("1.csv", ""); + expectedExtensions.put("1.txt.bz2", ""); + expectedExtensions.put("1.txt.gz", ""); + expectedExtensions.put("1.zip", ""); + expectedExtensions.put("bgBannerFoot.png", ".mypng"); + expectedExtensions.put("blueBtnBg.jpg", ".myjpg"); + expectedExtensions.put("1.pdf", ""); + expectedExtensions.put("grid.gif", ".mygif"); + expectedExtensions.put("1.tar", ""); + expectedExtensions.put("1.tar.gz", ""); + expectedExtensions.put("1.jar", ""); + expectedExtensions.put("1.xml", ""); + expectedExtensions.put("flowfilev3", ""); + 
expectedExtensions.put("flowfilev1.tar", ""); + expectedExtensions.put("fake.csv", ""); + expectedExtensions.put("2.custom", ""); + + final List filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS); + for (final MockFlowFile file : filesOut) { /* compare each output FlowFile's attributes with the tables above, keyed by its filename attribute */ + final String filename = file.getAttribute(CoreAttributes.FILENAME.key()); + final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key()); + final String expected = expectedMimeTypes.get(filename); + + final String extension = file.getAttribute("mime.extension"); + final String expectedExtension = expectedExtensions.get(filename); + + assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType); + assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension); + } + } + + @Test(expected=AssertionError.class) /* the run must fail: both Config File and Config Body are set below; presumably the failure comes from customValidate rejecting the combination - confirm */ + public void testOnlyOneCustomMimeConfigSpecified() throws IOException { + final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType()); + + String configFile = "src/test/resources/TestIdentifyMimeType/.customConfig.xml"; + runner.setProperty(IdentifyMimeType.MIME_CONFIG_FILE, configFile); + + String configBody = "foo"; + runner.setProperty(IdentifyMimeType.MIME_CONFIG_BODY, configBody); + + runner.setThreadCount(1); + runner.run(); + + } + } diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/.customConfig.xml b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/.customConfig.xml new file mode 100644 index 0000000000..fec5cb89b0 --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/.customConfig.xml @@ -0,0 +1,34 @@ + + + + GIF + <_comment>Graphics Interchange Format + http://en.wikipedia.org/wiki/Gif + com.compuserve.gif + + + + + + + + PNG + 
<_comment>Portable Network Graphics + + + + + + + JPEG + <_comment>Joint Photographic Experts Group + http://en.wikipedia.org/wiki/Jpeg + public.jpeg + + + + + + + + diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/2.custom b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/2.custom new file mode 100644 index 0000000000..acbe86c7c8 --- /dev/null +++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/2.custom @@ -0,0 +1 @@ +abcd