diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
index 67cf93dec6..568b4d50fc 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
@@ -526,6 +526,8 @@
                         <exclude>src/test/resources/TestIdentifyMimeType/1.txt.gz</exclude>
                         <exclude>src/test/resources/TestIdentifyMimeType/1.zip</exclude>
                         <exclude>src/test/resources/TestIdentifyMimeType/flowfilev1.tar</exclude>
+                        <exclude>src/test/resources/TestIdentifyMimeType/2.custom</exclude>
+                        <exclude>src/test/resources/TestIdentifyMimeType/.customConfig.xml</exclude>
                         <exclude>src/test/resources/TestUnpackContent/data.tar</exclude>
                         <exclude>src/test/resources/TestUnpackContent/data.zip</exclude>
                         <exclude>src/test/resources/TestUnpackContent/invalid_data.zip</exclude>
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
index c259e881d5..d6ebd39e50 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
@@ -17,6 +17,9 @@
 package org.apache.nifi.processors.standard;
 
 import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.util.Collection;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
@@ -32,9 +35,15 @@ import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
 import org.apache.nifi.annotation.behavior.SideEffectFree;
 import org.apache.nifi.annotation.behavior.SupportsBatching;
 import org.apache.nifi.annotation.behavior.WritesAttribute;
+import org.apache.nifi.annotation.behavior.WritesAttributes;
+import org.apache.nifi.annotation.lifecycle.OnScheduled;
 import org.apache.nifi.annotation.documentation.CapabilityDescription;
 import org.apache.nifi.annotation.documentation.Tags;
 import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.components.Validator;
+import org.apache.nifi.components.ValidationContext;
+import org.apache.nifi.components.ValidationResult;
+import org.apache.nifi.expression.ExpressionLanguageScope;
 import org.apache.nifi.flowfile.FlowFile;
 import org.apache.nifi.flowfile.attributes.CoreAttributes;
 import org.apache.nifi.logging.ComponentLog;
@@ -44,6 +53,8 @@ import org.apache.nifi.processor.ProcessSession;
 import org.apache.nifi.processor.ProcessorInitializationContext;
 import org.apache.nifi.processor.Relationship;
 import org.apache.nifi.processor.io.InputStreamCallback;
+import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.nifi.processor.util.StandardValidators;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.io.TikaInputStream;
@@ -51,8 +62,11 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
 import org.apache.tika.mime.MimeTypeException;
 
+
 /**
  * <p>
  * Attempts to detect the MIME Type of a FlowFile by examining its contents. If the MIME Type is determined, it is added
@@ -76,9 +90,16 @@ import org.apache.tika.mime.MimeTypeException;
 @CapabilityDescription("Attempts to identify the MIME Type used for a FlowFile. If the MIME Type can be identified, "
         + "an attribute with the name 'mime.type' is added with the value being the MIME Type. If the MIME Type cannot be determined, "
         + "the value will be set to 'application/octet-stream'. In addition, the attribute mime.extension will be set if a common file "
-        + "extension for the MIME Type is known.")
+        + "extension for the MIME Type is known. If both Config File and Config Body are not set, the default NiFi MIME Types will "
+        + "be used.")
+@WritesAttributes({
 @WritesAttribute(attribute = "mime.type", description = "This Processor sets the FlowFile's mime.type attribute to the detected MIME Type. "
-        + "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream")
+        + "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream"),
+@WritesAttribute(attribute = "mime.extension", description = "This Processor sets the FlowFile's mime.extension attribute to the file "
+        + "extension associated with the detected MIME Type. "
+        + "If there is no correlated extension, the attribute's value will be empty")
+}
+)
 public class IdentifyMimeType extends AbstractProcessor {
 
     public static final PropertyDescriptor USE_FILENAME_IN_DETECTION = new PropertyDescriptor.Builder()
@@ -90,6 +111,24 @@ public class IdentifyMimeType extends AbstractProcessor {
             .defaultValue("true")
             .build();
 
+    public static final PropertyDescriptor MIME_CONFIG_FILE = new PropertyDescriptor.Builder()
+            .displayName("Config File")
+            .name("config-file")
+            .required(false)
+            .description("Path to MIME type config file. Only one of Config File or Config Body may be used.")
+            .addValidator(new StandardValidators.FileExistsValidator(true))
+            .expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
+            .build();
+
+    public static final PropertyDescriptor MIME_CONFIG_BODY = new PropertyDescriptor.Builder()
+            .displayName("Config Body")
+            .name("config-body")
+            .required(false)
+            .description("Body of MIME type config file. Only one of Config File or Config Body may be used.")
+            .addValidator(Validator.VALID)
+            .expressionLanguageSupported(ExpressionLanguageScope.NONE)
+            .build();
+
     public static final Relationship REL_SUCCESS = new Relationship.Builder()
             .name("success")
             .description("All FlowFiles are routed to success")
@@ -99,12 +138,11 @@ public class IdentifyMimeType extends AbstractProcessor {
     private List<PropertyDescriptor> properties;
 
     private final TikaConfig config;
-    private final Detector detector;
+    private Detector detector;
+    private MimeTypes mimeTypes;
 
     public IdentifyMimeType() {
-        // Setup Tika
         this.config = TikaConfig.getDefaultConfig();
-        this.detector = config.getDetector();
     }
 
     @Override
@@ -112,6 +150,8 @@ public class IdentifyMimeType extends AbstractProcessor {
 
         final List<PropertyDescriptor> properties = new ArrayList<>();
         properties.add(USE_FILENAME_IN_DETECTION);
+        properties.add(MIME_CONFIG_BODY);
+        properties.add(MIME_CONFIG_FILE);
         this.properties = Collections.unmodifiableList(properties);
 
         final Set<Relationship> rels = new HashSet<>();
@@ -119,6 +159,46 @@ public class IdentifyMimeType extends AbstractProcessor {
         this.relationships = Collections.unmodifiableSet(rels);
     }
 
+    /**
+     * Selects the Tika detector and MIME type repository when the processor is scheduled.
+     * Config Body is used when set, otherwise Config File; with neither, Tika's default
+     * configuration is used.
+     *
+     * @param context supplies the Config Body and Config File property values
+     * @throws ProcessException if the custom MIME configuration cannot be loaded
+     */
+    @OnScheduled
+    public void setup(final ProcessContext context) {
+        final String configBody = context.getProperty(MIME_CONFIG_BODY).getValue();
+        final String configFile = context.getProperty(MIME_CONFIG_FILE).evaluateAttributeExpressions().getValue();
+
+        if (configBody == null && configFile == null) {
+            this.detector = config.getDetector();
+            this.mimeTypes = config.getMimeRepository();
+        } else if (configBody != null) {
+            // Encode explicitly as UTF-8 so parsing does not depend on the platform default charset.
+            try (InputStream in = new ByteArrayInputStream(configBody.getBytes(java.nio.charset.StandardCharsets.UTF_8))) {
+                final MimeTypes customTypes = MimeTypesFactory.create(in);
+                this.detector = customTypes;
+                this.mimeTypes = customTypes;
+            } catch (Exception e) {
+                context.yield();
+                throw new ProcessException("Failed to load config body", e);
+            }
+        } else {
+            // Close the file handle ourselves; the factory is not assumed to close the stream it reads.
+            try (InputStream in = new FileInputStream(configFile)) {
+                final MimeTypes customTypes = MimeTypesFactory.create(in);
+                this.detector = customTypes;
+                this.mimeTypes = customTypes;
+            } catch (Exception e) {
+                context.yield();
+                throw new ProcessException("Failed to load config file", e);
+            }
+        }
+    }
+
+
     @Override
     public Set<Relationship> getRelationships() {
         return relationships;
@@ -161,7 +241,7 @@ public class IdentifyMimeType extends AbstractProcessor {
         String extension = "";
         try {
             MimeType mimetype;
-            mimetype = config.getMimeRepository().forName(mimeType);
+            mimetype = mimeTypes.forName(mimeType);
             extension = mimetype.getExtension();
         } catch (MimeTypeException ex) {
             logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
@@ -185,4 +265,29 @@ public class IdentifyMimeType extends AbstractProcessor {
         session.getProvenanceReporter().modifyAttributes(flowFile);
         session.transfer(flowFile, REL_SUCCESS);
     }
+
+    /**
+     * Rejects configurations that set both Config Body and Config File, since only one
+     * of the two may be specified.
+     *
+     * @param validationContext provides the configured property values
+     * @return a single invalid result when both properties are set, otherwise an empty collection
+     */
+    @Override
+    protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
+        final Set<ValidationResult> results = new HashSet<>();
+        final String body = validationContext.getProperty(MIME_CONFIG_BODY).getValue();
+        final String file = validationContext.getProperty(MIME_CONFIG_FILE).getValue();
+        if (body != null && file != null) {
+            // subject names the offending property; input carries its configured value.
+            results.add(new ValidationResult.Builder()
+                    .subject(MIME_CONFIG_FILE.getDisplayName())
+                    .input(file)
+                    .valid(false)
+                    .explanation("Can only specify Config Body or Config File. Not both.")
+                    .build());
+        }
+        return results;
+    }
+
 }
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html
index bc331f6f09..015d6a0bdc 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.IdentifyMimeType/additionalDetails.html
@@ -22,7 +22,7 @@
-

The following is a non-exhaustive list of MIME Types detected: +

The following is a non-exhaustive list of MIME Types detected by default in NiFi:

-

For a complete list, please refer to - - Apache Tika's source code + +

An example value for the Config Body property that will identify a file whose contents start with "abcd" as MIME Type "custom/abcd" + and with extension ".abcd" would look like the following: +

+ +

For a more complete list of Tika's default types (and additional details regarding customization of + the value for the Config Body property), please refer to + + + Apache Tika's documentation

diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java
index dc611135b9..71ede0b2db 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java
@@ -74,6 +74,7 @@ public class TestIdentifyMimeType {
         expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
         expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
         expectedMimeTypes.put("fake.csv", "text/csv");
+        expectedMimeTypes.put("2.custom", "text/plain");
 
         final Map<String, String> expectedExtensions = new HashMap<>();
         expectedExtensions.put("1.7z", ".7z");
@@ -94,6 +95,7 @@ public class TestIdentifyMimeType {
         expectedExtensions.put("flowfilev3", "");
         expectedExtensions.put("flowfilev1.tar", "");
         expectedExtensions.put("fake.csv", ".csv");
+        expectedExtensions.put("2.custom", ".txt");
 
         final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
         for (final MockFlowFile file : filesOut) {
@@ -122,4 +124,205 @@ public class TestIdentifyMimeType {
         flowFile.assertAttributeEquals("mime.extension", ".txt");
         flowFile.assertAttributeEquals("mime.type", "text/plain");
     }
+
+    /** With an inline Config Body, only the types it defines (plus Tika's text/binary fallbacks) are reported. */
+    @Test
+    public void testConfigBody() throws IOException {
+        final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
+
+
+        final File dir = new File("src/test/resources/TestIdentifyMimeType");
+        final File[] files = dir.listFiles((ldir,name)-> name != null && !name.startsWith("."));
+        int fileCount = 0;
+        for (final File file : files) {
+            if (file.isDirectory()) {
+                continue;
+            }
+
+            runner.enqueue(file.toPath());
+            fileCount++;
+        }
+
+
+        String configBody = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+                "<mime-info>\n" +
+                "  <mime-type type=\"custom/abcd\">\n" +
+                "    <magic priority=\"50\">\n" +
+                "      <match value=\"abcd\" type=\"string\" offset=\"0\"/>\n" +
+                "    </magic>\n" +
+                "    <glob pattern=\"*.abcd\" />\n" +
+                "  </mime-type>\n" +
+                "  <mime-type type=\"image/png\">\n" +
+                "    <acronym>PNG</acronym>\n" +
+                "    <_comment>Portable Network Graphics</_comment>\n" +
+                "    <magic priority=\"50\">\n" +
+                "      <match value=\"\\x89PNG\\x0d\\x0a\\x1a\\x0a\" type=\"string\" offset=\"0\"/>\n" +
+                "    </magic>\n" +
+                "    <glob pattern=\"*.customPng\"/>\n" +
+                "  </mime-type>\n" +
+                "</mime-info>";
+        runner.setProperty(IdentifyMimeType.MIME_CONFIG_BODY, configBody);
+
+        runner.setThreadCount(1);
+        runner.run(fileCount);
+
+
+        runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount);
+
+        final Map<String, String> expectedMimeTypes = new HashMap<>();
+        expectedMimeTypes.put("1.7z", "application/octet-stream");
+        expectedMimeTypes.put("1.mdb", "application/octet-stream");
+        expectedMimeTypes.put("1.txt", "text/plain");
+        expectedMimeTypes.put("1.csv", "text/plain");
+        expectedMimeTypes.put("1.txt.bz2", "application/octet-stream");
+        expectedMimeTypes.put("1.txt.gz", "application/octet-stream");
+        expectedMimeTypes.put("1.zip", "application/octet-stream");
+        expectedMimeTypes.put("bgBannerFoot.png", "image/png");
+        expectedMimeTypes.put("blueBtnBg.jpg", "application/octet-stream");
+        expectedMimeTypes.put("1.pdf", "application/octet-stream");
+        expectedMimeTypes.put("grid.gif", "application/octet-stream");
+        expectedMimeTypes.put("1.tar", "application/octet-stream");
+        expectedMimeTypes.put("1.tar.gz", "application/octet-stream");
+        expectedMimeTypes.put("1.jar", "application/octet-stream");
+        expectedMimeTypes.put("1.xml", "text/plain");
+        expectedMimeTypes.put("flowfilev3", "application/octet-stream");
+        expectedMimeTypes.put("flowfilev1.tar", "application/octet-stream");
+        expectedMimeTypes.put("fake.csv", "text/plain");
+        expectedMimeTypes.put("2.custom", "custom/abcd");
+
+        final Map<String, String> expectedExtensions = new HashMap<>();
+        expectedExtensions.put("1.7z", "");
+        expectedExtensions.put("1.mdb", "");
+        expectedExtensions.put("1.txt", "");
+        expectedExtensions.put("1.csv", "");
+        expectedExtensions.put("1.txt.bz2", "");
+        expectedExtensions.put("1.txt.gz", "");
+        expectedExtensions.put("1.zip", "");
+        expectedExtensions.put("bgBannerFoot.png", ".customPng");
+        expectedExtensions.put("blueBtnBg.jpg", "");
+        expectedExtensions.put("1.pdf", "");
+        expectedExtensions.put("grid.gif", "");
+        expectedExtensions.put("1.tar", "");
+        expectedExtensions.put("1.tar.gz", "");
+        expectedExtensions.put("1.jar", "");
+        expectedExtensions.put("1.xml", "");
+        expectedExtensions.put("flowfilev3", "");
+        expectedExtensions.put("flowfilev1.tar", "");
+        expectedExtensions.put("fake.csv", "");
+        expectedExtensions.put("2.custom", ".abcd");
+
+        final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
+        for (final MockFlowFile file : filesOut) {
+            final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
+            final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
+            final String expected = expectedMimeTypes.get(filename);
+
+            final String extension = file.getAttribute("mime.extension");
+            final String expectedExtension = expectedExtensions.get(filename);
+
+            assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
+            assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
+        }
+    }
+
+    /** With a Config File, the custom my/* types from .customConfig.xml replace the defaults. */
+    @Test
+    public void testConfigFile() throws IOException {
+        final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
+
+
+        final File dir = new File("src/test/resources/TestIdentifyMimeType");
+        final File[] files = dir.listFiles((ldir,name)-> name != null && !name.startsWith("."));
+        int fileCount = 0;
+        for (final File file : files) {
+            if (file.isDirectory()) {
+                continue;
+            }
+
+            runner.enqueue(file.toPath());
+            fileCount++;
+        }
+
+
+        String configFile = "src/test/resources/TestIdentifyMimeType/.customConfig.xml";
+        runner.setProperty(IdentifyMimeType.MIME_CONFIG_FILE, configFile);
+
+        runner.setThreadCount(1);
+        runner.run(fileCount);
+
+
+        runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount);
+
+        final Map<String, String> expectedMimeTypes = new HashMap<>();
+        expectedMimeTypes.put("1.7z", "application/octet-stream");
+        expectedMimeTypes.put("1.mdb", "application/octet-stream");
+        expectedMimeTypes.put("1.txt", "text/plain");
+        expectedMimeTypes.put("1.csv", "text/plain");
+        expectedMimeTypes.put("1.txt.bz2", "application/octet-stream");
+        expectedMimeTypes.put("1.txt.gz", "application/octet-stream");
+        expectedMimeTypes.put("1.zip", "application/octet-stream");
+        expectedMimeTypes.put("bgBannerFoot.png", "my/png");
+        expectedMimeTypes.put("blueBtnBg.jpg", "my/jpeg");
+        expectedMimeTypes.put("1.pdf", "application/octet-stream");
+        expectedMimeTypes.put("grid.gif", "my/gif");
+        expectedMimeTypes.put("1.tar", "application/octet-stream");
+        expectedMimeTypes.put("1.tar.gz", "application/octet-stream");
+        expectedMimeTypes.put("1.jar", "application/octet-stream");
+        expectedMimeTypes.put("1.xml", "text/plain");
+        expectedMimeTypes.put("flowfilev3", "application/octet-stream");
+        expectedMimeTypes.put("flowfilev1.tar", "application/octet-stream");
+        expectedMimeTypes.put("fake.csv", "text/plain");
+        expectedMimeTypes.put("2.custom", "text/plain");
+
+        final Map<String, String> expectedExtensions = new HashMap<>();
+        expectedExtensions.put("1.7z", "");
+        expectedExtensions.put("1.mdb", "");
+        expectedExtensions.put("1.txt", "");
+        expectedExtensions.put("1.csv", "");
+        expectedExtensions.put("1.txt.bz2", "");
+        expectedExtensions.put("1.txt.gz", "");
+        expectedExtensions.put("1.zip", "");
+        expectedExtensions.put("bgBannerFoot.png", ".mypng");
+        expectedExtensions.put("blueBtnBg.jpg", ".myjpg");
+        expectedExtensions.put("1.pdf", "");
+        expectedExtensions.put("grid.gif", ".mygif");
+        expectedExtensions.put("1.tar", "");
+        expectedExtensions.put("1.tar.gz", "");
+        expectedExtensions.put("1.jar", "");
+        expectedExtensions.put("1.xml", "");
+        expectedExtensions.put("flowfilev3", "");
+        expectedExtensions.put("flowfilev1.tar", "");
+        expectedExtensions.put("fake.csv", "");
+        expectedExtensions.put("2.custom", "");
+
+        final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
+        for (final MockFlowFile file : filesOut) {
+            final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
+            final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
+            final String expected = expectedMimeTypes.get(filename);
+
+            final String extension = file.getAttribute("mime.extension");
+            final String expectedExtension = expectedExtensions.get(filename);
+
+            assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
+            assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
+        }
+    }
+
+    /** Setting both Config File and Config Body must leave the processor invalid. */
+    @Test
+    public void testOnlyOneCustomMimeConfigSpecified() throws IOException {
+        final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
+
+        String configFile = "src/test/resources/TestIdentifyMimeType/.customConfig.xml";
+        runner.setProperty(IdentifyMimeType.MIME_CONFIG_FILE, configFile);
+
+        String configBody = "foo";
+        runner.setProperty(IdentifyMimeType.MIME_CONFIG_BODY, configBody);
+
+        // Check the validation result directly instead of expecting run() to throw AssertionError.
+        runner.assertNotValid();
+
+    }
+
 }
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/.customConfig.xml b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/.customConfig.xml
new file mode 100644
index 0000000000..fec5cb89b0
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/.customConfig.xml
@@ -0,0 +1,34 @@
+
+
+
+    GIF
+    <_comment>Graphics Interchange Format
+    http://en.wikipedia.org/wiki/Gif
+    com.compuserve.gif
+
+
+
+
+
+
+
+    PNG
+    <_comment>Portable Network Graphics
+
+
+
+
+
+
+    JPEG
+    <_comment>Joint Photographic Experts Group
+    http://en.wikipedia.org/wiki/Jpeg
+    public.jpeg
+
+
+
+
+
+
+
+
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/2.custom b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/2.custom
new file mode 100644
index 0000000000..acbe86c7c8
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/2.custom
@@ -0,0 +1 @@
+abcd