From 089eec2e0c03d0d3546c36fba716c71f48f66368 Mon Sep 17 00:00:00 2001 From: Adam Lamar Date: Tue, 24 Feb 2015 20:26:13 +0000 Subject: [PATCH] NIFI-296: Use only tika-core to keep jar footprint down --- .../nifi-standard-processors/pom.xml | 6 -- .../processors/standard/IdentifyMimeType.java | 86 ++++-------------- .../org/apache/tika/mime/custom-mimetypes.xml | 83 +++++++++++++++++ .../standard/TestIdentifyMimeType.java | 24 +++++ .../resources/TestIdentifyMimeType/1.tar.gz | Bin 0 -> 154 bytes 5 files changed, 123 insertions(+), 76 deletions(-) create mode 100644 nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml create mode 100755 nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml index b941d033dc..e31f0fa83f 100644 --- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml @@ -159,12 +159,6 @@ org.apache.tika tika-core 1.7 - pom - - - org.apache.tika - tika-parsers - 1.7 diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java index fd3d4aeaf8..68880e67af 100644 --- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java @@ -19,14 +19,9 @@ package org.apache.nifi.processors.standard; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashSet; -import java.util.List; import java.util.Set; -import org.apache.commons.compress.archivers.tar.TarArchiveEntry; -import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.flowfile.attributes.CoreAttributes; @@ -42,15 +37,9 @@ import org.apache.nifi.annotation.behavior.SideEffectFree; import org.apache.nifi.annotation.behavior.SupportsBatching; import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.processor.io.InputStreamCallback; -import org.apache.nifi.stream.io.StreamUtils; -import org.apache.nifi.util.FlowFilePackagerV1; -import org.apache.nifi.util.FlowFilePackagerV3; import org.apache.nifi.util.ObjectHolder; import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.CompositeDetector; -import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; -import org.apache.tika.detect.MagicDetector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -97,16 +86,7 @@ public class IdentifyMimeType extends AbstractProcessor { public IdentifyMimeType() { // Setup Tika this.config = TikaConfig.getDefaultConfig(); - DefaultDetector ddetector = new DefaultDetector(); - - // Create list of detectors, preferring our custom detectors first - List detectors = new ArrayList<>(); - detectors.add(getFlowFileV3Detector()); - detectors.add(getFlowFileV1Detector()); - detectors.addAll(ddetector.getDetectors()); - - CompositeDetector compositeDetector = new CompositeDetector(detectors); - this.detector = compositeDetector; + this.detector = config.getDetector(); } @Override @@ -129,9 +109,8 @@ public class IdentifyMimeType extends AbstractProcessor { } final ProcessorLog logger = getLogger(); - final ObjectHolder mimeTypeRef = new ObjectHolder<>(null); - final ObjectHolder extensionRef = new ObjectHolder<>(null); + session.read(flowFile, new InputStreamCallback() { @Override public void process(final InputStream stream) throws IOException { @@ -141,20 +120,25 @@ public class IdentifyMimeType extends AbstractProcessor { // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); - // Get common file extension - try { - MimeType mimetype; - mimetype = config.getMimeRepository().forName(mediatype.toString()); - extensionRef.set(mimetype.getExtension()); - } catch (MimeTypeException ex) { - logger.warn("MIME type detection failed: {}", new Object[]{ex}); - } } } }); String mimeType = mimeTypeRef.get(); - String extension = extensionRef.get(); + String extension = ""; + try { + MimeType mimetype; + mimetype = config.getMimeRepository().forName(mimeType); + extension = mimetype.getExtension(); + } catch (MimeTypeException ex) { + logger.warn("MIME type extension lookup failed: {}", new Object[]{ex}); + } + + // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563 + if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) { + extension = ".gz"; + } + if (mimeType == null) { flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream"); flowFile = session.putAttribute(flowFile, "mime.extension", ""); @@ -168,42 +152,4 @@ public class IdentifyMimeType extends AbstractProcessor { session.getProvenanceReporter().modifyAttributes(flowFile); session.transfer(flowFile, REL_SUCCESS); } - - private Detector getFlowFileV3Detector() { - return new MagicDetector(FLOWFILE_V3, FlowFilePackagerV3.MAGIC_HEADER); - } - - private Detector getFlowFileV1Detector() { - return new FlowFileV1Detector(); - } - - private class FlowFileV1Detector implements Detector { - - @Override - public MediaType detect(InputStream in, Metadata mtdt) throws IOException { - // Sanity check the stream. This may not be a tarfile at all - in.mark(FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()); - byte[] bytes = new byte[FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()]; - StreamUtils.fillBuffer(in, bytes, false); - in.reset(); - - // Quick exit if the first filename is not correct - if (!Arrays.equals(bytes, FlowFilePackagerV1.FILENAME_ATTRIBUTES.getBytes())) { - return MediaType.OCTET_STREAM; - } - - // More in-depth detection - final TarArchiveInputStream tarIn = new TarArchiveInputStream(in); - final TarArchiveEntry firstEntry = tarIn.getNextTarEntry(); - if (firstEntry != null) { - if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) { - final TarArchiveEntry secondEntry = tarIn.getNextTarEntry(); - if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) { - return FLOWFILE_V1; - } - } - } - return MediaType.OCTET_STREAM; - } - } } diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml new file mode 100644 index 0000000000..657b4b5467 --- /dev/null +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/org/apache/tika/mime/custom-mimetypes.xml @@ -0,0 +1,83 @@ + + + + + <_comment>NiFi FlowFile V1 + + + + + + + + <_comment>NiFi FlowFile V3 + + + + + + + <_comment>Office Open XML Workbook + + + + + + + + + + + + + <_comment>Office Open XML Document + + + + + + + + + + + + + <_comment>Office Open XML Presentation + + + + + + + + + + + + + + <_comment>Java Archive + http://en.wikipedia.org/wiki/.jar + com.sun.java-archive + + + + + + + + + + + + <_comment>GNU tar Compressed File Archive (GNU Tape Archive) + + + + + + + + diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java index 40b03b4909..1bf45853a9 100644 --- a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java +++ b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestIdentifyMimeType.java @@ -67,17 +67,41 @@ public class TestIdentifyMimeType { expectedMimeTypes.put("1.pdf", "application/pdf"); expectedMimeTypes.put("grid.gif", "image/gif"); expectedMimeTypes.put("1.tar", "application/x-tar"); + expectedMimeTypes.put("1.tar.gz", "application/gzip"); expectedMimeTypes.put("1.jar", "application/java-archive"); expectedMimeTypes.put("1.xml", "application/xml"); expectedMimeTypes.put("flowfilev3", "application/flowfile-v3"); expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1"); + final Map expectedExtensions = new HashMap<>(); + expectedExtensions.put("1.7z", ".7z"); + expectedExtensions.put("1.mdb", ".mdb"); + expectedExtensions.put("1.txt", ".txt"); + expectedExtensions.put("1.txt.bz2", ".bz2"); + expectedExtensions.put("1.txt.gz", ".gz"); + expectedExtensions.put("1.zip", ".zip"); + expectedExtensions.put("bgBannerFoot.png", ".png"); + expectedExtensions.put("blueBtnBg.jpg", ".jpg"); + expectedExtensions.put("1.pdf", ".pdf"); + expectedExtensions.put("grid.gif", ".gif"); + expectedExtensions.put("1.tar", ".tar"); + expectedExtensions.put("1.tar.gz", ".gz"); + expectedExtensions.put("1.jar", ".jar"); + expectedExtensions.put("1.xml", ".xml"); + expectedExtensions.put("flowfilev3", ""); + expectedExtensions.put("flowfilev1.tar", ""); + final List filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS); for (final MockFlowFile file : filesOut) { final String filename = file.getAttribute(CoreAttributes.FILENAME.key()); final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key()); final String expected = expectedMimeTypes.get(filename); + + final String extension = file.getAttribute("mime.extension"); + final String expectedExtension = expectedExtensions.get(filename); + assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType); + assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension); } } } diff --git a/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz b/nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestIdentifyMimeType/1.tar.gz new file mode 100755 index 0000000000000000000000000000000000000000..481ccc1b01123688391f6705dee4f2aac4c5c3d0 GIT binary patch literal 154 zcmb2|=HM`J{2ao-Y^Ya~Sj6!5f+61_0|A#n`J?I%yZf5YJ2+0~ZYWqHk{}{I<&KC_nYZ^T!3fcWe6p%ds*bgC8$hLyL6-88jFe0J!!* AmH+?% literal 0 HcmV?d00001