NIFI-296: Use only tika-core to keep jar footprint down

This commit is contained in:
Adam Lamar 2015-02-24 20:26:13 +00:00
parent b418b890a6
commit 089eec2e0c
5 changed files with 123 additions and 76 deletions

View File

@ -159,12 +159,6 @@
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.7</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.7</version>
</dependency>
</dependencies>
</project>

View File

@ -19,14 +19,9 @@ package org.apache.nifi.processors.standard;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
@ -42,15 +37,9 @@ import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.behavior.SupportsBatching;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.stream.io.StreamUtils;
import org.apache.nifi.util.FlowFilePackagerV1;
import org.apache.nifi.util.FlowFilePackagerV3;
import org.apache.nifi.util.ObjectHolder;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.MagicDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@ -97,16 +86,7 @@ public class IdentifyMimeType extends AbstractProcessor {
/**
 * Creates the processor and initializes the Tika configuration and detector it uses.
 *
 * NOTE(review): this span appears to interleave the old and new sides of a diff. It first
 * builds a CompositeDetector from the two FlowFile detectors plus the DefaultDetector's
 * detectors, and then assigns this.detector a second time from the Tika configuration.
 * Confirm which of the two assignments is meant to remain.
 */
public IdentifyMimeType() {
// Setup Tika
this.config = TikaConfig.getDefaultConfig();
DefaultDetector ddetector = new DefaultDetector();
// Create list of detectors, preferring our custom detectors first
List<Detector> detectors = new ArrayList<>();
detectors.add(getFlowFileV3Detector());
detectors.add(getFlowFileV1Detector());
detectors.addAll(ddetector.getDetectors());
CompositeDetector compositeDetector = new CompositeDetector(detectors);
this.detector = compositeDetector;
// This second assignment replaces the composite detector assigned on the line above.
this.detector = config.getDetector();
}
@Override
@ -129,9 +109,8 @@ public class IdentifyMimeType extends AbstractProcessor {
}
final ProcessorLog logger = getLogger();
final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null);
final ObjectHolder<String> extensionRef = new ObjectHolder<>(null);
session.read(flowFile, new InputStreamCallback() {
@Override
public void process(final InputStream stream) throws IOException {
@ -141,20 +120,25 @@ public class IdentifyMimeType extends AbstractProcessor {
// Get mime type
MediaType mediatype = detector.detect(tikaStream, metadata);
mimeTypeRef.set(mediatype.toString());
// Get common file extension
try {
MimeType mimetype;
mimetype = config.getMimeRepository().forName(mediatype.toString());
extensionRef.set(mimetype.getExtension());
} catch (MimeTypeException ex) {
logger.warn("MIME type detection failed: {}", new Object[]{ex});
}
}
}
});
String mimeType = mimeTypeRef.get();
String extension = extensionRef.get();
String extension = "";
try {
MimeType mimetype;
mimetype = config.getMimeRepository().forName(mimeType);
extension = mimetype.getExtension();
} catch (MimeTypeException ex) {
logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
}
// Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
extension = ".gz";
}
if (mimeType == null) {
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
flowFile = session.putAttribute(flowFile, "mime.extension", "");
@ -168,42 +152,4 @@ public class IdentifyMimeType extends AbstractProcessor {
session.getProvenanceReporter().modifyAttributes(flowFile);
session.transfer(flowFile, REL_SUCCESS);
}
/**
 * Creates the detector used for the FlowFile V3 format.
 *
 * @return a MagicDetector built from FLOWFILE_V3 and FlowFilePackagerV3.MAGIC_HEADER
 */
private Detector getFlowFileV3Detector() {
    final Detector v3Detector = new MagicDetector(FLOWFILE_V3, FlowFilePackagerV3.MAGIC_HEADER);
    return v3Detector;
}
/**
 * Creates the detector used for the FlowFile V1 format.
 *
 * @return a new FlowFileV1Detector instance
 */
private Detector getFlowFileV1Detector() {
    final Detector v1Detector = new FlowFileV1Detector();
    return v1Detector;
}
/**
 * Detector for the FlowFile V1 package format.
 *
 * A stream is reported as FLOWFILE_V1 when it is a tar archive whose first entry is named
 * FlowFilePackagerV1.FILENAME_ATTRIBUTES and whose second entry is named
 * FlowFilePackagerV1.FILENAME_CONTENT. Anything else is reported as MediaType.OCTET_STREAM.
 */
private class FlowFileV1Detector implements Detector {

    /**
     * Inspects the start of the stream and, if it looks promising, the first two tar entries.
     *
     * @param in the stream to inspect; mark/reset is used on it, so it must support marks
     * @param mtdt detection metadata (not consulted by this detector)
     * @return FLOWFILE_V1 on a match, otherwise MediaType.OCTET_STREAM
     * @throws IOException if reading the stream or parsing the tar headers fails
     */
    @Override
    public MediaType detect(InputStream in, Metadata mtdt) throws IOException {
        // Encode the expected name once with an explicit charset so the comparison does not
        // depend on the platform default encoding, and size the mark limit and the buffer
        // from the encoded byte length rather than the string's char count.
        final byte[] expectedName =
                FlowFilePackagerV1.FILENAME_ATTRIBUTES.getBytes(java.nio.charset.StandardCharsets.UTF_8);

        // Sanity check the stream. This may not be a tarfile at all
        in.mark(expectedName.length);
        final byte[] leadingBytes = new byte[expectedName.length];
        StreamUtils.fillBuffer(in, leadingBytes, false);
        in.reset();

        // Quick exit if the first filename is not correct
        if (!Arrays.equals(leadingBytes, expectedName)) {
            return MediaType.OCTET_STREAM;
        }

        // More in-depth detection.
        // The tar stream is not closed here: closing it would also close the caller's stream.
        // NOTE(review): the stream position is not restored after the tar headers are read;
        // confirm whether detectors that run after this one rely on it being reset.
        final TarArchiveInputStream tarIn = new TarArchiveInputStream(in);
        final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
        if (firstEntry != null && firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
            final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
            if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
                return FLOWFILE_V1;
            }
        }
        return MediaType.OCTET_STREAM;
    }
}
}

View File

@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<mime-info>
<!-- NiFi FlowFile V1 package: declared as a sub-class of application/x-tar and matched when
the content begins (offset 0) with the string "flowfile.attributes". -->
<mime-type type="application/flowfile-v1">
<_comment>NiFi FlowFile V1</_comment>
<sub-class-of type="application/x-tar"/>
<magic>
<match value="flowfile.attributes" type="string" offset="0" />
</magic>
</mime-type>
<!-- NiFi FlowFile V3 package: matched when the content begins (offset 0) with the string
"NiFiFF3". No glob pattern is declared for this type. -->
<mime-type type="application/flowfile-v3">
<_comment>NiFi FlowFile V3</_comment>
<magic>
<match value="NiFiFF3" type="string" offset="0" />
</magic>
</mime-type>
<mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet">
<_comment>Office Open XML Workbook</_comment>
<glob pattern="*.xlsx"/>
<sub-class-of type="application/x-tika-ooxml"/>
<magic priority="60">
<match value="PK\003\004" type="string" offset="0">
<match value="[Content_Types].xml" type="string" offset="30">
<match value="xl/_rels/workbook.xml.rels" type="string" offset="30:4096"/>
</match>
</match>
</magic>
</mime-type>
<mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document">
<_comment>Office Open XML Document</_comment>
<glob pattern="*.docx"/>
<sub-class-of type="application/x-tika-ooxml"/>
<magic priority="60">
<match value="PK\003\004" type="string" offset="0">
<match value="[Content_Types].xml" type="string" offset="30">
<match value="word/_rels/document.xml.rels" type="string" offset="30:4096"/>
</match>
</match>
</magic>
</mime-type>
<mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
<_comment>Office Open XML Presentation</_comment>
<glob pattern="*.pptx"/>
<glob pattern="*.thmx"/>
<sub-class-of type="application/x-tika-ooxml"/>
<magic priority="60">
<match value="PK\003\004" type="string" offset="0">
<match value="[Content_Types].xml" type="string" offset="30">
<match value="ppt/slides/_rels/slide" type="string" offset="30:4096"/>
</match>
</match>
</magic>
</mime-type>
<mime-type type="application/java-archive">
<_comment>Java Archive</_comment>
<tika:link>http://en.wikipedia.org/wiki/.jar</tika:link>
<tika:uti>com.sun.java-archive</tika:uti>
<sub-class-of type="application/zip"/>
<glob pattern="*.jar"/>
<magic priority="50">
<match value="PK\003\004" type="string" offset="0">
<match value="META-INF/MANIFEST.MF" type="string" offset="0:1024"/>
</match>
</magic>
</mime-type>
<!-- Override Tika's default detection of GNU tar archives, because GNU tar files
are not normally given a .gtar extension; use the *.tar glob instead. -->
<mime-type type="application/x-tar">
<_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
<magic priority="60">
<!-- GNU tar archive -->
<match value="ustar \0" type="string" offset="257" />
</magic>
<glob pattern="*.tar"/>
</mime-type>
</mime-info>

View File

@ -67,17 +67,41 @@ public class TestIdentifyMimeType {
expectedMimeTypes.put("1.pdf", "application/pdf");
expectedMimeTypes.put("grid.gif", "image/gif");
expectedMimeTypes.put("1.tar", "application/x-tar");
expectedMimeTypes.put("1.tar.gz", "application/gzip");
expectedMimeTypes.put("1.jar", "application/java-archive");
expectedMimeTypes.put("1.xml", "application/xml");
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
final Map<String, String> expectedExtensions = new HashMap<>();
expectedExtensions.put("1.7z", ".7z");
expectedExtensions.put("1.mdb", ".mdb");
expectedExtensions.put("1.txt", ".txt");
expectedExtensions.put("1.txt.bz2", ".bz2");
expectedExtensions.put("1.txt.gz", ".gz");
expectedExtensions.put("1.zip", ".zip");
expectedExtensions.put("bgBannerFoot.png", ".png");
expectedExtensions.put("blueBtnBg.jpg", ".jpg");
expectedExtensions.put("1.pdf", ".pdf");
expectedExtensions.put("grid.gif", ".gif");
expectedExtensions.put("1.tar", ".tar");
expectedExtensions.put("1.tar.gz", ".gz");
expectedExtensions.put("1.jar", ".jar");
expectedExtensions.put("1.xml", ".xml");
expectedExtensions.put("flowfilev3", "");
expectedExtensions.put("flowfilev1.tar", "");
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
for (final MockFlowFile file : filesOut) {
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
final String expected = expectedMimeTypes.get(filename);
final String extension = file.getAttribute("mime.extension");
final String expectedExtension = expectedExtensions.get(filename);
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
}
}
}