mirror of https://github.com/apache/nifi.git
NIFI-296: Use only tika-core to keep jar footprint down
This commit is contained in:
parent
b418b890a6
commit
089eec2e0c
|
@ -159,12 +159,6 @@
|
|||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-core</artifactId>
|
||||
<version>1.7</version>
|
||||
<type>pom</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-parsers</artifactId>
|
||||
<version>1.7</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
|
|
@ -19,14 +19,9 @@ package org.apache.nifi.processors.standard;
|
|||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||
|
@ -42,15 +37,9 @@ import org.apache.nifi.annotation.behavior.SideEffectFree;
|
|||
import org.apache.nifi.annotation.behavior.SupportsBatching;
|
||||
import org.apache.nifi.annotation.documentation.Tags;
|
||||
import org.apache.nifi.processor.io.InputStreamCallback;
|
||||
import org.apache.nifi.stream.io.StreamUtils;
|
||||
import org.apache.nifi.util.FlowFilePackagerV1;
|
||||
import org.apache.nifi.util.FlowFilePackagerV3;
|
||||
import org.apache.nifi.util.ObjectHolder;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.detect.CompositeDetector;
|
||||
import org.apache.tika.detect.DefaultDetector;
|
||||
import org.apache.tika.detect.Detector;
|
||||
import org.apache.tika.detect.MagicDetector;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
|
@ -97,16 +86,7 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
public IdentifyMimeType() {
|
||||
// Setup Tika
|
||||
this.config = TikaConfig.getDefaultConfig();
|
||||
DefaultDetector ddetector = new DefaultDetector();
|
||||
|
||||
// Create list of detectors, preferring our custom detectors first
|
||||
List<Detector> detectors = new ArrayList<>();
|
||||
detectors.add(getFlowFileV3Detector());
|
||||
detectors.add(getFlowFileV1Detector());
|
||||
detectors.addAll(ddetector.getDetectors());
|
||||
|
||||
CompositeDetector compositeDetector = new CompositeDetector(detectors);
|
||||
this.detector = compositeDetector;
|
||||
this.detector = config.getDetector();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -129,9 +109,8 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
}
|
||||
|
||||
final ProcessorLog logger = getLogger();
|
||||
|
||||
final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null);
|
||||
final ObjectHolder<String> extensionRef = new ObjectHolder<>(null);
|
||||
|
||||
session.read(flowFile, new InputStreamCallback() {
|
||||
@Override
|
||||
public void process(final InputStream stream) throws IOException {
|
||||
|
@ -141,20 +120,25 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
// Get mime type
|
||||
MediaType mediatype = detector.detect(tikaStream, metadata);
|
||||
mimeTypeRef.set(mediatype.toString());
|
||||
// Get common file extension
|
||||
try {
|
||||
MimeType mimetype;
|
||||
mimetype = config.getMimeRepository().forName(mediatype.toString());
|
||||
extensionRef.set(mimetype.getExtension());
|
||||
} catch (MimeTypeException ex) {
|
||||
logger.warn("MIME type detection failed: {}", new Object[]{ex});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
String mimeType = mimeTypeRef.get();
|
||||
String extension = extensionRef.get();
|
||||
String extension = "";
|
||||
try {
|
||||
MimeType mimetype;
|
||||
mimetype = config.getMimeRepository().forName(mimeType);
|
||||
extension = mimetype.getExtension();
|
||||
} catch (MimeTypeException ex) {
|
||||
logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
|
||||
}
|
||||
|
||||
// Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
|
||||
if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
|
||||
extension = ".gz";
|
||||
}
|
||||
|
||||
if (mimeType == null) {
|
||||
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
|
||||
flowFile = session.putAttribute(flowFile, "mime.extension", "");
|
||||
|
@ -168,42 +152,4 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
session.getProvenanceReporter().modifyAttributes(flowFile);
|
||||
session.transfer(flowFile, REL_SUCCESS);
|
||||
}
|
||||
|
||||
private Detector getFlowFileV3Detector() {
|
||||
return new MagicDetector(FLOWFILE_V3, FlowFilePackagerV3.MAGIC_HEADER);
|
||||
}
|
||||
|
||||
private Detector getFlowFileV1Detector() {
|
||||
return new FlowFileV1Detector();
|
||||
}
|
||||
|
||||
private class FlowFileV1Detector implements Detector {
|
||||
|
||||
@Override
|
||||
public MediaType detect(InputStream in, Metadata mtdt) throws IOException {
|
||||
// Sanity check the stream. This may not be a tarfile at all
|
||||
in.mark(FlowFilePackagerV1.FILENAME_ATTRIBUTES.length());
|
||||
byte[] bytes = new byte[FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()];
|
||||
StreamUtils.fillBuffer(in, bytes, false);
|
||||
in.reset();
|
||||
|
||||
// Quick exit if the first filename is not correct
|
||||
if (!Arrays.equals(bytes, FlowFilePackagerV1.FILENAME_ATTRIBUTES.getBytes())) {
|
||||
return MediaType.OCTET_STREAM;
|
||||
}
|
||||
|
||||
// More in-depth detection
|
||||
final TarArchiveInputStream tarIn = new TarArchiveInputStream(in);
|
||||
final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
|
||||
if (firstEntry != null) {
|
||||
if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
|
||||
final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
|
||||
if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
|
||||
return FLOWFILE_V1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return MediaType.OCTET_STREAM;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<mime-info>
|
||||
|
||||
<mime-type type="application/flowfile-v1">
|
||||
<_comment>NiFi FlowFile V1</_comment>
|
||||
<sub-class-of type="application/x-tar"/>
|
||||
<magic>
|
||||
<match value="flowfile.attributes" type="string" offset="0" />
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/flowfile-v3">
|
||||
<_comment>NiFi FlowFile V3</_comment>
|
||||
<magic>
|
||||
<match value="NiFiFF3" type="string" offset="0" />
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet">
|
||||
<_comment>Office Open XML Workbook</_comment>
|
||||
<glob pattern="*.xlsx"/>
|
||||
<sub-class-of type="application/x-tika-ooxml"/>
|
||||
<magic priority="60">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="[Content_Types].xml" type="string" offset="30">
|
||||
<match value="xl/_rels/workbook.xml.rels" type="string" offset="30:4096"/>
|
||||
</match>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document">
|
||||
<_comment>Office Open XML Document</_comment>
|
||||
<glob pattern="*.docx"/>
|
||||
<sub-class-of type="application/x-tika-ooxml"/>
|
||||
<magic priority="60">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="[Content_Types].xml" type="string" offset="30">
|
||||
<match value="word/_rels/document.xml.rels" type="string" offset="30:4096"/>
|
||||
</match>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
|
||||
<_comment>Office Open XML Presentation</_comment>
|
||||
<glob pattern="*.pptx"/>
|
||||
<glob pattern="*.thmx"/>
|
||||
<sub-class-of type="application/x-tika-ooxml"/>
|
||||
<magic priority="60">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="[Content_Types].xml" type="string" offset="30">
|
||||
<match value="ppt/slides/_rels/slide" type="string" offset="30:4096"/>
|
||||
</match>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/java-archive">
|
||||
<_comment>Java Archive</_comment>
|
||||
<tika:link>http://en.wikipedia.org/wiki/.jar</tika:link>
|
||||
<tika:uti>com.sun.java-archive</tika:uti>
|
||||
<sub-class-of type="application/zip"/>
|
||||
<glob pattern="*.jar"/>
|
||||
<magic priority="50">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="META-INF/MANIFEST.MF" type="string" offset="0:1024"/>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<!-- Override tika's default behavior for GNU tar detection because nobody calls
|
||||
a GNU tar a .gtar -->
|
||||
<mime-type type="application/x-tar">
|
||||
<_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
|
||||
<magic priority="60">
|
||||
<!-- GNU tar archive -->
|
||||
<match value="ustar \0" type="string" offset="257" />
|
||||
</magic>
|
||||
<glob pattern="*.tar"/>
|
||||
</mime-type>
|
||||
|
||||
</mime-info>
|
|
@ -67,17 +67,41 @@ public class TestIdentifyMimeType {
|
|||
expectedMimeTypes.put("1.pdf", "application/pdf");
|
||||
expectedMimeTypes.put("grid.gif", "image/gif");
|
||||
expectedMimeTypes.put("1.tar", "application/x-tar");
|
||||
expectedMimeTypes.put("1.tar.gz", "application/gzip");
|
||||
expectedMimeTypes.put("1.jar", "application/java-archive");
|
||||
expectedMimeTypes.put("1.xml", "application/xml");
|
||||
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
||||
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
|
||||
|
||||
final Map<String, String> expectedExtensions = new HashMap<>();
|
||||
expectedExtensions.put("1.7z", ".7z");
|
||||
expectedExtensions.put("1.mdb", ".mdb");
|
||||
expectedExtensions.put("1.txt", ".txt");
|
||||
expectedExtensions.put("1.txt.bz2", ".bz2");
|
||||
expectedExtensions.put("1.txt.gz", ".gz");
|
||||
expectedExtensions.put("1.zip", ".zip");
|
||||
expectedExtensions.put("bgBannerFoot.png", ".png");
|
||||
expectedExtensions.put("blueBtnBg.jpg", ".jpg");
|
||||
expectedExtensions.put("1.pdf", ".pdf");
|
||||
expectedExtensions.put("grid.gif", ".gif");
|
||||
expectedExtensions.put("1.tar", ".tar");
|
||||
expectedExtensions.put("1.tar.gz", ".gz");
|
||||
expectedExtensions.put("1.jar", ".jar");
|
||||
expectedExtensions.put("1.xml", ".xml");
|
||||
expectedExtensions.put("flowfilev3", "");
|
||||
expectedExtensions.put("flowfilev1.tar", "");
|
||||
|
||||
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
|
||||
for (final MockFlowFile file : filesOut) {
|
||||
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
|
||||
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
|
||||
final String expected = expectedMimeTypes.get(filename);
|
||||
|
||||
final String extension = file.getAttribute("mime.extension");
|
||||
final String expectedExtension = expectedExtensions.get(filename);
|
||||
|
||||
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
|
||||
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue