Merge branch 'NIFI-296-extend-IdentifyMimeType' of https://github.com/adamonduty/incubator-nifi into develop

This commit is contained in:
Mark Payne 2015-03-02 10:12:28 -05:00
commit ee795a7d01
10 changed files with 171 additions and 417 deletions

View File

@ -163,5 +163,10 @@
<artifactId>nifi-ssl-context-service</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.7</version>
</dependency>
</dependencies>
</project>

View File

@ -19,14 +19,10 @@ package org.apache.nifi.processors.standard;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipInputStream;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ProcessorLog;
@ -41,53 +37,29 @@ import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.behavior.SupportsBatching;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.util.FlowFilePackagerV1;
import org.apache.nifi.util.FlowFilePackagerV3;
import org.apache.nifi.util.ObjectHolder;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
/**
* <p>
* Attempts to detect the MIME Type of a FlowFile by examining its contents. If
* the MIME Type is determined, it is added to an attribute with the name
* mime.type
* mime.type. In addition, mime.extension is set if a common file extension is known.
* </p>
*
* <p>
* The following MIME Types are supported:
* MIME Type detection is performed by Apache Tika; more information about
* detection is available at http://tika.apache.org.
*
* <ul>
* <li>application/gzip</li>
* <li>application/bzip2</li>
* <li>application/flowfile-v3</li>
* <li>application/flowfile-v1 (requires Identify TAR be set to true)</li>
* <li>application/xml</li>
* <li>video/mp4</li>
* <li>video/x-m4v</li>
* <li>video/mp4a-latm</li>
* <li>video/quicktime</li>
* <li>video/mpeg</li>
* <li>audio/wav</li>
* <li>audio/mp3</li>
* <li>image/bmp</li>
* <li>image/png</li>
* <li>image/jpg</li>
* <li>image/gif</li>
* <li>image/tif</li>
* <li>application/vnd.ms-works</li>
* <li>application/msexcel</li>
* <li>application/mspowerpoint</li>
* <li>application/msaccess</li>
* <li>application/x-ms-wmv</li>
* <li>application/pdf</li>
* <li>application/x-rpm</li>
* <li>application/tar</li>
* <li>application/x-7z-compressed</li>
* <li>application/java-archive</li>
* <li>application/zip</li>
* <li>application/x-lzh</li>
* <li>application/flowfile-v1</li>
* </ul>
* </p>
*/
@ -97,128 +69,31 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
@Tags({"compression", "gzip", "bzip2", "zip", "MIME", "mime.type", "file", "identify"})
@CapabilityDescription("Attempts to identify the MIME Type used for a FlowFile. If the MIME Type can be identified, "
+ "an attribute with the name 'mime.type' is added with the value being the MIME Type. If the MIME Type cannot be determined, "
+ "the value will be set to 'application/octet-stream'. Some MIME Types require reading a significant amount of data; for these MIME Types, their identification "
+ "is optional. The algorithm may have to read the entire contents of the file for each type of identification.")
+ "the value will be set to 'application/octet-stream'. In addition, the attribute mime.extension will be set if a common file "
+ "extension for the MIME Type is known.")
public class IdentifyMimeType extends AbstractProcessor {
/**
 * Required true/false property (default "false") that selects how ZIP content is identified.
 * ZipIdentifier.isEnabled returns this property's boolean value, so the content scan of the
 * archive only runs when it is "true"; the header-only ZIP magic check in the stream callback
 * only runs when it is "false".
 */
public static final PropertyDescriptor IDENTIFY_ZIP = new PropertyDescriptor.Builder()
.name("Identify ZIP")
.description("Determines whether or not to attempt in depth identification of ZIP MIME types")
.required(true)
.allowableValues("true", "false")
.defaultValue("false")
.build();
/**
 * Required true/false property (default "false") that selects how TAR content is identified.
 * TarIdentifier.isEnabled returns this property's boolean value, so the content scan of the
 * archive (which can also report application/flowfile-v1) only runs when it is "true"; the
 * header-only TAR magic check in the stream callback only runs when it is "false".
 */
public static final PropertyDescriptor IDENTIFY_TAR = new PropertyDescriptor.Builder()
.name("Identify TAR")
.description("Determines whether or not to attempt in depth identification of TAR MIME types")
.required(true)
.allowableValues("true", "false")
.defaultValue("false")
.build();
public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success").description("All FlowFiles are routed to success").build();
private Set<Relationship> relationships;
private List<PropertyDescriptor> properties;
public static final MediaType FLOWFILE_V1 = new MediaType("application", "flowfile-v1");
public static final MediaType FLOWFILE_V3 = new MediaType("application", "flowfile-v3");
private final List<MagicHeader> magicHeaders;
private final List<MagicHeader> zipMagicHeaders;
private final List<MagicHeader> tarMagicHeaders;
private final List<ContentScanningMimeTypeIdentifier> contentScanners;
private final int magicHeaderMaxLength;
private Set<Relationship> relationships;
private final TikaConfig config;
private final Detector detector;
/**
 * Builds every lookup structure used for detection: the always-consulted magic header list,
 * the separate ZIP and TAR magic header lists, the largest number of leading bytes any of
 * those headers needs, the list of content scanners, and the Tika configuration and detector.
 * Headers are consulted in the order they are added here, and the first match wins.
 */
public IdentifyMimeType() {
// compile a list of Magic Header detectors
final List<MagicHeader> headers = new ArrayList<>();
// Two-argument form: the byte sequence must appear at offset 0.
headers.add(new SimpleMagicHeader("application/gzip", new byte[]{0x1f, (byte) 0x8b}));
headers.add(new SimpleMagicHeader("application/bzip2", new byte[]{0x42, 0x5a}));
headers.add(new SimpleMagicHeader("application/flowfile-v3", FlowFilePackagerV3.MAGIC_HEADER));
headers.add(new SimpleMagicHeader("application/xml", new byte[]{0x3c, 0x3f, 0x78, 0x6d, 0x6c, 0x20}));
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6F, 0x6D}));
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x33, 0x67, 0x70, 0x35}));
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0X4d, 0X53, 0X4e, 0X56, 0X01, 0X29, 0, 0X46, 0X4d, 0X53, 0X4e, 0X56, 0X6d, 0X70, 0X34, 0X32}));
headers.add(new SimpleMagicHeader("video/x-m4v", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x6D, 0x70, 0x34, 0x32}));
headers.add(new SimpleMagicHeader("video/mp4a-latm", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x4D, 0x34, 0x41, 0x20}));
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x71, 0x74, 0x20, 0x20}));
// Three-argument form: the last argument is the offset at which the sequence must appear.
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0x6D, 0x6F, 0x6F, 0x76}, 4));
headers.add(new SimpleMagicHeader("audio/mp3", new byte[]{0x49, 0x44, 0x33}));
headers.add(new SimpleMagicHeader("image/bmp", new byte[]{0x42, 0x4D}));
headers.add(new SimpleMagicHeader("image/png", new byte[]{(byte) 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}));
headers.add(new SimpleMagicHeader("image/jpg", new byte[]{(byte) 0xFF, (byte) 0xD8, (byte) 0xFF}));
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x37, 0x61}));
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x39, 0x61}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x20, 0x49}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x49, 0x2A, 0x00}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2A}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2B}));
headers.add(new SimpleMagicHeader("application/vnd.ms-works", new byte[]{(byte) 0xFF, 0x00, 0x02, 0x00, 0x04, 0x04, 0x05, 0x54, 0x02, 0x00}));
headers.add(new SimpleMagicHeader("application/msexcel", new byte[]{0x09, 0x08, 0x10, 0, 0, 0x06, 0x05, 0}, 512));
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x00, 0x6E, 0x1E, (byte) 0xF0}, 512));
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x0F, 0x00, (byte) 0xE8, 0x03}, 512));
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{(byte) 0xA0, 0x46, 0x1D, (byte) 0xF0}, 512));
// A compound header matches only when every one of its parts matches; the parts' own
// MIME type strings are left empty because only the compound's type is reported.
headers.add(new CompoundMagicHeader("application/mspowerpoint",
new SimpleMagicHeader("", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}, 512),
new SimpleMagicHeader("", new byte[]{0x00, 0x00, 0x00}, 517)));
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x41, 0x43, 0x45, 0x20, 0x44, 0x42}));
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x4A, 0x65, 0x74, 0x20, 0x44, 0x42}));
// For each listed byte, register two signatures at offset 512 that differ only in the
// final byte (0x00 and 0x02).
for (byte b : new byte[]{0x10, 0x1F, 0x22, 0x23, 0x28, 0x29}) {
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x00}, 512));
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x02}, 512));
}
headers.add(new SimpleMagicHeader("application/x-ms-wmv", new byte[]{0x30, 0x26, (byte) 0xB2, 0x75, (byte) 0x8E, 0x66, (byte) 0xCF, 0x11, (byte) 0xA6, (byte) 0xD9, 0x00, (byte) 0xAA, 0x00, 0x62, (byte) 0xCE, 0x6C}));
headers.add(new SimpleMagicHeader("application/pdf", new byte[]{0x25, 0x50, 0x44, 0x46}));
headers.add(new SimpleMagicHeader("application/x-rpm", new byte[]{(byte) 0xED, (byte) 0xAB, (byte) 0xEE, (byte) 0xDB}));
headers.add(new SimpleMagicHeader("application/x-7z-compressed", new byte[]{0x37, 0x7A, (byte) 0xBC, (byte) 0xAF, 0x27, 0x1C}));
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x4A, 0x41, 0x52, 0x43, 0x53, 0x00}));
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x08}));
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, (byte) 0xA0, 0x00, 0x00}));
headers.add(new SimpleMagicHeader("application/x-lzh", new byte[]{0x2D, 0x6C, 0x68}, 2));
headers.add(new CompoundMagicHeader("audio/wav",
new SimpleMagicHeader("", new byte[]{0x52, 0x49, 0x46, 0x46}),
new SimpleMagicHeader("", new byte[]{0x57, 0x41, 0x56, 0x45, 0x66, 0x6D, 0x74, 0x20}, 8)));
// One signature per value 0xB0..0xBF in the fourth byte.
for (int nibble = 0xB0; nibble <= 0xBF; nibble++) {
headers.add(new SimpleMagicHeader("video/mpeg", new byte[]{0x00, 0x00, 0x01, (byte) nibble}));
}
this.magicHeaders = Collections.unmodifiableList(headers);
// additional Magic Header detectors that will be turned off based on property settings
final List<MagicHeader> zipHeaders = new ArrayList<>();
zipHeaders.add(new SimpleMagicHeader("application/zip", new byte[]{0x50, 0x4B, 0x03, 0x04}));
this.zipMagicHeaders = Collections.unmodifiableList(zipHeaders);
final List<MagicHeader> tarHeaders = new ArrayList<>();
tarHeaders.add(new SimpleMagicHeader("application/tar", new byte[]{0x75, 0x73, 0x74, 0x61, 0x72}, 257));
this.tarMagicHeaders = Collections.unmodifiableList(tarHeaders);
// determine the max length that we need to buffer for magic headers
// (all three lists are included so one buffered read can serve every header check)
int max = 0;
for (final MagicHeader header : magicHeaders) {
max = Math.max(max, header.getRequiredBufferLength());
}
for (final MagicHeader header : zipMagicHeaders) {
max = Math.max(max, header.getRequiredBufferLength());
}
for (final MagicHeader header : tarMagicHeaders) {
max = Math.max(max, header.getRequiredBufferLength());
}
this.magicHeaderMaxLength = max;
// create list of Content Scanners
final List<ContentScanningMimeTypeIdentifier> scanningIdentifiers = new ArrayList<>();
scanningIdentifiers.add(new ZipIdentifier());
scanningIdentifiers.add(new TarIdentifier());
this.contentScanners = Collections.unmodifiableList(scanningIdentifiers);
// Setup Tika
this.config = TikaConfig.getDefaultConfig();
this.detector = config.getDetector();
}
/**
 * Builds the unmodifiable relationship set and property list that this processor exposes.
 *
 * @param context initialization context supplied by the framework; not read by this method
 */
@Override
protected void init(final ProcessorInitializationContext context) {
    // REL_SUCCESS is the only relationship; build the set once.
    final Set<Relationship> rels = new HashSet<>();
    rels.add(REL_SUCCESS);
    this.relationships = Collections.unmodifiableSet(rels);

    // Property order here is the order in which the descriptors are returned to callers.
    final List<PropertyDescriptor> descriptors = new ArrayList<>();
    descriptors.add(IDENTIFY_ZIP);
    descriptors.add(IDENTIFY_TAR);
    this.properties = Collections.unmodifiableList(descriptors);
}
@Override
@ -226,11 +101,6 @@ public class IdentifyMimeType extends AbstractProcessor {
return relationships;
}
/**
 * Returns the property descriptors supported by this processor.
 *
 * @return the list held in the properties field, which init assigns as an unmodifiable list
 *         containing IDENTIFY_ZIP and IDENTIFY_TAR
 */
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return properties;
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
FlowFile flowFile = session.get();
@ -239,236 +109,47 @@ public class IdentifyMimeType extends AbstractProcessor {
}
final ProcessorLog logger = getLogger();
final boolean identifyZip = context.getProperty(IDENTIFY_ZIP).asBoolean();
final boolean identifyTar = context.getProperty(IDENTIFY_TAR).asBoolean();
final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null);
session.read(flowFile, new InputStreamCallback() {
/**
 * Buffers up to magicHeaderMaxLength leading bytes of the content, rewinds the stream, and
 * tries the magic header lists in order (general, then ZIP when identifyZip is false, then
 * TAR when identifyTar is false). The first matching header's MIME type is stored in
 * mimeTypeRef and the method returns. When no header matches, the rewound stream is handed
 * to the Tika detector and its result is stored instead.
 *
 * @param stream the FlowFile content
 * @throws IOException if reading, rewinding or detecting fails
 */
@Override
public void process(final InputStream stream) throws IOException {
    try (final InputStream in = new BufferedInputStream(stream)) {
        // read in up to magicHeaderMaxLength bytes
        in.mark(magicHeaderMaxLength);
        byte[] header = new byte[magicHeaderMaxLength];
        int filled = 0;
        while (filled < header.length) {
            final int count = in.read(header, filled, header.length - filled);
            if (count < 0) {
                break; // content is shorter than the longest magic header
            }
            filled += count;
        }
        if (filled < header.length) {
            // Keep exactly the bytes that were read. Truncating to one byte fewer would
            // drop the final byte and stop short files from matching their signature.
            final byte[] trimmed = new byte[filled];
            System.arraycopy(header, 0, trimmed, 0, filled);
            header = trimmed;
        }
        // rewind so that any later consumer sees the content from the start
        in.reset();

        for (final MagicHeader magicHeader : magicHeaders) {
            if (magicHeader.matches(header)) {
                mimeTypeRef.set(magicHeader.getMimeType());
                return;
            }
        }
        // The header-only ZIP/TAR checks apply only when the corresponding in-depth
        // identification property is switched off.
        if (!identifyZip) {
            for (final MagicHeader magicHeader : zipMagicHeaders) {
                if (magicHeader.matches(header)) {
                    mimeTypeRef.set(magicHeader.getMimeType());
                    return;
                }
            }
        }
        if (!identifyTar) {
            for (final MagicHeader magicHeader : tarMagicHeaders) {
                if (magicHeader.matches(header)) {
                    mimeTypeRef.set(magicHeader.getMimeType());
                    return;
                }
            }
        }
        TikaInputStream tikaStream = TikaInputStream.get(in);
        Metadata metadata = new Metadata();
        // Get mime type
        MediaType mediatype = detector.detect(tikaStream, metadata);
        mimeTypeRef.set(mediatype.toString());
    }
}
});
String mimeType = mimeTypeRef.get();
if (mimeType == null) {
for (final ContentScanningMimeTypeIdentifier scanningIdentifier : this.contentScanners) {
if (scanningIdentifier.isEnabled(context)) {
session.read(flowFile, new InputStreamCallback() {
/**
 * Runs the current content scanner over the FlowFile content and records its answer in
 * mimeTypeRef, leaving the reference untouched when the scanner returns null.
 */
@Override
public void process(final InputStream in) throws IOException {
    final String scanned = scanningIdentifier.getMimeType(in);
    if (scanned == null) {
        return;
    }
    mimeTypeRef.set(scanned);
}
});
if (mimeTypeRef.get() != null) {
break;
}
}
}
String extension = "";
try {
MimeType mimetype;
mimetype = config.getMimeRepository().forName(mimeType);
extension = mimetype.getExtension();
} catch (MimeTypeException ex) {
logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
}
// Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
extension = ".gz";
}
mimeType = mimeTypeRef.get();
if (mimeType == null) {
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
flowFile = session.putAttribute(flowFile, "mime.extension", "");
logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile});
} else {
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
flowFile = session.putAttribute(flowFile, "mime.extension", extension);
logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType});
}
session.getProvenanceReporter().modifyAttributes(flowFile);
session.transfer(flowFile, REL_SUCCESS);
}
private static interface ContentScanningMimeTypeIdentifier {
boolean isEnabled(ProcessContext context);
String getMimeType(InputStream in) throws IOException;
}
/**
 * Content scanner that reports application/zip when the stream can be opened as a ZIP archive
 * containing at least one entry. Enabled by the IDENTIFY_ZIP property.
 */
private static class ZipIdentifier implements ContentScanningMimeTypeIdentifier {

    /**
     * @param in the content to inspect; it is closed when this method returns, in the same
     *           way TarIdentifier closes the stream it is given
     * @return "application/zip" if a first ZIP entry could be read, otherwise null
     * @throws IOException declared by the interface; read failures are treated as "not a ZIP"
     */
    @Override
    public String getMimeType(final InputStream in) throws IOException {
        String detected = null;
        // try-with-resources releases the native Inflater held by the ZipInputStream.
        try (final ZipInputStream zipIn = new ZipInputStream(in)) {
            if (zipIn.getNextEntry() != null) {
                detected = "application/zip";
            }
        } catch (final Exception ignored) {
            // Best-effort detection: content that cannot be parsed as ZIP is simply not ZIP.
            // A failure while closing must not discard a type that was already detected,
            // which is why the result is held in a variable instead of returned from the try.
        }
        return detected;
    }

    /**
     * @param context the context supplying the IDENTIFY_ZIP property value
     * @return the boolean value of the IDENTIFY_ZIP property
     */
    @Override
    public boolean isEnabled(final ProcessContext context) {
        return context.getProperty(IDENTIFY_ZIP).asBoolean();
    }
}
/**
 * Content scanner for TAR archives. Distinguishes a plain TAR from a FlowFile V1 package by
 * looking at the names of the first two entries. Enabled by the IDENTIFY_TAR property.
 */
private static class TarIdentifier implements ContentScanningMimeTypeIdentifier {

    /**
     * @param context the context supplying the IDENTIFY_TAR property value
     * @return the boolean value of the IDENTIFY_TAR property
     */
    @Override
    public boolean isEnabled(final ProcessContext context) {
        return context.getProperty(IDENTIFY_TAR).asBoolean();
    }

    /**
     * @param in the content to inspect; closed when this method returns
     * @return "application/flowfile-v1" when the first two entries are the FlowFile V1
     *         attributes and content files, "application/tar" when at least one entry exists,
     *         and null when there is no entry or the content cannot be read as TAR
     * @throws IOException declared by the interface; failures are treated as "not a TAR"
     */
    @Override
    public String getMimeType(final InputStream in) throws IOException {
        try (final TarArchiveInputStream archive = new TarArchiveInputStream(in)) {
            final TarArchiveEntry head = archive.getNextTarEntry();
            if (head == null) {
                return null;
            }

            final boolean attributesFirst = head.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES);
            if (attributesFirst) {
                final TarArchiveEntry next = archive.getNextTarEntry();
                final boolean contentSecond = next != null && next.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT);
                if (contentSecond) {
                    return "application/flowfile-v1";
                }
            }
            return "application/tar";
        } catch (final Exception ignored) {
            // any failure, including one raised while closing, means "not identified"
        }
        return null;
    }
}
private static interface MagicHeader {
int getRequiredBufferLength();
String getMimeType();
boolean matches(final byte[] header);
}
/**
 * Signature consisting of one fixed byte sequence expected at a fixed offset.
 */
private static class SimpleMagicHeader implements MagicHeader {

    private final String mimeType;
    private final int offset;
    private final byte[] byteSequence;

    /**
     * Creates a signature that must appear at the very start of the content.
     *
     * @param mimeType     the MIME type reported on a match
     * @param byteSequence the bytes to look for
     */
    public SimpleMagicHeader(final String mimeType, final byte[] byteSequence) {
        this(mimeType, byteSequence, 0);
    }

    /**
     * Creates a signature that must appear at the given offset.
     *
     * @param mimeType     the MIME type reported on a match
     * @param byteSequence the bytes to look for
     * @param offset       index of the first byte of the sequence within the content
     */
    public SimpleMagicHeader(final String mimeType, final byte[] byteSequence, final int offset) {
        this.offset = offset;
        this.byteSequence = byteSequence;
        this.mimeType = mimeType;
    }

    /** @return the offset plus the length of the byte sequence */
    @Override
    public int getRequiredBufferLength() {
        return offset + byteSequence.length;
    }

    /** @return the MIME type given at construction */
    @Override
    public String getMimeType() {
        return mimeType;
    }

    /**
     * @param header the leading bytes of the content
     * @return false when the buffer is too short or any byte differs, true otherwise
     */
    @Override
    public boolean matches(final byte[] header) {
        final int needed = getRequiredBufferLength();
        if (header.length < needed) {
            return false;
        }
        int position = offset;
        for (final byte expected : byteSequence) {
            if (header[position] != expected) {
                return false;
            }
            position++;
        }
        return true;
    }
}
/**
 * Signature made of several parts, all of which must match for the compound to match.
 */
private static class CompoundMagicHeader implements MagicHeader {

    private final MagicHeader[] headers;
    private final int requiredLength;
    private final String mimeType;

    /**
     * @param mimeType the MIME type reported when every part matches
     * @param headers  the parts; the required buffer length is the largest one among them
     */
    public CompoundMagicHeader(final String mimeType, final MagicHeader... headers) {
        this.mimeType = mimeType;
        this.headers = headers;

        int longest = 0;
        for (int i = 0; i < headers.length; i++) {
            final int partLength = headers[i].getRequiredBufferLength();
            longest = partLength > longest ? partLength : longest;
        }
        this.requiredLength = longest;
    }

    /** @return the largest buffer length required by any part */
    @Override
    public int getRequiredBufferLength() {
        return requiredLength;
    }

    /** @return the MIME type given at construction */
    @Override
    public String getMimeType() {
        return mimeType;
    }

    /**
     * @param header the leading bytes of the content
     * @return true only when every part matches; evaluation stops at the first mismatch
     */
    @Override
    public boolean matches(final byte[] header) {
        boolean allMatch = true;
        for (int i = 0; allMatch && i < headers.length; i++) {
            allMatch = headers[i].matches(header);
        }
        return allMatch;
    }
}
}

View File

@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<mime-info>
<mime-type type="application/flowfile-v1">
<_comment>NiFi FlowFile V1</_comment>
<sub-class-of type="application/x-tar"/>
<magic>
<match value="flowfile.attributes" type="string" offset="0" />
</magic>
</mime-type>
<mime-type type="application/flowfile-v3">
<_comment>NiFi FlowFile V3</_comment>
<magic>
<match value="NiFiFF3" type="string" offset="0" />
</magic>
</mime-type>
<mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet">
<_comment>Office Open XML Workbook</_comment>
<glob pattern="*.xlsx"/>
<sub-class-of type="application/x-tika-ooxml"/>
<magic priority="60">
<match value="PK\003\004" type="string" offset="0">
<match value="[Content_Types].xml" type="string" offset="30">
<match value="xl/_rels/workbook.xml.rels" type="string" offset="30:4096"/>
</match>
</match>
</magic>
</mime-type>
<mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document">
<_comment>Office Open XML Document</_comment>
<glob pattern="*.docx"/>
<sub-class-of type="application/x-tika-ooxml"/>
<magic priority="60">
<match value="PK\003\004" type="string" offset="0">
<match value="[Content_Types].xml" type="string" offset="30">
<match value="word/_rels/document.xml.rels" type="string" offset="30:4096"/>
</match>
</match>
</magic>
</mime-type>
<mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
<_comment>Office Open XML Presentation</_comment>
<glob pattern="*.pptx"/>
<glob pattern="*.thmx"/>
<sub-class-of type="application/x-tika-ooxml"/>
<magic priority="60">
<match value="PK\003\004" type="string" offset="0">
<match value="[Content_Types].xml" type="string" offset="30">
<match value="ppt/slides/_rels/slide" type="string" offset="30:4096"/>
</match>
</match>
</magic>
</mime-type>
<mime-type type="application/java-archive">
<_comment>Java Archive</_comment>
<tika:link>http://en.wikipedia.org/wiki/.jar</tika:link>
<tika:uti>com.sun.java-archive</tika:uti>
<sub-class-of type="application/zip"/>
<glob pattern="*.jar"/>
<magic priority="50">
<match value="PK\003\004" type="string" offset="0">
<match value="META-INF/MANIFEST.MF" type="string" offset="0:1024"/>
</match>
</magic>
</mime-type>
<!-- Override Tika's default detection of GNU tar archives so that they are reported as
application/x-tar (*.tar); the .gtar extension is rarely used in practice. -->
<mime-type type="application/x-tar">
<_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
<magic priority="60">
<!-- GNU tar archive -->
<match value="ustar \0" type="string" offset="257" />
</magic>
<glob pattern="*.tar"/>
</mime-type>
</mime-info>

View File

@ -57,75 +57,59 @@ public class TestIdentifyMimeType {
final Map<String, String> expectedMimeTypes = new HashMap<>();
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
expectedMimeTypes.put("1.mdb", "application/msaccess");
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
expectedMimeTypes.put("1.mdb", "application/x-msaccess");
expectedMimeTypes.put("1.txt", "text/plain");
expectedMimeTypes.put("1.txt.bz2", "application/x-bzip2");
expectedMimeTypes.put("1.txt.gz", "application/gzip");
expectedMimeTypes.put("1.zip", "application/zip");
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpeg");
expectedMimeTypes.put("1.pdf", "application/pdf");
expectedMimeTypes.put("grid.gif", "image/gif");
expectedMimeTypes.put("1.tar", "application/octet-stream"); //wrong ID without IDENTIFY_TAR
expectedMimeTypes.put("1.jar", "application/java-archive");
expectedMimeTypes.put("1.xml", "application/xml");
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
expectedMimeTypes.put("flowfilev1.tar", "application/tar"); //wrong ID without IDENTIFY_TAR
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
for (final MockFlowFile file : filesOut) {
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
final String expected = expectedMimeTypes.get(filename);
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
}
}
@Test
public void testFilesWithIdentifyTarAndZip() throws IOException {
final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
runner.setProperty(IdentifyMimeType.IDENTIFY_TAR.getName(), "true");
runner.setProperty(IdentifyMimeType.IDENTIFY_ZIP.getName(), "true");
final File dir = new File("src/test/resources/TestIdentifyMimeType");
final File[] files = dir.listFiles();
int fileCount = 0;
for (final File file : files) {
if (file.isDirectory()) {
continue;
}
runner.enqueue(file.toPath());
fileCount++;
}
runner.setThreadCount(4);
runner.run(fileCount);
runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount);
final Map<String, String> expectedMimeTypes = new HashMap<>();
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
expectedMimeTypes.put("1.mdb", "application/msaccess");
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
expectedMimeTypes.put("1.txt.gz", "application/gzip");
expectedMimeTypes.put("1.zip", "application/zip");
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
expectedMimeTypes.put("1.pdf", "application/pdf");
expectedMimeTypes.put("grid.gif", "image/gif");
expectedMimeTypes.put("1.tar", "application/tar");
expectedMimeTypes.put("1.tar", "application/x-tar");
expectedMimeTypes.put("1.tar.gz", "application/gzip");
expectedMimeTypes.put("1.jar", "application/java-archive");
expectedMimeTypes.put("1.xml", "application/xml");
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
// Office documents below randomly selected from govdocs1:
// http://digitalcorpora.org/corpora/govdocs
expectedMimeTypes.put("651924.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
expectedMimeTypes.put("528206.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
expectedMimeTypes.put("392790.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
final Map<String, String> expectedExtensions = new HashMap<>();
expectedExtensions.put("1.7z", ".7z");
expectedExtensions.put("1.mdb", ".mdb");
expectedExtensions.put("1.txt", ".txt");
expectedExtensions.put("1.txt.bz2", ".bz2");
expectedExtensions.put("1.txt.gz", ".gz");
expectedExtensions.put("1.zip", ".zip");
expectedExtensions.put("bgBannerFoot.png", ".png");
expectedExtensions.put("blueBtnBg.jpg", ".jpg");
expectedExtensions.put("1.pdf", ".pdf");
expectedExtensions.put("grid.gif", ".gif");
expectedExtensions.put("1.tar", ".tar");
expectedExtensions.put("1.tar.gz", ".gz");
expectedExtensions.put("1.jar", ".jar");
expectedExtensions.put("1.xml", ".xml");
expectedExtensions.put("flowfilev3", "");
expectedExtensions.put("flowfilev1.tar", "");
expectedExtensions.put("651924.docx", ".docx");
expectedExtensions.put("528206.xlsx", ".xlsx");
expectedExtensions.put("392790.pptx", ".pptx");
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
for (final MockFlowFile file : filesOut) {
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
final String expected = expectedMimeTypes.get(filename);
final String extension = file.getAttribute("mime.extension");
final String expectedExtension = expectedExtensions.get(filename);
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
}
}
}