NIFI-296: Extend capability of IdentifyMimeType

This commit backs IdentifyMimeType with the Apache Tika library. Tika
provides detailed MIME type identification, such as the ability to
differentiate ordinary ZIP files from OOXML MS Office documents.

The mime.type attribute continues to be set, though some reported values
have changed because Tika names them differently (for example, image/jpg is
now image/jpeg and application/bzip2 is now application/x-bzip2). In addition,
the mime.extension attribute is set to the commonly used file extension for
the detected MIME type, if one is known.
This commit is contained in:
Adam Lamar 2015-02-14 20:57:41 +00:00
parent dde5fd51a4
commit 16fb2b826c
5 changed files with 97 additions and 399 deletions

View File

@ -155,5 +155,16 @@
<artifactId>nifi-ssl-context-service</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<!-- tika-core classes (TikaConfig, Detector, MediaType, ...) are imported directly by
IdentifyMimeType, so depend on the jar itself (the default type) rather than on the POM artifact. -->
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.7</version>
</dependency>
</dependencies>
</project>

View File

@ -20,13 +20,14 @@ import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ProcessorLog;
@ -44,50 +45,31 @@ import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.util.FlowFilePackagerV1;
import org.apache.nifi.util.FlowFilePackagerV3;
import org.apache.nifi.util.ObjectHolder;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.MagicDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
/**
* <p>
* Attempts to detect the MIME Type of a FlowFile by examining its contents. If
* the MIME Type is determined, it is added to an attribute with the name
* mime.type
* mime.type. In addition, mime.extension is set if a common file extension is known.
* </p>
*
* <p>
* The following MIME Types are supported:
* MIME Type detection is performed by Apache Tika; more information about
* detection is available at http://tika.apache.org.
*
* <ul>
* <li>application/gzip</li>
* <li>application/bzip2</li>
* <li>application/flowfile-v3</li>
* <li>application/flowfile-v1 (requires Identify TAR be set to true)</li>
* <li>application/xml</li>
* <li>video/mp4</li>
* <li>video/x-m4v</li>
* <li>video/mp4a-latm</li>
* <li>video/quicktime</li>
* <li>video/mpeg</li>
* <li>audio/wav</li>
* <li>audio/mp3</li>
* <li>image/bmp</li>
* <li>image/png</li>
* <li>image/jpg</li>
* <li>image/gif</li>
* <li>image/tif</li>
* <li>application/vnd.ms-works</li>
* <li>application/msexcel</li>
* <li>application/mspowerpoint</li>
* <li>application/msaccess</li>
* <li>application/x-ms-wmv</li>
* <li>application/pdf</li>
* <li>application/x-rpm</li>
* <li>application/tar</li>
* <li>application/x-7z-compressed</li>
* <li>application/java-archive</li>
* <li>application/zip</li>
* <li>application/x-lzh</li>
* <li>application/flowfile-v1</li>
* </ul>
* </p>
*/
@ -97,128 +79,40 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
@Tags({"compression", "gzip", "bzip2", "zip", "MIME", "mime.type", "file", "identify"})
@CapabilityDescription("Attempts to identify the MIME Type used for a FlowFile. If the MIME Type can be identified, "
+ "an attribute with the name 'mime.type' is added with the value being the MIME Type. If the MIME Type cannot be determined, "
+ "the value will be set to 'application/octet-stream'. Some MIME Types require reading a significant amount of data; for these MIME Types, their identification "
+ "is optional. The algorithm may have to read the entire contents of the file for each type of identification.")
+ "the value will be set to 'application/octet-stream'. In addition, the attribute mime.extension will be set if a common file "
+ "extension for the MIME Type is known.")
public class IdentifyMimeType extends AbstractProcessor {
public static final PropertyDescriptor IDENTIFY_ZIP = new PropertyDescriptor.Builder()
.name("Identify ZIP")
.description("Determines whether or not to attempt in depth identification of ZIP MIME types")
.required(true)
.allowableValues("true", "false")
.defaultValue("false")
.build();
public static final PropertyDescriptor IDENTIFY_TAR = new PropertyDescriptor.Builder()
.name("Identify TAR")
.description("Determines whether or not to attempt in depth identification of TAR MIME types")
.required(true)
.allowableValues("true", "false")
.defaultValue("false")
.build();
public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success").description("All FlowFiles are routed to success").build();
private Set<Relationship> relationships;
private List<PropertyDescriptor> properties;
public static final MediaType FLOWFILE_V1 = new MediaType("application", "flowfile-v1");
public static final MediaType FLOWFILE_V3 = new MediaType("application", "flowfile-v3");
private final List<MagicHeader> magicHeaders;
private final List<MagicHeader> zipMagicHeaders;
private final List<MagicHeader> tarMagicHeaders;
private final List<ContentScanningMimeTypeIdentifier> contentScanners;
private final int magicHeaderMaxLength;
private Set<Relationship> relationships;
private final TikaConfig config;
private final Detector detector;
public IdentifyMimeType() {
// compile a list of Magic Header detectors
final List<MagicHeader> headers = new ArrayList<>();
headers.add(new SimpleMagicHeader("application/gzip", new byte[]{0x1f, (byte) 0x8b}));
headers.add(new SimpleMagicHeader("application/bzip2", new byte[]{0x42, 0x5a}));
headers.add(new SimpleMagicHeader("application/flowfile-v3", FlowFilePackagerV3.MAGIC_HEADER));
headers.add(new SimpleMagicHeader("application/xml", new byte[]{0x3c, 0x3f, 0x78, 0x6d, 0x6c, 0x20}));
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6F, 0x6D}));
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x33, 0x67, 0x70, 0x35}));
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0X4d, 0X53, 0X4e, 0X56, 0X01, 0X29, 0, 0X46, 0X4d, 0X53, 0X4e, 0X56, 0X6d, 0X70, 0X34, 0X32}));
headers.add(new SimpleMagicHeader("video/x-m4v", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x6D, 0x70, 0x34, 0x32}));
headers.add(new SimpleMagicHeader("video/mp4a-latm", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x4D, 0x34, 0x41, 0x20}));
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x71, 0x74, 0x20, 0x20}));
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0x6D, 0x6F, 0x6F, 0x76}, 4));
headers.add(new SimpleMagicHeader("audio/mp3", new byte[]{0x49, 0x44, 0x33}));
headers.add(new SimpleMagicHeader("image/bmp", new byte[]{0x42, 0x4D}));
headers.add(new SimpleMagicHeader("image/png", new byte[]{(byte) 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}));
headers.add(new SimpleMagicHeader("image/jpg", new byte[]{(byte) 0xFF, (byte) 0xD8, (byte) 0xFF}));
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x37, 0x61}));
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x39, 0x61}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x20, 0x49}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x49, 0x2A, 0x00}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2A}));
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2B}));
headers.add(new SimpleMagicHeader("application/vnd.ms-works", new byte[]{(byte) 0xFF, 0x00, 0x02, 0x00, 0x04, 0x04, 0x05, 0x54, 0x02, 0x00}));
headers.add(new SimpleMagicHeader("application/msexcel", new byte[]{0x09, 0x08, 0x10, 0, 0, 0x06, 0x05, 0}, 512));
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x00, 0x6E, 0x1E, (byte) 0xF0}, 512));
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x0F, 0x00, (byte) 0xE8, 0x03}, 512));
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{(byte) 0xA0, 0x46, 0x1D, (byte) 0xF0}, 512));
headers.add(new CompoundMagicHeader("application/mspowerpoint",
new SimpleMagicHeader("", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}, 512),
new SimpleMagicHeader("", new byte[]{0x00, 0x00, 0x00}, 517)));
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x41, 0x43, 0x45, 0x20, 0x44, 0x42}));
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x4A, 0x65, 0x74, 0x20, 0x44, 0x42}));
for (byte b : new byte[]{0x10, 0x1F, 0x22, 0x23, 0x28, 0x29}) {
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x00}, 512));
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x02}, 512));
}
headers.add(new SimpleMagicHeader("application/x-ms-wmv", new byte[]{0x30, 0x26, (byte) 0xB2, 0x75, (byte) 0x8E, 0x66, (byte) 0xCF, 0x11, (byte) 0xA6, (byte) 0xD9, 0x00, (byte) 0xAA, 0x00, 0x62, (byte) 0xCE, 0x6C}));
headers.add(new SimpleMagicHeader("application/pdf", new byte[]{0x25, 0x50, 0x44, 0x46}));
headers.add(new SimpleMagicHeader("application/x-rpm", new byte[]{(byte) 0xED, (byte) 0xAB, (byte) 0xEE, (byte) 0xDB}));
headers.add(new SimpleMagicHeader("application/x-7z-compressed", new byte[]{0x37, 0x7A, (byte) 0xBC, (byte) 0xAF, 0x27, 0x1C}));
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x4A, 0x41, 0x52, 0x43, 0x53, 0x00}));
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x08}));
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, (byte) 0xA0, 0x00, 0x00}));
headers.add(new SimpleMagicHeader("application/x-lzh", new byte[]{0x2D, 0x6C, 0x68}, 2));
headers.add(new CompoundMagicHeader("audio/wav",
new SimpleMagicHeader("", new byte[]{0x52, 0x49, 0x46, 0x46}),
new SimpleMagicHeader("", new byte[]{0x57, 0x41, 0x56, 0x45, 0x66, 0x6D, 0x74, 0x20}, 8)));
for (int nibble = 0xB0; nibble <= 0xBF; nibble++) {
headers.add(new SimpleMagicHeader("video/mpeg", new byte[]{0x00, 0x00, 0x01, (byte) nibble}));
}
this.magicHeaders = Collections.unmodifiableList(headers);
// Setup Tika
this.config = TikaConfig.getDefaultConfig();
DefaultDetector ddetector = new DefaultDetector();
// additional Magic Header detectors that will be turned off based on property settings
final List<MagicHeader> zipHeaders = new ArrayList<>();
zipHeaders.add(new SimpleMagicHeader("application/zip", new byte[]{0x50, 0x4B, 0x03, 0x04}));
this.zipMagicHeaders = Collections.unmodifiableList(zipHeaders);
final List<MagicHeader> tarHeaders = new ArrayList<>();
tarHeaders.add(new SimpleMagicHeader("application/tar", new byte[]{0x75, 0x73, 0x74, 0x61, 0x72}, 257));
this.tarMagicHeaders = Collections.unmodifiableList(tarHeaders);
// Create list of detectors, preferring our custom detectors first
List<Detector> detectors = new ArrayList<>();
detectors.add(getFlowFileV3Detector());
detectors.add(getFlowFileV1Detector());
detectors.addAll(ddetector.getDetectors());
// determine the max length that we need to buffer for magic headers
int max = 0;
for (final MagicHeader header : magicHeaders) {
max = Math.max(max, header.getRequiredBufferLength());
}
for (final MagicHeader header : zipMagicHeaders) {
max = Math.max(max, header.getRequiredBufferLength());
}
for (final MagicHeader header : tarMagicHeaders) {
max = Math.max(max, header.getRequiredBufferLength());
}
this.magicHeaderMaxLength = max;
// create list of Content Scanners
final List<ContentScanningMimeTypeIdentifier> scanningIdentifiers = new ArrayList<>();
scanningIdentifiers.add(new ZipIdentifier());
scanningIdentifiers.add(new TarIdentifier());
this.contentScanners = Collections.unmodifiableList(scanningIdentifiers);
CompositeDetector compositeDetector = new CompositeDetector(detectors);
this.detector = compositeDetector;
}
@Override
protected void init(final ProcessorInitializationContext context) {
final Set<Relationship> relationships = new HashSet<>();
relationships.add(REL_SUCCESS);
this.relationships = Collections.unmodifiableSet(relationships);
final List<PropertyDescriptor> properties = new ArrayList<>();
properties.add(IDENTIFY_ZIP);
properties.add(IDENTIFY_TAR);
this.properties = Collections.unmodifiableList(properties);
final Set<Relationship> rels = new HashSet<>();
rels.add(REL_SUCCESS);
this.relationships = Collections.unmodifiableSet(rels);
}
@Override
@ -226,11 +120,6 @@ public class IdentifyMimeType extends AbstractProcessor {
return relationships;
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return properties;
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
FlowFile flowFile = session.get();
@ -239,87 +128,39 @@ public class IdentifyMimeType extends AbstractProcessor {
}
final ProcessorLog logger = getLogger();
final boolean identifyZip = context.getProperty(IDENTIFY_ZIP).asBoolean();
final boolean identifyTar = context.getProperty(IDENTIFY_TAR).asBoolean();
final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null);
final ObjectHolder<String> extensionRef = new ObjectHolder<>(null);
session.read(flowFile, new InputStreamCallback() {
@Override
public void process(final InputStream stream) throws IOException {
try (final InputStream in = new BufferedInputStream(stream)) {
// read in up to magicHeaderMaxLength bytes
in.mark(magicHeaderMaxLength);
byte[] header = new byte[magicHeaderMaxLength];
for (int i = 0; i < header.length; i++) {
final int next = in.read();
if (next >= 0) {
header[i] = (byte) next;
} else if (i == 0) {
header = new byte[0];
} else {
final byte[] newBuffer = new byte[i - 1];
System.arraycopy(header, 0, newBuffer, 0, i - 1);
header = newBuffer;
break;
}
}
in.reset();
for (final MagicHeader magicHeader : magicHeaders) {
if (magicHeader.matches(header)) {
mimeTypeRef.set(magicHeader.getMimeType());
return;
}
}
if (!identifyZip) {
for (final MagicHeader magicHeader : zipMagicHeaders) {
if (magicHeader.matches(header)) {
mimeTypeRef.set(magicHeader.getMimeType());
return;
}
}
}
if (!identifyTar) {
for (final MagicHeader magicHeader : tarMagicHeaders) {
if (magicHeader.matches(header)) {
mimeTypeRef.set(magicHeader.getMimeType());
return;
}
}
TikaInputStream tikaStream = TikaInputStream.get(in);
Metadata metadata = new Metadata();
// Get mime type
MediaType mediatype = detector.detect(tikaStream, metadata);
mimeTypeRef.set(mediatype.toString());
// Get common file extension
try {
MimeType mimetype;
mimetype = config.getMimeRepository().forName(mediatype.toString());
extensionRef.set(mimetype.getExtension());
} catch (MimeTypeException ex) {
logger.warn("MIME type detection failed: {}", new Object[]{ex.toString()});
}
}
}
});
String mimeType = mimeTypeRef.get();
if (mimeType == null) {
for (final ContentScanningMimeTypeIdentifier scanningIdentifier : this.contentScanners) {
if (scanningIdentifier.isEnabled(context)) {
session.read(flowFile, new InputStreamCallback() {
@Override
public void process(final InputStream in) throws IOException {
String mimeType = scanningIdentifier.getMimeType(in);
if (mimeType != null) {
mimeTypeRef.set(mimeType);
}
}
});
if (mimeTypeRef.get() != null) {
break;
}
}
}
}
mimeType = mimeTypeRef.get();
String extension = extensionRef.get();
if (mimeType == null) {
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
flowFile = session.putAttribute(flowFile, "mime.extension", "");
logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile});
} else {
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
flowFile = session.putAttribute(flowFile, "mime.extension", extension);
logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType});
}
@ -327,148 +168,41 @@ public class IdentifyMimeType extends AbstractProcessor {
session.transfer(flowFile, REL_SUCCESS);
}
private static interface ContentScanningMimeTypeIdentifier {
boolean isEnabled(ProcessContext context);
String getMimeType(InputStream in) throws IOException;
/**
* Creates the detector for the FlowFile v3 package format: a Tika
* MagicDetector constructed with the FLOWFILE_V3 media type
* (application/flowfile-v3) and the FlowFilePackagerV3.MAGIC_HEADER bytes.
*
* @return a new detector for the FlowFile v3 media type
*/
private Detector getFlowFileV3Detector() {
return new MagicDetector(FLOWFILE_V3, FlowFilePackagerV3.MAGIC_HEADER);
}
private static class ZipIdentifier implements ContentScanningMimeTypeIdentifier {
/**
* Creates the detector for the FlowFile v1 package format.
*
* @return a new FlowFileV1Detector instance
*/
private Detector getFlowFileV1Detector() {
return new FlowFileV1Detector();
}
private class FlowFileV1Detector implements Detector {
@Override
public String getMimeType(final InputStream in) throws IOException {
final ZipInputStream zipIn = new ZipInputStream(in);
try {
if (zipIn.getNextEntry() != null) {
return "application/zip";
}
} catch (final Exception e) {
public MediaType detect(InputStream in, Metadata mtdt) throws IOException {
// Sanity check the stream. This may not be a tarfile at all
in.mark(FlowFilePackagerV1.FILENAME_ATTRIBUTES.length());
byte[] bytes = new byte[FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()];
in.read(bytes);
in.reset();
// Quick exit if the first filename is not correct
if (!Arrays.equals(bytes, FlowFilePackagerV1.FILENAME_ATTRIBUTES.getBytes())) {
return MediaType.OCTET_STREAM;
}
return null;
}
@Override
public boolean isEnabled(final ProcessContext context) {
return context.getProperty(IDENTIFY_ZIP).asBoolean();
}
}
private static class TarIdentifier implements ContentScanningMimeTypeIdentifier {
@Override
public String getMimeType(final InputStream in) throws IOException {
try (final TarArchiveInputStream tarIn = new TarArchiveInputStream(in)) {
final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
if (firstEntry != null) {
if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
return "application/flowfile-v1";
}
// More in-depth detection
final TarArchiveInputStream tarIn = new TarArchiveInputStream(in);
final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
if (firstEntry != null) {
if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
return FLOWFILE_V1;
}
return "application/tar";
}
} catch (final Exception e) {
}
return null;
}
@Override
public boolean isEnabled(final ProcessContext context) {
return context.getProperty(IDENTIFY_TAR).asBoolean();
}
}
private static interface MagicHeader {
int getRequiredBufferLength();
String getMimeType();
boolean matches(final byte[] header);
}
private static class SimpleMagicHeader implements MagicHeader {
private final String mimeType;
private final int offset;
private final byte[] byteSequence;
public SimpleMagicHeader(final String mimeType, final byte[] byteSequence) {
this(mimeType, byteSequence, 0);
}
public SimpleMagicHeader(final String mimeType, final byte[] byteSequence, final int offset) {
this.mimeType = mimeType;
this.byteSequence = byteSequence;
this.offset = offset;
}
@Override
public int getRequiredBufferLength() {
return byteSequence.length + offset;
}
@Override
public String getMimeType() {
return mimeType;
}
@Override
public boolean matches(final byte[] header) {
if (header.length < getRequiredBufferLength()) {
return false;
}
for (int i = 0; i < byteSequence.length; i++) {
if (byteSequence[i] != header[offset + i]) {
return false;
}
}
return true;
return MediaType.OCTET_STREAM;
}
}
private static class CompoundMagicHeader implements MagicHeader {
private final MagicHeader[] headers;
private final int requiredLength;
private final String mimeType;
public CompoundMagicHeader(final String mimeType, final MagicHeader... headers) {
this.mimeType = mimeType;
this.headers = headers;
int max = 0;
for (final MagicHeader header : headers) {
max = Math.max(max, header.getRequiredBufferLength());
}
this.requiredLength = max;
}
@Override
public int getRequiredBufferLength() {
return requiredLength;
}
@Override
public String getMimeType() {
return mimeType;
}
@Override
public boolean matches(final byte[] header) {
for (final MagicHeader mh : headers) {
if (!mh.matches(header)) {
return false;
}
}
return true;
}
}
}

View File

@ -57,63 +57,16 @@ public class TestIdentifyMimeType {
final Map<String, String> expectedMimeTypes = new HashMap<>();
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
expectedMimeTypes.put("1.mdb", "application/msaccess");
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
expectedMimeTypes.put("1.mdb", "application/x-msaccess");
expectedMimeTypes.put("1.txt", "text/plain");
expectedMimeTypes.put("1.txt.bz2", "application/x-bzip2");
expectedMimeTypes.put("1.txt.gz", "application/gzip");
expectedMimeTypes.put("1.zip", "application/zip");
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpeg");
expectedMimeTypes.put("1.pdf", "application/pdf");
expectedMimeTypes.put("grid.gif", "image/gif");
expectedMimeTypes.put("1.tar", "application/octet-stream"); //wrong ID without IDENTIFY_TAR
expectedMimeTypes.put("1.jar", "application/java-archive");
expectedMimeTypes.put("1.xml", "application/xml");
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
expectedMimeTypes.put("flowfilev1.tar", "application/tar"); //wrong ID without IDENTIFY_TAR
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
for (final MockFlowFile file : filesOut) {
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
final String expected = expectedMimeTypes.get(filename);
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
}
}
@Test
public void testFilesWithIdentifyTarAndZip() throws IOException {
final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
runner.setProperty(IdentifyMimeType.IDENTIFY_TAR.getName(), "true");
runner.setProperty(IdentifyMimeType.IDENTIFY_ZIP.getName(), "true");
final File dir = new File("src/test/resources/TestIdentifyMimeType");
final File[] files = dir.listFiles();
int fileCount = 0;
for (final File file : files) {
if (file.isDirectory()) {
continue;
}
runner.enqueue(file.toPath());
fileCount++;
}
runner.setThreadCount(4);
runner.run(fileCount);
runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount);
final Map<String, String> expectedMimeTypes = new HashMap<>();
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
expectedMimeTypes.put("1.mdb", "application/msaccess");
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
expectedMimeTypes.put("1.txt.gz", "application/gzip");
expectedMimeTypes.put("1.zip", "application/zip");
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
expectedMimeTypes.put("1.pdf", "application/pdf");
expectedMimeTypes.put("grid.gif", "image/gif");
expectedMimeTypes.put("1.tar", "application/tar");
expectedMimeTypes.put("1.tar", "application/x-tar");
expectedMimeTypes.put("1.jar", "application/java-archive");
expectedMimeTypes.put("1.xml", "application/xml");
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
@ -127,5 +80,4 @@ public class TestIdentifyMimeType {
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
}
}
}