mirror of
https://github.com/apache/nifi.git
synced 2025-02-28 06:29:25 +00:00
NIFI-296: Extend capability of IdentifyMimeType
This commit backs IdentifyMimeType with the Apache Tika library. Tika provides detailed MIME type identification, such as the ability to differentiate ordinary ZIP files from OOXML MS Office documents. The mime.type attribute continues to be set, though some MIME type values have changed because Tika names them differently. In addition, the mime.extension attribute is set to the commonly used file extension for the MIME type (if known).
This commit is contained in:
parent
dde5fd51a4
commit
16fb2b826c
@ -155,5 +155,16 @@
|
||||
<artifactId>nifi-ssl-context-service</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-core</artifactId>
|
||||
<version>1.7</version>
|
||||
<type>pom</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-parsers</artifactId>
|
||||
<version>1.7</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
@ -20,13 +20,14 @@ import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipInputStream;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
|
||||
import org.apache.nifi.components.PropertyDescriptor;
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||
import org.apache.nifi.logging.ProcessorLog;
|
||||
@ -44,50 +45,31 @@ import org.apache.nifi.processor.io.InputStreamCallback;
|
||||
import org.apache.nifi.util.FlowFilePackagerV1;
|
||||
import org.apache.nifi.util.FlowFilePackagerV3;
|
||||
import org.apache.nifi.util.ObjectHolder;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.detect.CompositeDetector;
|
||||
import org.apache.tika.detect.DefaultDetector;
|
||||
import org.apache.tika.detect.Detector;
|
||||
import org.apache.tika.detect.MagicDetector;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.mime.MimeType;
|
||||
import org.apache.tika.mime.MimeTypeException;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Attempts to detect the MIME Type of a FlowFile by examining its contents. If
|
||||
* the MIME Type is determined, it is added to an attribute with the name
|
||||
* mime.type
|
||||
* mime.type. In addition, mime.extension is set if a common file extension is known.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The following MIME Types are supported:
|
||||
* MIME Type detection is performed by Apache Tika; more information about
|
||||
* detection is available at http://tika.apache.org.
|
||||
*
|
||||
* <ul>
|
||||
* <li>application/gzip</li>
|
||||
* <li>application/bzip2</li>
|
||||
* <li>application/flowfile-v3</li>
|
||||
* <li>application/flowfile-v1 (requires Identify TAR be set to true)</li>
|
||||
* <li>application/xml</li>
|
||||
* <li>video/mp4</li>
|
||||
* <li>video/x-m4v</li>
|
||||
* <li>video/mp4a-latm</li>
|
||||
* <li>video/quicktime</li>
|
||||
* <li>video/mpeg</li>
|
||||
* <li>audio/wav</li>
|
||||
* <li>audio/mp3</li>
|
||||
* <li>image/bmp</li>
|
||||
* <li>image/png</li>
|
||||
* <li>image/jpg</li>
|
||||
* <li>image/gif</li>
|
||||
* <li>image/tif</li>
|
||||
* <li>application/vnd.ms-works</li>
|
||||
* <li>application/msexcel</li>
|
||||
* <li>application/mspowerpoint</li>
|
||||
* <li>application/msaccess</li>
|
||||
* <li>application/x-ms-wmv</li>
|
||||
* <li>application/pdf</li>
|
||||
* <li>application/x-rpm</li>
|
||||
* <li>application/tar</li>
|
||||
* <li>application/x-7z-compressed</li>
|
||||
* <li>application/java-archive</li>
|
||||
* <li>application/zip</li>
|
||||
* <li>application/x-lzh</li>
|
||||
* <li>application/flowfile-v1</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
@ -97,128 +79,40 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
@Tags({"compression", "gzip", "bzip2", "zip", "MIME", "mime.type", "file", "identify"})
|
||||
@CapabilityDescription("Attempts to identify the MIME Type used for a FlowFile. If the MIME Type can be identified, "
|
||||
+ "an attribute with the name 'mime.type' is added with the value being the MIME Type. If the MIME Type cannot be determined, "
|
||||
+ "the value will be set to 'application/octet-stream'. Some MIME Types require reading a significant amount of data; for these MIME Types, their identification "
|
||||
+ "is optional. The algorithm may have to read the entire contents of the file for each type of identification.")
|
||||
+ "the value will be set to 'application/octet-stream'. In addition, the attribute mime.extension will be set if a common file "
|
||||
+ "extension for the MIME Type is known.")
|
||||
public class IdentifyMimeType extends AbstractProcessor {
|
||||
|
||||
public static final PropertyDescriptor IDENTIFY_ZIP = new PropertyDescriptor.Builder()
|
||||
.name("Identify ZIP")
|
||||
.description("Determines whether or not to attempt in depth identification of ZIP MIME types")
|
||||
.required(true)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("false")
|
||||
.build();
|
||||
public static final PropertyDescriptor IDENTIFY_TAR = new PropertyDescriptor.Builder()
|
||||
.name("Identify TAR")
|
||||
.description("Determines whether or not to attempt in depth identification of TAR MIME types")
|
||||
.required(true)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("false")
|
||||
.build();
|
||||
|
||||
public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success").description("All FlowFiles are routed to success").build();
|
||||
|
||||
private Set<Relationship> relationships;
|
||||
private List<PropertyDescriptor> properties;
|
||||
public static final MediaType FLOWFILE_V1 = new MediaType("application", "flowfile-v1");
|
||||
public static final MediaType FLOWFILE_V3 = new MediaType("application", "flowfile-v3");
|
||||
|
||||
private final List<MagicHeader> magicHeaders;
|
||||
private final List<MagicHeader> zipMagicHeaders;
|
||||
private final List<MagicHeader> tarMagicHeaders;
|
||||
private final List<ContentScanningMimeTypeIdentifier> contentScanners;
|
||||
private final int magicHeaderMaxLength;
|
||||
private Set<Relationship> relationships;
|
||||
|
||||
private final TikaConfig config;
|
||||
private final Detector detector;
|
||||
|
||||
public IdentifyMimeType() {
|
||||
// compile a list of Magic Header detectors
|
||||
final List<MagicHeader> headers = new ArrayList<>();
|
||||
headers.add(new SimpleMagicHeader("application/gzip", new byte[]{0x1f, (byte) 0x8b}));
|
||||
headers.add(new SimpleMagicHeader("application/bzip2", new byte[]{0x42, 0x5a}));
|
||||
headers.add(new SimpleMagicHeader("application/flowfile-v3", FlowFilePackagerV3.MAGIC_HEADER));
|
||||
headers.add(new SimpleMagicHeader("application/xml", new byte[]{0x3c, 0x3f, 0x78, 0x6d, 0x6c, 0x20}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6F, 0x6D}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x33, 0x67, 0x70, 0x35}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0X4d, 0X53, 0X4e, 0X56, 0X01, 0X29, 0, 0X46, 0X4d, 0X53, 0X4e, 0X56, 0X6d, 0X70, 0X34, 0X32}));
|
||||
headers.add(new SimpleMagicHeader("video/x-m4v", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x6D, 0x70, 0x34, 0x32}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4a-latm", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x4D, 0x34, 0x41, 0x20}));
|
||||
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x71, 0x74, 0x20, 0x20}));
|
||||
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0x6D, 0x6F, 0x6F, 0x76}, 4));
|
||||
headers.add(new SimpleMagicHeader("audio/mp3", new byte[]{0x49, 0x44, 0x33}));
|
||||
headers.add(new SimpleMagicHeader("image/bmp", new byte[]{0x42, 0x4D}));
|
||||
headers.add(new SimpleMagicHeader("image/png", new byte[]{(byte) 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}));
|
||||
headers.add(new SimpleMagicHeader("image/jpg", new byte[]{(byte) 0xFF, (byte) 0xD8, (byte) 0xFF}));
|
||||
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x37, 0x61}));
|
||||
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x39, 0x61}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x20, 0x49}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x49, 0x2A, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2A}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2B}));
|
||||
headers.add(new SimpleMagicHeader("application/vnd.ms-works", new byte[]{(byte) 0xFF, 0x00, 0x02, 0x00, 0x04, 0x04, 0x05, 0x54, 0x02, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("application/msexcel", new byte[]{0x09, 0x08, 0x10, 0, 0, 0x06, 0x05, 0}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x00, 0x6E, 0x1E, (byte) 0xF0}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x0F, 0x00, (byte) 0xE8, 0x03}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{(byte) 0xA0, 0x46, 0x1D, (byte) 0xF0}, 512));
|
||||
headers.add(new CompoundMagicHeader("application/mspowerpoint",
|
||||
new SimpleMagicHeader("", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}, 512),
|
||||
new SimpleMagicHeader("", new byte[]{0x00, 0x00, 0x00}, 517)));
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x41, 0x43, 0x45, 0x20, 0x44, 0x42}));
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x4A, 0x65, 0x74, 0x20, 0x44, 0x42}));
|
||||
for (byte b : new byte[]{0x10, 0x1F, 0x22, 0x23, 0x28, 0x29}) {
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x00}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x02}, 512));
|
||||
}
|
||||
headers.add(new SimpleMagicHeader("application/x-ms-wmv", new byte[]{0x30, 0x26, (byte) 0xB2, 0x75, (byte) 0x8E, 0x66, (byte) 0xCF, 0x11, (byte) 0xA6, (byte) 0xD9, 0x00, (byte) 0xAA, 0x00, 0x62, (byte) 0xCE, 0x6C}));
|
||||
headers.add(new SimpleMagicHeader("application/pdf", new byte[]{0x25, 0x50, 0x44, 0x46}));
|
||||
headers.add(new SimpleMagicHeader("application/x-rpm", new byte[]{(byte) 0xED, (byte) 0xAB, (byte) 0xEE, (byte) 0xDB}));
|
||||
headers.add(new SimpleMagicHeader("application/x-7z-compressed", new byte[]{0x37, 0x7A, (byte) 0xBC, (byte) 0xAF, 0x27, 0x1C}));
|
||||
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x4A, 0x41, 0x52, 0x43, 0x53, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x08}));
|
||||
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, (byte) 0xA0, 0x00, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("application/x-lzh", new byte[]{0x2D, 0x6C, 0x68}, 2));
|
||||
headers.add(new CompoundMagicHeader("audio/wav",
|
||||
new SimpleMagicHeader("", new byte[]{0x52, 0x49, 0x46, 0x46}),
|
||||
new SimpleMagicHeader("", new byte[]{0x57, 0x41, 0x56, 0x45, 0x66, 0x6D, 0x74, 0x20}, 8)));
|
||||
for (int nibble = 0xB0; nibble <= 0xBF; nibble++) {
|
||||
headers.add(new SimpleMagicHeader("video/mpeg", new byte[]{0x00, 0x00, 0x01, (byte) nibble}));
|
||||
}
|
||||
this.magicHeaders = Collections.unmodifiableList(headers);
|
||||
// Setup Tika
|
||||
this.config = TikaConfig.getDefaultConfig();
|
||||
DefaultDetector ddetector = new DefaultDetector();
|
||||
|
||||
// additional Magic Header detectors that will be turned off based on property settings
|
||||
final List<MagicHeader> zipHeaders = new ArrayList<>();
|
||||
zipHeaders.add(new SimpleMagicHeader("application/zip", new byte[]{0x50, 0x4B, 0x03, 0x04}));
|
||||
this.zipMagicHeaders = Collections.unmodifiableList(zipHeaders);
|
||||
final List<MagicHeader> tarHeaders = new ArrayList<>();
|
||||
tarHeaders.add(new SimpleMagicHeader("application/tar", new byte[]{0x75, 0x73, 0x74, 0x61, 0x72}, 257));
|
||||
this.tarMagicHeaders = Collections.unmodifiableList(tarHeaders);
|
||||
// Create list of detectors, preferring our custom detectors first
|
||||
List<Detector> detectors = new ArrayList<>();
|
||||
detectors.add(getFlowFileV3Detector());
|
||||
detectors.add(getFlowFileV1Detector());
|
||||
detectors.addAll(ddetector.getDetectors());
|
||||
|
||||
// determine the max length that we need to buffer for magic headers
|
||||
int max = 0;
|
||||
for (final MagicHeader header : magicHeaders) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
for (final MagicHeader header : zipMagicHeaders) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
for (final MagicHeader header : tarMagicHeaders) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
this.magicHeaderMaxLength = max;
|
||||
|
||||
// create list of Content Scanners
|
||||
final List<ContentScanningMimeTypeIdentifier> scanningIdentifiers = new ArrayList<>();
|
||||
scanningIdentifiers.add(new ZipIdentifier());
|
||||
scanningIdentifiers.add(new TarIdentifier());
|
||||
this.contentScanners = Collections.unmodifiableList(scanningIdentifiers);
|
||||
CompositeDetector compositeDetector = new CompositeDetector(detectors);
|
||||
this.detector = compositeDetector;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void init(final ProcessorInitializationContext context) {
|
||||
final Set<Relationship> relationships = new HashSet<>();
|
||||
relationships.add(REL_SUCCESS);
|
||||
this.relationships = Collections.unmodifiableSet(relationships);
|
||||
|
||||
final List<PropertyDescriptor> properties = new ArrayList<>();
|
||||
properties.add(IDENTIFY_ZIP);
|
||||
properties.add(IDENTIFY_TAR);
|
||||
this.properties = Collections.unmodifiableList(properties);
|
||||
final Set<Relationship> rels = new HashSet<>();
|
||||
rels.add(REL_SUCCESS);
|
||||
this.relationships = Collections.unmodifiableSet(rels);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -226,11 +120,6 @@ public class IdentifyMimeType extends AbstractProcessor {
|
||||
return relationships;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTrigger(final ProcessContext context, final ProcessSession session) {
|
||||
FlowFile flowFile = session.get();
|
||||
@ -239,87 +128,39 @@ public class IdentifyMimeType extends AbstractProcessor {
|
||||
}
|
||||
|
||||
final ProcessorLog logger = getLogger();
|
||||
final boolean identifyZip = context.getProperty(IDENTIFY_ZIP).asBoolean();
|
||||
final boolean identifyTar = context.getProperty(IDENTIFY_TAR).asBoolean();
|
||||
|
||||
final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null);
|
||||
final ObjectHolder<String> extensionRef = new ObjectHolder<>(null);
|
||||
session.read(flowFile, new InputStreamCallback() {
|
||||
@Override
|
||||
public void process(final InputStream stream) throws IOException {
|
||||
try (final InputStream in = new BufferedInputStream(stream)) {
|
||||
// read in up to magicHeaderMaxLength bytes
|
||||
in.mark(magicHeaderMaxLength);
|
||||
byte[] header = new byte[magicHeaderMaxLength];
|
||||
for (int i = 0; i < header.length; i++) {
|
||||
final int next = in.read();
|
||||
if (next >= 0) {
|
||||
header[i] = (byte) next;
|
||||
} else if (i == 0) {
|
||||
header = new byte[0];
|
||||
} else {
|
||||
final byte[] newBuffer = new byte[i - 1];
|
||||
System.arraycopy(header, 0, newBuffer, 0, i - 1);
|
||||
header = newBuffer;
|
||||
break;
|
||||
}
|
||||
}
|
||||
in.reset();
|
||||
|
||||
for (final MagicHeader magicHeader : magicHeaders) {
|
||||
if (magicHeader.matches(header)) {
|
||||
mimeTypeRef.set(magicHeader.getMimeType());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!identifyZip) {
|
||||
for (final MagicHeader magicHeader : zipMagicHeaders) {
|
||||
if (magicHeader.matches(header)) {
|
||||
mimeTypeRef.set(magicHeader.getMimeType());
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!identifyTar) {
|
||||
for (final MagicHeader magicHeader : tarMagicHeaders) {
|
||||
if (magicHeader.matches(header)) {
|
||||
mimeTypeRef.set(magicHeader.getMimeType());
|
||||
return;
|
||||
}
|
||||
}
|
||||
TikaInputStream tikaStream = TikaInputStream.get(in);
|
||||
Metadata metadata = new Metadata();
|
||||
// Get mime type
|
||||
MediaType mediatype = detector.detect(tikaStream, metadata);
|
||||
mimeTypeRef.set(mediatype.toString());
|
||||
// Get common file extension
|
||||
try {
|
||||
MimeType mimetype;
|
||||
mimetype = config.getMimeRepository().forName(mediatype.toString());
|
||||
extensionRef.set(mimetype.getExtension());
|
||||
} catch (MimeTypeException ex) {
|
||||
logger.warn("MIME type detection failed: {}", new Object[]{ex.toString()});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
String mimeType = mimeTypeRef.get();
|
||||
if (mimeType == null) {
|
||||
for (final ContentScanningMimeTypeIdentifier scanningIdentifier : this.contentScanners) {
|
||||
if (scanningIdentifier.isEnabled(context)) {
|
||||
session.read(flowFile, new InputStreamCallback() {
|
||||
@Override
|
||||
public void process(final InputStream in) throws IOException {
|
||||
String mimeType = scanningIdentifier.getMimeType(in);
|
||||
if (mimeType != null) {
|
||||
mimeTypeRef.set(mimeType);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if (mimeTypeRef.get() != null) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mimeType = mimeTypeRef.get();
|
||||
String extension = extensionRef.get();
|
||||
if (mimeType == null) {
|
||||
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
|
||||
flowFile = session.putAttribute(flowFile, "mime.extension", "");
|
||||
logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile});
|
||||
} else {
|
||||
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
|
||||
flowFile = session.putAttribute(flowFile, "mime.extension", extension);
|
||||
logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType});
|
||||
}
|
||||
|
||||
@ -327,148 +168,41 @@ public class IdentifyMimeType extends AbstractProcessor {
|
||||
session.transfer(flowFile, REL_SUCCESS);
|
||||
}
|
||||
|
||||
private static interface ContentScanningMimeTypeIdentifier {
|
||||
|
||||
boolean isEnabled(ProcessContext context);
|
||||
|
||||
String getMimeType(InputStream in) throws IOException;
|
||||
private Detector getFlowFileV3Detector() {
|
||||
return new MagicDetector(FLOWFILE_V3, FlowFilePackagerV3.MAGIC_HEADER);
|
||||
}
|
||||
|
||||
private static class ZipIdentifier implements ContentScanningMimeTypeIdentifier {
|
||||
private Detector getFlowFileV1Detector() {
|
||||
return new FlowFileV1Detector();
|
||||
}
|
||||
|
||||
private class FlowFileV1Detector implements Detector {
|
||||
|
||||
@Override
|
||||
public String getMimeType(final InputStream in) throws IOException {
|
||||
final ZipInputStream zipIn = new ZipInputStream(in);
|
||||
try {
|
||||
if (zipIn.getNextEntry() != null) {
|
||||
return "application/zip";
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
public MediaType detect(InputStream in, Metadata mtdt) throws IOException {
|
||||
// Sanity check the stream. This may not be a tarfile at all
|
||||
in.mark(FlowFilePackagerV1.FILENAME_ATTRIBUTES.length());
|
||||
byte[] bytes = new byte[FlowFilePackagerV1.FILENAME_ATTRIBUTES.length()];
|
||||
in.read(bytes);
|
||||
in.reset();
|
||||
|
||||
// Quick exit if the first filename is not correct
|
||||
if (!Arrays.equals(bytes, FlowFilePackagerV1.FILENAME_ATTRIBUTES.getBytes())) {
|
||||
return MediaType.OCTET_STREAM;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEnabled(final ProcessContext context) {
|
||||
return context.getProperty(IDENTIFY_ZIP).asBoolean();
|
||||
}
|
||||
}
|
||||
|
||||
private static class TarIdentifier implements ContentScanningMimeTypeIdentifier {
|
||||
|
||||
@Override
|
||||
public String getMimeType(final InputStream in) throws IOException {
|
||||
try (final TarArchiveInputStream tarIn = new TarArchiveInputStream(in)) {
|
||||
final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
|
||||
if (firstEntry != null) {
|
||||
if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
|
||||
final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
|
||||
if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
|
||||
return "application/flowfile-v1";
|
||||
}
|
||||
// More in-depth detection
|
||||
final TarArchiveInputStream tarIn = new TarArchiveInputStream(in);
|
||||
final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
|
||||
if (firstEntry != null) {
|
||||
if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
|
||||
final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
|
||||
if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
|
||||
return FLOWFILE_V1;
|
||||
}
|
||||
return "application/tar";
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEnabled(final ProcessContext context) {
|
||||
return context.getProperty(IDENTIFY_TAR).asBoolean();
|
||||
}
|
||||
}
|
||||
|
||||
private static interface MagicHeader {
|
||||
|
||||
int getRequiredBufferLength();
|
||||
|
||||
String getMimeType();
|
||||
|
||||
boolean matches(final byte[] header);
|
||||
}
|
||||
|
||||
private static class SimpleMagicHeader implements MagicHeader {
|
||||
|
||||
private final String mimeType;
|
||||
private final int offset;
|
||||
private final byte[] byteSequence;
|
||||
|
||||
public SimpleMagicHeader(final String mimeType, final byte[] byteSequence) {
|
||||
this(mimeType, byteSequence, 0);
|
||||
}
|
||||
|
||||
public SimpleMagicHeader(final String mimeType, final byte[] byteSequence, final int offset) {
|
||||
this.mimeType = mimeType;
|
||||
this.byteSequence = byteSequence;
|
||||
this.offset = offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getRequiredBufferLength() {
|
||||
return byteSequence.length + offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getMimeType() {
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(final byte[] header) {
|
||||
if (header.length < getRequiredBufferLength()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < byteSequence.length; i++) {
|
||||
if (byteSequence[i] != header[offset + i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return MediaType.OCTET_STREAM;
|
||||
}
|
||||
}
|
||||
|
||||
private static class CompoundMagicHeader implements MagicHeader {
|
||||
|
||||
private final MagicHeader[] headers;
|
||||
private final int requiredLength;
|
||||
private final String mimeType;
|
||||
|
||||
public CompoundMagicHeader(final String mimeType, final MagicHeader... headers) {
|
||||
this.mimeType = mimeType;
|
||||
this.headers = headers;
|
||||
|
||||
int max = 0;
|
||||
for (final MagicHeader header : headers) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
|
||||
this.requiredLength = max;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getRequiredBufferLength() {
|
||||
return requiredLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getMimeType() {
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(final byte[] header) {
|
||||
for (final MagicHeader mh : headers) {
|
||||
if (!mh.matches(header)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -57,63 +57,16 @@ public class TestIdentifyMimeType {
|
||||
|
||||
final Map<String, String> expectedMimeTypes = new HashMap<>();
|
||||
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
|
||||
expectedMimeTypes.put("1.mdb", "application/msaccess");
|
||||
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
|
||||
expectedMimeTypes.put("1.mdb", "application/x-msaccess");
|
||||
expectedMimeTypes.put("1.txt", "text/plain");
|
||||
expectedMimeTypes.put("1.txt.bz2", "application/x-bzip2");
|
||||
expectedMimeTypes.put("1.txt.gz", "application/gzip");
|
||||
expectedMimeTypes.put("1.zip", "application/zip");
|
||||
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
|
||||
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
|
||||
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpeg");
|
||||
expectedMimeTypes.put("1.pdf", "application/pdf");
|
||||
expectedMimeTypes.put("grid.gif", "image/gif");
|
||||
expectedMimeTypes.put("1.tar", "application/octet-stream"); //wrong ID without IDENTIFY_TAR
|
||||
expectedMimeTypes.put("1.jar", "application/java-archive");
|
||||
expectedMimeTypes.put("1.xml", "application/xml");
|
||||
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
||||
expectedMimeTypes.put("flowfilev1.tar", "application/tar"); //wrong ID without IDENTIFY_TAR
|
||||
|
||||
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
|
||||
for (final MockFlowFile file : filesOut) {
|
||||
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
|
||||
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
|
||||
final String expected = expectedMimeTypes.get(filename);
|
||||
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilesWithIdentifyTarAndZip() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
|
||||
runner.setProperty(IdentifyMimeType.IDENTIFY_TAR.getName(), "true");
|
||||
runner.setProperty(IdentifyMimeType.IDENTIFY_ZIP.getName(), "true");
|
||||
|
||||
final File dir = new File("src/test/resources/TestIdentifyMimeType");
|
||||
final File[] files = dir.listFiles();
|
||||
int fileCount = 0;
|
||||
for (final File file : files) {
|
||||
if (file.isDirectory()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
runner.enqueue(file.toPath());
|
||||
fileCount++;
|
||||
}
|
||||
|
||||
runner.setThreadCount(4);
|
||||
runner.run(fileCount);
|
||||
|
||||
runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount);
|
||||
|
||||
final Map<String, String> expectedMimeTypes = new HashMap<>();
|
||||
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
|
||||
expectedMimeTypes.put("1.mdb", "application/msaccess");
|
||||
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
|
||||
expectedMimeTypes.put("1.txt.gz", "application/gzip");
|
||||
expectedMimeTypes.put("1.zip", "application/zip");
|
||||
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
|
||||
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
|
||||
expectedMimeTypes.put("1.pdf", "application/pdf");
|
||||
expectedMimeTypes.put("grid.gif", "image/gif");
|
||||
expectedMimeTypes.put("1.tar", "application/tar");
|
||||
expectedMimeTypes.put("1.tar", "application/x-tar");
|
||||
expectedMimeTypes.put("1.jar", "application/java-archive");
|
||||
expectedMimeTypes.put("1.xml", "application/xml");
|
||||
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
||||
@ -127,5 +80,4 @@ public class TestIdentifyMimeType {
|
||||
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
Binary file not shown.
@ -0,0 +1 @@
|
||||
Test IdentifyMimeType
|
Loading…
x
Reference in New Issue
Block a user