mirror of https://github.com/apache/nifi.git
Merge branch 'NIFI-296-extend-IdentifyMimeType' of https://github.com/adamonduty/incubator-nifi into develop
This commit is contained in:
commit
ee795a7d01
|
@ -163,5 +163,10 @@
|
|||
<artifactId>nifi-ssl-context-service</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-core</artifactId>
|
||||
<version>1.7</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
|
|
@ -19,14 +19,10 @@ package org.apache.nifi.processors.standard;
|
|||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import org.apache.nifi.components.PropertyDescriptor;
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||
import org.apache.nifi.logging.ProcessorLog;
|
||||
|
@ -41,53 +37,29 @@ import org.apache.nifi.annotation.behavior.SideEffectFree;
|
|||
import org.apache.nifi.annotation.behavior.SupportsBatching;
|
||||
import org.apache.nifi.annotation.documentation.Tags;
|
||||
import org.apache.nifi.processor.io.InputStreamCallback;
|
||||
import org.apache.nifi.util.FlowFilePackagerV1;
|
||||
import org.apache.nifi.util.FlowFilePackagerV3;
|
||||
import org.apache.nifi.util.ObjectHolder;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.detect.Detector;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.mime.MimeType;
|
||||
import org.apache.tika.mime.MimeTypeException;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Attempts to detect the MIME Type of a FlowFile by examining its contents. If
|
||||
* the MIME Type is determined, it is added to an attribute with the name
|
||||
* mime.type
|
||||
* mime.type. In addition, mime.extension is set if a common file extension is known.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The following MIME Types are supported:
|
||||
* MIME Type detection is performed by Apache Tika; more information about
|
||||
* detection is available at http://tika.apache.org.
|
||||
*
|
||||
* <ul>
|
||||
* <li>application/gzip</li>
|
||||
* <li>application/bzip2</li>
|
||||
* <li>application/flowfile-v3</li>
|
||||
* <li>application/flowfile-v1 (requires Identify TAR be set to true)</li>
|
||||
* <li>application/xml</li>
|
||||
* <li>video/mp4</li>
|
||||
* <li>video/x-m4v</li>
|
||||
* <li>video/mp4a-latm</li>
|
||||
* <li>video/quicktime</li>
|
||||
* <li>video/mpeg</li>
|
||||
* <li>audio/wav</li>
|
||||
* <li>audio/mp3</li>
|
||||
* <li>image/bmp</li>
|
||||
* <li>image/png</li>
|
||||
* <li>image/jpg</li>
|
||||
* <li>image/gif</li>
|
||||
* <li>image/tif</li>
|
||||
* <li>application/vnd.ms-works</li>
|
||||
* <li>application/msexcel</li>
|
||||
* <li>application/mspowerpoint</li>
|
||||
* <li>application/msaccess</li>
|
||||
* <li>application/x-ms-wmv</li>
|
||||
* <li>application/pdf</li>
|
||||
* <li>application/x-rpm</li>
|
||||
* <li>application/tar</li>
|
||||
* <li>application/x-7z-compressed</li>
|
||||
* <li>application/java-archive</li>
|
||||
* <li>application/zip</li>
|
||||
* <li>application/x-lzh</li>
|
||||
* <li>application/flowfile-v1</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
|
@ -97,128 +69,31 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
|||
@Tags({"compression", "gzip", "bzip2", "zip", "MIME", "mime.type", "file", "identify"})
|
||||
@CapabilityDescription("Attempts to identify the MIME Type used for a FlowFile. If the MIME Type can be identified, "
|
||||
+ "an attribute with the name 'mime.type' is added with the value being the MIME Type. If the MIME Type cannot be determined, "
|
||||
+ "the value will be set to 'application/octet-stream'. Some MIME Types require reading a significant amount of data; for these MIME Types, their identification "
|
||||
+ "is optional. The algorithm may have to read the entire contents of the file for each type of identification.")
|
||||
+ "the value will be set to 'application/octet-stream'. In addition, the attribute mime.extension will be set if a common file "
|
||||
+ "extension for the MIME Type is known.")
|
||||
public class IdentifyMimeType extends AbstractProcessor {
|
||||
|
||||
public static final PropertyDescriptor IDENTIFY_ZIP = new PropertyDescriptor.Builder()
|
||||
.name("Identify ZIP")
|
||||
.description("Determines whether or not to attempt in depth identification of ZIP MIME types")
|
||||
.required(true)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("false")
|
||||
.build();
|
||||
public static final PropertyDescriptor IDENTIFY_TAR = new PropertyDescriptor.Builder()
|
||||
.name("Identify TAR")
|
||||
.description("Determines whether or not to attempt in depth identification of TAR MIME types")
|
||||
.required(true)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("false")
|
||||
.build();
|
||||
|
||||
public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success").description("All FlowFiles are routed to success").build();
|
||||
|
||||
private Set<Relationship> relationships;
|
||||
private List<PropertyDescriptor> properties;
|
||||
public static final MediaType FLOWFILE_V1 = new MediaType("application", "flowfile-v1");
|
||||
public static final MediaType FLOWFILE_V3 = new MediaType("application", "flowfile-v3");
|
||||
|
||||
private final List<MagicHeader> magicHeaders;
|
||||
private final List<MagicHeader> zipMagicHeaders;
|
||||
private final List<MagicHeader> tarMagicHeaders;
|
||||
private final List<ContentScanningMimeTypeIdentifier> contentScanners;
|
||||
private final int magicHeaderMaxLength;
|
||||
private Set<Relationship> relationships;
|
||||
|
||||
private final TikaConfig config;
|
||||
private final Detector detector;
|
||||
|
||||
public IdentifyMimeType() {
|
||||
// compile a list of Magic Header detectors
|
||||
final List<MagicHeader> headers = new ArrayList<>();
|
||||
headers.add(new SimpleMagicHeader("application/gzip", new byte[]{0x1f, (byte) 0x8b}));
|
||||
headers.add(new SimpleMagicHeader("application/bzip2", new byte[]{0x42, 0x5a}));
|
||||
headers.add(new SimpleMagicHeader("application/flowfile-v3", FlowFilePackagerV3.MAGIC_HEADER));
|
||||
headers.add(new SimpleMagicHeader("application/xml", new byte[]{0x3c, 0x3f, 0x78, 0x6d, 0x6c, 0x20}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x69, 0x73, 0x6F, 0x6D}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x33, 0x67, 0x70, 0x35}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0X4d, 0X53, 0X4e, 0X56, 0X01, 0X29, 0, 0X46, 0X4d, 0X53, 0X4e, 0X56, 0X6d, 0X70, 0X34, 0X32}));
|
||||
headers.add(new SimpleMagicHeader("video/x-m4v", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x6D, 0x70, 0x34, 0x32}));
|
||||
headers.add(new SimpleMagicHeader("video/mp4a-latm", new byte[]{0, 0, 0, 0x18, 0x66, 0x74, 0x79, 0x70, 0x4D, 0x34, 0x41, 0x20}));
|
||||
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0, 0, 0, 0x14, 0x66, 0x74, 0x79, 0x70, 0x71, 0x74, 0x20, 0x20}));
|
||||
headers.add(new SimpleMagicHeader("video/quicktime", new byte[]{0x6D, 0x6F, 0x6F, 0x76}, 4));
|
||||
headers.add(new SimpleMagicHeader("audio/mp3", new byte[]{0x49, 0x44, 0x33}));
|
||||
headers.add(new SimpleMagicHeader("image/bmp", new byte[]{0x42, 0x4D}));
|
||||
headers.add(new SimpleMagicHeader("image/png", new byte[]{(byte) 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}));
|
||||
headers.add(new SimpleMagicHeader("image/jpg", new byte[]{(byte) 0xFF, (byte) 0xD8, (byte) 0xFF}));
|
||||
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x37, 0x61}));
|
||||
headers.add(new SimpleMagicHeader("image/gif", new byte[]{0x47, 0x49, 0x46, 0x38, 0x39, 0x61}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x20, 0x49}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x49, 0x49, 0x2A, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2A}));
|
||||
headers.add(new SimpleMagicHeader("image/tif", new byte[]{0x4D, 0x4D, 0x00, 0x2B}));
|
||||
headers.add(new SimpleMagicHeader("application/vnd.ms-works", new byte[]{(byte) 0xFF, 0x00, 0x02, 0x00, 0x04, 0x04, 0x05, 0x54, 0x02, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("application/msexcel", new byte[]{0x09, 0x08, 0x10, 0, 0, 0x06, 0x05, 0}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x00, 0x6E, 0x1E, (byte) 0xF0}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{0x0F, 0x00, (byte) 0xE8, 0x03}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/mspowerpoint", new byte[]{(byte) 0xA0, 0x46, 0x1D, (byte) 0xF0}, 512));
|
||||
headers.add(new CompoundMagicHeader("application/mspowerpoint",
|
||||
new SimpleMagicHeader("", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}, 512),
|
||||
new SimpleMagicHeader("", new byte[]{0x00, 0x00, 0x00}, 517)));
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x41, 0x43, 0x45, 0x20, 0x44, 0x42}));
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{0x00, 0x01, 0x00, 0x00, 0x53, 0x74, 0x61, 0x6E, 0x64, 0x61, 0x72, 0x64, 0x20, 0x4A, 0x65, 0x74, 0x20, 0x44, 0x42}));
|
||||
for (byte b : new byte[]{0x10, 0x1F, 0x22, 0x23, 0x28, 0x29}) {
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x00}, 512));
|
||||
headers.add(new SimpleMagicHeader("application/msaccess", new byte[]{(byte) 0xFD, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, b, 0x02}, 512));
|
||||
}
|
||||
headers.add(new SimpleMagicHeader("application/x-ms-wmv", new byte[]{0x30, 0x26, (byte) 0xB2, 0x75, (byte) 0x8E, 0x66, (byte) 0xCF, 0x11, (byte) 0xA6, (byte) 0xD9, 0x00, (byte) 0xAA, 0x00, 0x62, (byte) 0xCE, 0x6C}));
|
||||
headers.add(new SimpleMagicHeader("application/pdf", new byte[]{0x25, 0x50, 0x44, 0x46}));
|
||||
headers.add(new SimpleMagicHeader("application/x-rpm", new byte[]{(byte) 0xED, (byte) 0xAB, (byte) 0xEE, (byte) 0xDB}));
|
||||
headers.add(new SimpleMagicHeader("application/x-7z-compressed", new byte[]{0x37, 0x7A, (byte) 0xBC, (byte) 0xAF, 0x27, 0x1C}));
|
||||
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x4A, 0x41, 0x52, 0x43, 0x53, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x08}));
|
||||
headers.add(new SimpleMagicHeader("application/java-archive", new byte[]{0x50, 0x4B, 0x03, 0x04, (byte) 0xA0, 0x00, 0x00}));
|
||||
headers.add(new SimpleMagicHeader("application/x-lzh", new byte[]{0x2D, 0x6C, 0x68}, 2));
|
||||
headers.add(new CompoundMagicHeader("audio/wav",
|
||||
new SimpleMagicHeader("", new byte[]{0x52, 0x49, 0x46, 0x46}),
|
||||
new SimpleMagicHeader("", new byte[]{0x57, 0x41, 0x56, 0x45, 0x66, 0x6D, 0x74, 0x20}, 8)));
|
||||
for (int nibble = 0xB0; nibble <= 0xBF; nibble++) {
|
||||
headers.add(new SimpleMagicHeader("video/mpeg", new byte[]{0x00, 0x00, 0x01, (byte) nibble}));
|
||||
}
|
||||
this.magicHeaders = Collections.unmodifiableList(headers);
|
||||
|
||||
// additional Magic Header detectors that will be turned off based on property settings
|
||||
final List<MagicHeader> zipHeaders = new ArrayList<>();
|
||||
zipHeaders.add(new SimpleMagicHeader("application/zip", new byte[]{0x50, 0x4B, 0x03, 0x04}));
|
||||
this.zipMagicHeaders = Collections.unmodifiableList(zipHeaders);
|
||||
final List<MagicHeader> tarHeaders = new ArrayList<>();
|
||||
tarHeaders.add(new SimpleMagicHeader("application/tar", new byte[]{0x75, 0x73, 0x74, 0x61, 0x72}, 257));
|
||||
this.tarMagicHeaders = Collections.unmodifiableList(tarHeaders);
|
||||
|
||||
// determine the max length that we need to buffer for magic headers
|
||||
int max = 0;
|
||||
for (final MagicHeader header : magicHeaders) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
for (final MagicHeader header : zipMagicHeaders) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
for (final MagicHeader header : tarMagicHeaders) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
this.magicHeaderMaxLength = max;
|
||||
|
||||
// create list of Content Scanners
|
||||
final List<ContentScanningMimeTypeIdentifier> scanningIdentifiers = new ArrayList<>();
|
||||
scanningIdentifiers.add(new ZipIdentifier());
|
||||
scanningIdentifiers.add(new TarIdentifier());
|
||||
this.contentScanners = Collections.unmodifiableList(scanningIdentifiers);
|
||||
// Setup Tika
|
||||
this.config = TikaConfig.getDefaultConfig();
|
||||
this.detector = config.getDetector();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void init(final ProcessorInitializationContext context) {
|
||||
final Set<Relationship> relationships = new HashSet<>();
|
||||
relationships.add(REL_SUCCESS);
|
||||
this.relationships = Collections.unmodifiableSet(relationships);
|
||||
|
||||
final List<PropertyDescriptor> properties = new ArrayList<>();
|
||||
properties.add(IDENTIFY_ZIP);
|
||||
properties.add(IDENTIFY_TAR);
|
||||
this.properties = Collections.unmodifiableList(properties);
|
||||
final Set<Relationship> rels = new HashSet<>();
|
||||
rels.add(REL_SUCCESS);
|
||||
this.relationships = Collections.unmodifiableSet(rels);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -226,11 +101,6 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
return relationships;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTrigger(final ProcessContext context, final ProcessSession session) {
|
||||
FlowFile flowFile = session.get();
|
||||
|
@ -239,236 +109,47 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
}
|
||||
|
||||
final ProcessorLog logger = getLogger();
|
||||
final boolean identifyZip = context.getProperty(IDENTIFY_ZIP).asBoolean();
|
||||
final boolean identifyTar = context.getProperty(IDENTIFY_TAR).asBoolean();
|
||||
|
||||
final ObjectHolder<String> mimeTypeRef = new ObjectHolder<>(null);
|
||||
|
||||
session.read(flowFile, new InputStreamCallback() {
|
||||
@Override
|
||||
public void process(final InputStream stream) throws IOException {
|
||||
try (final InputStream in = new BufferedInputStream(stream)) {
|
||||
// read in up to magicHeaderMaxLength bytes
|
||||
in.mark(magicHeaderMaxLength);
|
||||
byte[] header = new byte[magicHeaderMaxLength];
|
||||
for (int i = 0; i < header.length; i++) {
|
||||
final int next = in.read();
|
||||
if (next >= 0) {
|
||||
header[i] = (byte) next;
|
||||
} else if (i == 0) {
|
||||
header = new byte[0];
|
||||
} else {
|
||||
final byte[] newBuffer = new byte[i - 1];
|
||||
System.arraycopy(header, 0, newBuffer, 0, i - 1);
|
||||
header = newBuffer;
|
||||
break;
|
||||
}
|
||||
}
|
||||
in.reset();
|
||||
|
||||
for (final MagicHeader magicHeader : magicHeaders) {
|
||||
if (magicHeader.matches(header)) {
|
||||
mimeTypeRef.set(magicHeader.getMimeType());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (!identifyZip) {
|
||||
for (final MagicHeader magicHeader : zipMagicHeaders) {
|
||||
if (magicHeader.matches(header)) {
|
||||
mimeTypeRef.set(magicHeader.getMimeType());
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!identifyTar) {
|
||||
for (final MagicHeader magicHeader : tarMagicHeaders) {
|
||||
if (magicHeader.matches(header)) {
|
||||
mimeTypeRef.set(magicHeader.getMimeType());
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
TikaInputStream tikaStream = TikaInputStream.get(in);
|
||||
Metadata metadata = new Metadata();
|
||||
// Get mime type
|
||||
MediaType mediatype = detector.detect(tikaStream, metadata);
|
||||
mimeTypeRef.set(mediatype.toString());
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
String mimeType = mimeTypeRef.get();
|
||||
if (mimeType == null) {
|
||||
for (final ContentScanningMimeTypeIdentifier scanningIdentifier : this.contentScanners) {
|
||||
if (scanningIdentifier.isEnabled(context)) {
|
||||
session.read(flowFile, new InputStreamCallback() {
|
||||
@Override
|
||||
public void process(final InputStream in) throws IOException {
|
||||
String mimeType = scanningIdentifier.getMimeType(in);
|
||||
if (mimeType != null) {
|
||||
mimeTypeRef.set(mimeType);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if (mimeTypeRef.get() != null) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
String extension = "";
|
||||
try {
|
||||
MimeType mimetype;
|
||||
mimetype = config.getMimeRepository().forName(mimeType);
|
||||
extension = mimetype.getExtension();
|
||||
} catch (MimeTypeException ex) {
|
||||
logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
|
||||
}
|
||||
|
||||
// Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
|
||||
if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
|
||||
extension = ".gz";
|
||||
}
|
||||
|
||||
mimeType = mimeTypeRef.get();
|
||||
if (mimeType == null) {
|
||||
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
|
||||
flowFile = session.putAttribute(flowFile, "mime.extension", "");
|
||||
logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile});
|
||||
} else {
|
||||
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
|
||||
flowFile = session.putAttribute(flowFile, "mime.extension", extension);
|
||||
logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType});
|
||||
}
|
||||
|
||||
session.getProvenanceReporter().modifyAttributes(flowFile);
|
||||
session.transfer(flowFile, REL_SUCCESS);
|
||||
}
|
||||
|
||||
private static interface ContentScanningMimeTypeIdentifier {
|
||||
|
||||
boolean isEnabled(ProcessContext context);
|
||||
|
||||
String getMimeType(InputStream in) throws IOException;
|
||||
}
|
||||
|
||||
private static class ZipIdentifier implements ContentScanningMimeTypeIdentifier {
|
||||
|
||||
@Override
|
||||
public String getMimeType(final InputStream in) throws IOException {
|
||||
final ZipInputStream zipIn = new ZipInputStream(in);
|
||||
try {
|
||||
if (zipIn.getNextEntry() != null) {
|
||||
return "application/zip";
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEnabled(final ProcessContext context) {
|
||||
return context.getProperty(IDENTIFY_ZIP).asBoolean();
|
||||
}
|
||||
}
|
||||
|
||||
private static class TarIdentifier implements ContentScanningMimeTypeIdentifier {
|
||||
|
||||
@Override
|
||||
public String getMimeType(final InputStream in) throws IOException {
|
||||
try (final TarArchiveInputStream tarIn = new TarArchiveInputStream(in)) {
|
||||
final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
|
||||
if (firstEntry != null) {
|
||||
if (firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
|
||||
final TarArchiveEntry secondEntry = tarIn.getNextTarEntry();
|
||||
if (secondEntry != null && secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
|
||||
return "application/flowfile-v1";
|
||||
}
|
||||
}
|
||||
return "application/tar";
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEnabled(final ProcessContext context) {
|
||||
return context.getProperty(IDENTIFY_TAR).asBoolean();
|
||||
}
|
||||
}
|
||||
|
||||
private static interface MagicHeader {
|
||||
|
||||
int getRequiredBufferLength();
|
||||
|
||||
String getMimeType();
|
||||
|
||||
boolean matches(final byte[] header);
|
||||
}
|
||||
|
||||
private static class SimpleMagicHeader implements MagicHeader {
|
||||
|
||||
private final String mimeType;
|
||||
private final int offset;
|
||||
private final byte[] byteSequence;
|
||||
|
||||
public SimpleMagicHeader(final String mimeType, final byte[] byteSequence) {
|
||||
this(mimeType, byteSequence, 0);
|
||||
}
|
||||
|
||||
public SimpleMagicHeader(final String mimeType, final byte[] byteSequence, final int offset) {
|
||||
this.mimeType = mimeType;
|
||||
this.byteSequence = byteSequence;
|
||||
this.offset = offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getRequiredBufferLength() {
|
||||
return byteSequence.length + offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getMimeType() {
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(final byte[] header) {
|
||||
if (header.length < getRequiredBufferLength()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < byteSequence.length; i++) {
|
||||
if (byteSequence[i] != header[offset + i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private static class CompoundMagicHeader implements MagicHeader {
|
||||
|
||||
private final MagicHeader[] headers;
|
||||
private final int requiredLength;
|
||||
private final String mimeType;
|
||||
|
||||
public CompoundMagicHeader(final String mimeType, final MagicHeader... headers) {
|
||||
this.mimeType = mimeType;
|
||||
this.headers = headers;
|
||||
|
||||
int max = 0;
|
||||
for (final MagicHeader header : headers) {
|
||||
max = Math.max(max, header.getRequiredBufferLength());
|
||||
}
|
||||
|
||||
this.requiredLength = max;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getRequiredBufferLength() {
|
||||
return requiredLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getMimeType() {
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(final byte[] header) {
|
||||
for (final MagicHeader mh : headers) {
|
||||
if (!mh.matches(header)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<mime-info>
|
||||
|
||||
<mime-type type="application/flowfile-v1">
|
||||
<_comment>NiFi FlowFile V1</_comment>
|
||||
<sub-class-of type="application/x-tar"/>
|
||||
<magic>
|
||||
<match value="flowfile.attributes" type="string" offset="0" />
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/flowfile-v3">
|
||||
<_comment>NiFi FlowFile V3</_comment>
|
||||
<magic>
|
||||
<match value="NiFiFF3" type="string" offset="0" />
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet">
|
||||
<_comment>Office Open XML Workbook</_comment>
|
||||
<glob pattern="*.xlsx"/>
|
||||
<sub-class-of type="application/x-tika-ooxml"/>
|
||||
<magic priority="60">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="[Content_Types].xml" type="string" offset="30">
|
||||
<match value="xl/_rels/workbook.xml.rels" type="string" offset="30:4096"/>
|
||||
</match>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document">
|
||||
<_comment>Office Open XML Document</_comment>
|
||||
<glob pattern="*.docx"/>
|
||||
<sub-class-of type="application/x-tika-ooxml"/>
|
||||
<magic priority="60">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="[Content_Types].xml" type="string" offset="30">
|
||||
<match value="word/_rels/document.xml.rels" type="string" offset="30:4096"/>
|
||||
</match>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
|
||||
<_comment>Office Open XML Presentation</_comment>
|
||||
<glob pattern="*.pptx"/>
|
||||
<glob pattern="*.thmx"/>
|
||||
<sub-class-of type="application/x-tika-ooxml"/>
|
||||
<magic priority="60">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="[Content_Types].xml" type="string" offset="30">
|
||||
<match value="ppt/slides/_rels/slide" type="string" offset="30:4096"/>
|
||||
</match>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<mime-type type="application/java-archive">
|
||||
<_comment>Java Archive</_comment>
|
||||
<tika:link>http://en.wikipedia.org/wiki/.jar</tika:link>
|
||||
<tika:uti>com.sun.java-archive</tika:uti>
|
||||
<sub-class-of type="application/zip"/>
|
||||
<glob pattern="*.jar"/>
|
||||
<magic priority="50">
|
||||
<match value="PK\003\004" type="string" offset="0">
|
||||
<match value="META-INF/MANIFEST.MF" type="string" offset="0:1024"/>
|
||||
</match>
|
||||
</magic>
|
||||
</mime-type>
|
||||
|
||||
<!-- Override tika's default behavior for GNU tar detection because nobody calls
|
||||
a GNU tar a .gtar -->
|
||||
<mime-type type="application/x-tar">
|
||||
<_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
|
||||
<magic priority="60">
|
||||
<!-- GNU tar archive -->
|
||||
<match value="ustar \0" type="string" offset="257" />
|
||||
</magic>
|
||||
<glob pattern="*.tar"/>
|
||||
</mime-type>
|
||||
|
||||
</mime-info>
|
|
@ -57,75 +57,59 @@ public class TestIdentifyMimeType {
|
|||
|
||||
final Map<String, String> expectedMimeTypes = new HashMap<>();
|
||||
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
|
||||
expectedMimeTypes.put("1.mdb", "application/msaccess");
|
||||
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
|
||||
expectedMimeTypes.put("1.mdb", "application/x-msaccess");
|
||||
expectedMimeTypes.put("1.txt", "text/plain");
|
||||
expectedMimeTypes.put("1.txt.bz2", "application/x-bzip2");
|
||||
expectedMimeTypes.put("1.txt.gz", "application/gzip");
|
||||
expectedMimeTypes.put("1.zip", "application/zip");
|
||||
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
|
||||
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
|
||||
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpeg");
|
||||
expectedMimeTypes.put("1.pdf", "application/pdf");
|
||||
expectedMimeTypes.put("grid.gif", "image/gif");
|
||||
expectedMimeTypes.put("1.tar", "application/octet-stream"); //wrong ID without IDENTIFY_TAR
|
||||
expectedMimeTypes.put("1.jar", "application/java-archive");
|
||||
expectedMimeTypes.put("1.xml", "application/xml");
|
||||
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
||||
expectedMimeTypes.put("flowfilev1.tar", "application/tar"); //wrong ID without IDENTIFY_TAR
|
||||
|
||||
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
|
||||
for (final MockFlowFile file : filesOut) {
|
||||
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
|
||||
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
|
||||
final String expected = expectedMimeTypes.get(filename);
|
||||
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilesWithIdentifyTarAndZip() throws IOException {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
|
||||
runner.setProperty(IdentifyMimeType.IDENTIFY_TAR.getName(), "true");
|
||||
runner.setProperty(IdentifyMimeType.IDENTIFY_ZIP.getName(), "true");
|
||||
|
||||
final File dir = new File("src/test/resources/TestIdentifyMimeType");
|
||||
final File[] files = dir.listFiles();
|
||||
int fileCount = 0;
|
||||
for (final File file : files) {
|
||||
if (file.isDirectory()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
runner.enqueue(file.toPath());
|
||||
fileCount++;
|
||||
}
|
||||
|
||||
runner.setThreadCount(4);
|
||||
runner.run(fileCount);
|
||||
|
||||
runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, fileCount);
|
||||
|
||||
final Map<String, String> expectedMimeTypes = new HashMap<>();
|
||||
expectedMimeTypes.put("1.7z", "application/x-7z-compressed");
|
||||
expectedMimeTypes.put("1.mdb", "application/msaccess");
|
||||
expectedMimeTypes.put("1.txt.bz2", "application/bzip2");
|
||||
expectedMimeTypes.put("1.txt.gz", "application/gzip");
|
||||
expectedMimeTypes.put("1.zip", "application/zip");
|
||||
expectedMimeTypes.put("bgBannerFoot.png", "image/png");
|
||||
expectedMimeTypes.put("blueBtnBg.jpg", "image/jpg");
|
||||
expectedMimeTypes.put("1.pdf", "application/pdf");
|
||||
expectedMimeTypes.put("grid.gif", "image/gif");
|
||||
expectedMimeTypes.put("1.tar", "application/tar");
|
||||
expectedMimeTypes.put("1.tar", "application/x-tar");
|
||||
expectedMimeTypes.put("1.tar.gz", "application/gzip");
|
||||
expectedMimeTypes.put("1.jar", "application/java-archive");
|
||||
expectedMimeTypes.put("1.xml", "application/xml");
|
||||
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
||||
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
|
||||
// Office documents below randomly selected from govdocs1:
|
||||
// http://digitalcorpora.org/corpora/govdocs
|
||||
expectedMimeTypes.put("651924.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
||||
expectedMimeTypes.put("528206.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
|
||||
expectedMimeTypes.put("392790.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
|
||||
|
||||
final Map<String, String> expectedExtensions = new HashMap<>();
|
||||
expectedExtensions.put("1.7z", ".7z");
|
||||
expectedExtensions.put("1.mdb", ".mdb");
|
||||
expectedExtensions.put("1.txt", ".txt");
|
||||
expectedExtensions.put("1.txt.bz2", ".bz2");
|
||||
expectedExtensions.put("1.txt.gz", ".gz");
|
||||
expectedExtensions.put("1.zip", ".zip");
|
||||
expectedExtensions.put("bgBannerFoot.png", ".png");
|
||||
expectedExtensions.put("blueBtnBg.jpg", ".jpg");
|
||||
expectedExtensions.put("1.pdf", ".pdf");
|
||||
expectedExtensions.put("grid.gif", ".gif");
|
||||
expectedExtensions.put("1.tar", ".tar");
|
||||
expectedExtensions.put("1.tar.gz", ".gz");
|
||||
expectedExtensions.put("1.jar", ".jar");
|
||||
expectedExtensions.put("1.xml", ".xml");
|
||||
expectedExtensions.put("flowfilev3", "");
|
||||
expectedExtensions.put("flowfilev1.tar", "");
|
||||
expectedExtensions.put("651924.docx", ".docx");
|
||||
expectedExtensions.put("528206.xlsx", ".xlsx");
|
||||
expectedExtensions.put("392790.pptx", ".pptx");
|
||||
|
||||
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
|
||||
for (final MockFlowFile file : filesOut) {
|
||||
final String filename = file.getAttribute(CoreAttributes.FILENAME.key());
|
||||
final String mimeType = file.getAttribute(CoreAttributes.MIME_TYPE.key());
|
||||
final String expected = expectedMimeTypes.get(filename);
|
||||
|
||||
final String extension = file.getAttribute("mime.extension");
|
||||
final String expectedExtension = expectedExtensions.get(filename);
|
||||
|
||||
assertEquals("Expected " + file + " to have MIME Type " + expected + ", but it was " + mimeType, expected, mimeType);
|
||||
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1 @@
|
|||
Test IdentifyMimeType
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue