NIFI-4087 This closes #2026. Fix to allow exclusion of the filename from the Tika detection criteria.

This commit is contained in:
Leah Anderson 2017-07-20 19:20:54 -04:00 committed by joewitt
parent 695e8aa98f
commit 3371e915cc
3 changed files with 62 additions and 2 deletions

View File

@ -19,8 +19,10 @@ package org.apache.nifi.processors.standard;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
@ -32,6 +34,7 @@ import org.apache.nifi.annotation.behavior.SupportsBatching;
import org.apache.nifi.annotation.behavior.WritesAttribute; import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.documentation.CapabilityDescription; import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes; import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog; import org.apache.nifi.logging.ComponentLog;
@ -78,12 +81,22 @@ import org.apache.tika.mime.MimeTypeException;
+ "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream") + "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream")
public class IdentifyMimeType extends AbstractProcessor { public class IdentifyMimeType extends AbstractProcessor {
/**
 * Toggle deciding whether the FlowFile's filename is handed to Tika as a
 * detection hint. Required; restricted to "true" or "false"; defaults to "true".
 */
public static final PropertyDescriptor USE_FILENAME_IN_DETECTION =
        new PropertyDescriptor.Builder()
                // Identity: machine-readable key first, then the label for the UI.
                .name("use-filename-in-detection")
                .displayName("Use Filename In Detection")
                .description("If true will pass the filename to Tika to aid in detection.")
                // Constraints: a mandatory boolean-like choice that is on by default.
                .allowableValues("true", "false")
                .defaultValue("true")
                .required(true)
                .build();
public static final Relationship REL_SUCCESS = new Relationship.Builder() public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success") .name("success")
.description("All FlowFiles are routed to success") .description("All FlowFiles are routed to success")
.build(); .build();
private Set<Relationship> relationships; private Set<Relationship> relationships;
private List<PropertyDescriptor> properties;
private final TikaConfig config; private final TikaConfig config;
private final Detector detector; private final Detector detector;
@ -96,6 +109,11 @@ public class IdentifyMimeType extends AbstractProcessor {
@Override @Override
protected void init(final ProcessorInitializationContext context) { protected void init(final ProcessorInitializationContext context) {
final List<PropertyDescriptor> properties = new ArrayList<>();
properties.add(USE_FILENAME_IN_DETECTION);
this.properties = Collections.unmodifiableList(properties);
final Set<Relationship> rels = new HashSet<>(); final Set<Relationship> rels = new HashSet<>();
rels.add(REL_SUCCESS); rels.add(REL_SUCCESS);
this.relationships = Collections.unmodifiableSet(rels); this.relationships = Collections.unmodifiableSet(rels);
@ -106,6 +124,11 @@ public class IdentifyMimeType extends AbstractProcessor {
return relationships; return relationships;
} }
/**
 * Exposes the property descriptors this processor supports.
 *
 * @return the list stored in the {@code properties} field
 */
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
    return this.properties;
}
@Override @Override
public void onTrigger(final ProcessContext context, final ProcessSession session) { public void onTrigger(final ProcessContext context, final ProcessSession session) {
FlowFile flowFile = session.get(); FlowFile flowFile = session.get();
@ -123,8 +146,8 @@ public class IdentifyMimeType extends AbstractProcessor {
try (final InputStream in = new BufferedInputStream(stream)) { try (final InputStream in = new BufferedInputStream(stream)) {
TikaInputStream tikaStream = TikaInputStream.get(in); TikaInputStream tikaStream = TikaInputStream.get(in);
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
// Add filename if it exists
if (filename != null) { if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename); metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
} }
// Get mime type // Get mime type

View File

@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -72,6 +73,7 @@ public class TestIdentifyMimeType {
expectedMimeTypes.put("1.xml", "application/xml"); expectedMimeTypes.put("1.xml", "application/xml");
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3"); expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1"); expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
expectedMimeTypes.put("fake.csv", "text/csv");
final Map<String, String> expectedExtensions = new HashMap<>(); final Map<String, String> expectedExtensions = new HashMap<>();
expectedExtensions.put("1.7z", ".7z"); expectedExtensions.put("1.7z", ".7z");
@ -91,6 +93,7 @@ public class TestIdentifyMimeType {
expectedExtensions.put("1.xml", ".xml"); expectedExtensions.put("1.xml", ".xml");
expectedExtensions.put("flowfilev3", ""); expectedExtensions.put("flowfilev3", "");
expectedExtensions.put("flowfilev1.tar", ""); expectedExtensions.put("flowfilev1.tar", "");
expectedExtensions.put("fake.csv", ".csv");
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS); final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
for (final MockFlowFile file : filesOut) { for (final MockFlowFile file : filesOut) {
@ -105,4 +108,18 @@ public class TestIdentifyMimeType {
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension); assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
} }
} }
/**
 * With filename-based detection switched off, the {@code fake.csv} fixture
 * is expected to be reported as {@code text/plain} with a {@code .txt}
 * extension.
 */
@Test
public void testIgnoreFileName() throws Exception {
    final TestRunner testRunner = TestRunners.newTestRunner(new IdentifyMimeType());
    testRunner.setProperty(IdentifyMimeType.USE_FILENAME_IN_DETECTION, "false");

    // Feed the misleadingly named fixture through a single processor run.
    testRunner.enqueue(Paths.get("src/test/resources/TestIdentifyMimeType/fake.csv"));
    testRunner.run();
    testRunner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, 1);

    final List<MockFlowFile> transferred = testRunner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
    final MockFlowFile result = transferred.get(0);
    result.assertAttributeEquals("mime.extension", ".txt");
    result.assertAttributeEquals("mime.type", "text/plain");
}
} }

View File

@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
this is not a valid CSV file but
is intended to verify that the updated
IdentifyMIMEType works as expected.
Can't render this file because it contains an unexpected character in line 6 and column 3.