mirror of https://github.com/apache/nifi.git
NIFI-4087 This closes #2026. Fix to allow exclusion of the filename from the Tika detection criteria.
This commit is contained in:
parent
695e8aa98f
commit
3371e915cc
|
@ -19,8 +19,10 @@ package org.apache.nifi.processors.standard;
|
|||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
|
@ -32,6 +34,7 @@ import org.apache.nifi.annotation.behavior.SupportsBatching;
|
|||
import org.apache.nifi.annotation.behavior.WritesAttribute;
|
||||
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||
import org.apache.nifi.annotation.documentation.Tags;
|
||||
import org.apache.nifi.components.PropertyDescriptor;
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||
import org.apache.nifi.logging.ComponentLog;
|
||||
|
@ -78,12 +81,22 @@ import org.apache.tika.mime.MimeTypeException;
|
|||
+ "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream")
|
||||
public class IdentifyMimeType extends AbstractProcessor {
|
||||
|
||||
/**
 * Controls whether the FlowFile's filename is passed to Tika as a detection hint.
 * The property is required, accepts only "true" or "false", and defaults to "true".
 * The detection code adds the filename to the Tika metadata only when the filename
 * is non-null and this property evaluates to true.
 */
public static final PropertyDescriptor USE_FILENAME_IN_DETECTION = new PropertyDescriptor.Builder()
|
||||
.displayName("Use Filename In Detection")
|
||||
.name("use-filename-in-detection")
|
||||
.description("If true will pass the filename to Tika to aid in detection.")
|
||||
.required(true)
|
||||
.allowableValues("true", "false")
|
||||
.defaultValue("true")
|
||||
.build();
|
||||
|
||||
/** Relationship named "success"; per its description, every FlowFile is routed here. */
public static final Relationship REL_SUCCESS = new Relationship.Builder()
|
||||
.name("success")
|
||||
.description("All FlowFiles are routed to success")
|
||||
.build();
|
||||
|
||||
// Relationships exposed by this processor; assigned in init() as an unmodifiable set.
private Set<Relationship> relationships;
|
||||
// Supported property descriptors; assigned in init() as an unmodifiable list.
private List<PropertyDescriptor> properties;
|
||||
|
||||
// Tika configuration and detector. Both are final, so presumably they are assigned
// in the constructor, which is not visible in this excerpt.
private final TikaConfig config;
|
||||
private final Detector detector;
|
||||
|
@ -96,6 +109,11 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
|
||||
@Override
|
||||
protected void init(final ProcessorInitializationContext context) {
|
||||
|
||||
final List<PropertyDescriptor> properties = new ArrayList<>();
|
||||
properties.add(USE_FILENAME_IN_DETECTION);
|
||||
this.properties = Collections.unmodifiableList(properties);
|
||||
|
||||
final Set<Relationship> rels = new HashSet<>();
|
||||
rels.add(REL_SUCCESS);
|
||||
this.relationships = Collections.unmodifiableSet(rels);
|
||||
|
@ -106,6 +124,11 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
return relationships;
|
||||
}
|
||||
|
||||
/**
 * Returns the property descriptors this processor supports.
 *
 * @return the unmodifiable list assembled in init(), which contains
 *         USE_FILENAME_IN_DETECTION
 */
@Override
|
||||
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTrigger(final ProcessContext context, final ProcessSession session) {
|
||||
FlowFile flowFile = session.get();
|
||||
|
@ -123,8 +146,8 @@ public class IdentifyMimeType extends AbstractProcessor {
|
|||
try (final InputStream in = new BufferedInputStream(stream)) {
|
||||
TikaInputStream tikaStream = TikaInputStream.get(in);
|
||||
Metadata metadata = new Metadata();
|
||||
// Add filename if it exists
|
||||
if (filename != null) {
|
||||
|
||||
if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
|
||||
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
|
||||
}
|
||||
// Get mime type
|
||||
|
|
|
@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -72,6 +73,7 @@ public class TestIdentifyMimeType {
|
|||
expectedMimeTypes.put("1.xml", "application/xml");
|
||||
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
||||
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
|
||||
expectedMimeTypes.put("fake.csv", "text/csv");
|
||||
|
||||
final Map<String, String> expectedExtensions = new HashMap<>();
|
||||
expectedExtensions.put("1.7z", ".7z");
|
||||
|
@ -91,6 +93,7 @@ public class TestIdentifyMimeType {
|
|||
expectedExtensions.put("1.xml", ".xml");
|
||||
expectedExtensions.put("flowfilev3", "");
|
||||
expectedExtensions.put("flowfilev1.tar", "");
|
||||
expectedExtensions.put("fake.csv", ".csv");
|
||||
|
||||
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
|
||||
for (final MockFlowFile file : filesOut) {
|
||||
|
@ -105,4 +108,18 @@ public class TestIdentifyMimeType {
|
|||
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Verifies detection when the filename hint is disabled. The resource fake.csv
 * carries a .csv name but plain-text content, so with USE_FILENAME_IN_DETECTION
 * set to "false" the test expects text/plain and a .txt extension. The
 * expectation maps elsewhere in this test class list fake.csv as text/csv
 * with a .csv extension.
 */
@Test
|
||||
public void testIgnoreFileName() throws Exception {
|
||||
final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
|
||||
// Turn off the filename hint so the filename is not added to the Tika metadata.
runner.setProperty(IdentifyMimeType.USE_FILENAME_IN_DETECTION, "false");
|
||||
|
||||
runner.enqueue(Paths.get("src/test/resources/TestIdentifyMimeType/fake.csv"));
|
||||
runner.run();
|
||||
|
||||
// Exactly one FlowFile should be routed to success.
runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, 1);
|
||||
MockFlowFile flowFile = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS).get(0);
|
||||
// Expected result is plain text, not CSV, despite the .csv filename.
flowFile.assertAttributeEquals("mime.extension", ".txt");
|
||||
flowFile.assertAttributeEquals("mime.type", "text/plain");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
this is not a valid CSV file but
|
||||
is intended to verify that the updated
|
||||
IdentifyMIMEType works as expected.
|
Can't render this file because it contains an unexpected character in line 6 and column 3.
|
Loading…
Reference in New Issue