mirror of https://github.com/apache/nifi.git
NIFI-4087 This closes #2026. Fix to allow excluding the filename from Tika's MIME type detection criteria.
This commit is contained in:
parent
695e8aa98f
commit
3371e915cc
|
@ -19,8 +19,10 @@ package org.apache.nifi.processors.standard;
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
|
@ -32,6 +34,7 @@ import org.apache.nifi.annotation.behavior.SupportsBatching;
|
||||||
import org.apache.nifi.annotation.behavior.WritesAttribute;
|
import org.apache.nifi.annotation.behavior.WritesAttribute;
|
||||||
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||||
import org.apache.nifi.annotation.documentation.Tags;
|
import org.apache.nifi.annotation.documentation.Tags;
|
||||||
|
import org.apache.nifi.components.PropertyDescriptor;
|
||||||
import org.apache.nifi.flowfile.FlowFile;
|
import org.apache.nifi.flowfile.FlowFile;
|
||||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||||
import org.apache.nifi.logging.ComponentLog;
|
import org.apache.nifi.logging.ComponentLog;
|
||||||
|
@ -78,12 +81,22 @@ import org.apache.tika.mime.MimeTypeException;
|
||||||
+ "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream")
|
+ "If unable to detect the MIME Type, the attribute's value will be set to application/octet-stream")
|
||||||
public class IdentifyMimeType extends AbstractProcessor {
|
public class IdentifyMimeType extends AbstractProcessor {
|
||||||
|
|
||||||
|
public static final PropertyDescriptor USE_FILENAME_IN_DETECTION = new PropertyDescriptor.Builder()
|
||||||
|
.displayName("Use Filename In Detection")
|
||||||
|
.name("use-filename-in-detection")
|
||||||
|
.description("If true will pass the filename to Tika to aid in detection.")
|
||||||
|
.required(true)
|
||||||
|
.allowableValues("true", "false")
|
||||||
|
.defaultValue("true")
|
||||||
|
.build();
|
||||||
|
|
||||||
public static final Relationship REL_SUCCESS = new Relationship.Builder()
|
public static final Relationship REL_SUCCESS = new Relationship.Builder()
|
||||||
.name("success")
|
.name("success")
|
||||||
.description("All FlowFiles are routed to success")
|
.description("All FlowFiles are routed to success")
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
private Set<Relationship> relationships;
|
private Set<Relationship> relationships;
|
||||||
|
private List<PropertyDescriptor> properties;
|
||||||
|
|
||||||
private final TikaConfig config;
|
private final TikaConfig config;
|
||||||
private final Detector detector;
|
private final Detector detector;
|
||||||
|
@ -96,6 +109,11 @@ public class IdentifyMimeType extends AbstractProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void init(final ProcessorInitializationContext context) {
|
protected void init(final ProcessorInitializationContext context) {
|
||||||
|
|
||||||
|
final List<PropertyDescriptor> properties = new ArrayList<>();
|
||||||
|
properties.add(USE_FILENAME_IN_DETECTION);
|
||||||
|
this.properties = Collections.unmodifiableList(properties);
|
||||||
|
|
||||||
final Set<Relationship> rels = new HashSet<>();
|
final Set<Relationship> rels = new HashSet<>();
|
||||||
rels.add(REL_SUCCESS);
|
rels.add(REL_SUCCESS);
|
||||||
this.relationships = Collections.unmodifiableSet(rels);
|
this.relationships = Collections.unmodifiableSet(rels);
|
||||||
|
@ -106,6 +124,11 @@ public class IdentifyMimeType extends AbstractProcessor {
|
||||||
return relationships;
|
return relationships;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onTrigger(final ProcessContext context, final ProcessSession session) {
|
public void onTrigger(final ProcessContext context, final ProcessSession session) {
|
||||||
FlowFile flowFile = session.get();
|
FlowFile flowFile = session.get();
|
||||||
|
@ -123,8 +146,8 @@ public class IdentifyMimeType extends AbstractProcessor {
|
||||||
try (final InputStream in = new BufferedInputStream(stream)) {
|
try (final InputStream in = new BufferedInputStream(stream)) {
|
||||||
TikaInputStream tikaStream = TikaInputStream.get(in);
|
TikaInputStream tikaStream = TikaInputStream.get(in);
|
||||||
Metadata metadata = new Metadata();
|
Metadata metadata = new Metadata();
|
||||||
// Add filename if it exists
|
|
||||||
if (filename != null) {
|
if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
|
||||||
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
|
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
|
||||||
}
|
}
|
||||||
// Get mime type
|
// Get mime type
|
||||||
|
|
|
@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Paths;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -72,6 +73,7 @@ public class TestIdentifyMimeType {
|
||||||
expectedMimeTypes.put("1.xml", "application/xml");
|
expectedMimeTypes.put("1.xml", "application/xml");
|
||||||
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
expectedMimeTypes.put("flowfilev3", "application/flowfile-v3");
|
||||||
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
|
expectedMimeTypes.put("flowfilev1.tar", "application/flowfile-v1");
|
||||||
|
expectedMimeTypes.put("fake.csv", "text/csv");
|
||||||
|
|
||||||
final Map<String, String> expectedExtensions = new HashMap<>();
|
final Map<String, String> expectedExtensions = new HashMap<>();
|
||||||
expectedExtensions.put("1.7z", ".7z");
|
expectedExtensions.put("1.7z", ".7z");
|
||||||
|
@ -91,6 +93,7 @@ public class TestIdentifyMimeType {
|
||||||
expectedExtensions.put("1.xml", ".xml");
|
expectedExtensions.put("1.xml", ".xml");
|
||||||
expectedExtensions.put("flowfilev3", "");
|
expectedExtensions.put("flowfilev3", "");
|
||||||
expectedExtensions.put("flowfilev1.tar", "");
|
expectedExtensions.put("flowfilev1.tar", "");
|
||||||
|
expectedExtensions.put("fake.csv", ".csv");
|
||||||
|
|
||||||
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
|
final List<MockFlowFile> filesOut = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS);
|
||||||
for (final MockFlowFile file : filesOut) {
|
for (final MockFlowFile file : filesOut) {
|
||||||
|
@ -105,4 +108,18 @@ public class TestIdentifyMimeType {
|
||||||
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
|
assertEquals("Expected " + file + " to have extension " + expectedExtension + ", but it was " + extension, expectedExtension, extension);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testIgnoreFileName() throws Exception {
|
||||||
|
final TestRunner runner = TestRunners.newTestRunner(new IdentifyMimeType());
|
||||||
|
runner.setProperty(IdentifyMimeType.USE_FILENAME_IN_DETECTION, "false");
|
||||||
|
|
||||||
|
runner.enqueue(Paths.get("src/test/resources/TestIdentifyMimeType/fake.csv"));
|
||||||
|
runner.run();
|
||||||
|
|
||||||
|
runner.assertAllFlowFilesTransferred(IdentifyMimeType.REL_SUCCESS, 1);
|
||||||
|
MockFlowFile flowFile = runner.getFlowFilesForRelationship(IdentifyMimeType.REL_SUCCESS).get(0);
|
||||||
|
flowFile.assertAttributeEquals("mime.extension", ".txt");
|
||||||
|
flowFile.assertAttributeEquals("mime.type", "text/plain");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,20 @@
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
this is not a valid CSV file but
|
||||||
|
is intended to verify that the updated
|
||||||
|
IdentifyMIMEType works as expected.
|
Can't render this file because it contains an unexpected character in line 6 and column 3.
|
Loading…
Reference in New Issue