mirror of https://github.com/apache/nifi.git
NIFI-9647 Added ExtractDocumentText Processor
- Based on https://github.com/tspannhw/nifi-extracttext-processor This closes #5732 Signed-off-by: David Handermann <exceptionfactory@apache.org>
This commit is contained in:
parent
635824904d
commit
4141ed29ec
|
@ -954,6 +954,22 @@ language governing permissions and limitations under the License. -->
|
|||
</dependency>
|
||||
</dependencies>
|
||||
</profile>
|
||||
<profile>
|
||||
<id>include-media</id>
|
||||
<!-- This profile includes the NiFi Media Bundle which is a large package that exposes Apache Tika functionality
|
||||
through multiple processors. It is not included with the convenience binary due to its size. -->
|
||||
<activation>
|
||||
<activeByDefault>false</activeByDefault>
|
||||
</activation>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.nifi</groupId>
|
||||
<artifactId>nifi-media-nar</artifactId>
|
||||
<version>1.16.0-SNAPSHOT</version>
|
||||
<type>nar</type>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</profile>
|
||||
<profile>
|
||||
<id>include-rules</id>
|
||||
<!-- This profile handles includes of rules related artifacts. -->
|
||||
|
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.nifi.processors.document;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||
import org.apache.nifi.annotation.documentation.Tags;
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||
import org.apache.nifi.processor.AbstractProcessor;
|
||||
import org.apache.nifi.processor.ProcessContext;
|
||||
import org.apache.nifi.processor.ProcessSession;
|
||||
import org.apache.nifi.processor.Relationship;
|
||||
import org.apache.nifi.processor.exception.ProcessException;
|
||||
import org.apache.tika.Tika;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@Tags({"extract, document, text"})
|
||||
@CapabilityDescription("Extract text contents from supported binary document formats using Apache Tika")
|
||||
public class ExtractDocumentText extends AbstractProcessor {
|
||||
private static final String TEXT_PLAIN = "text/plain";
|
||||
|
||||
public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
|
||||
.description("Success for original input FlowFiles").build();
|
||||
|
||||
public static final Relationship REL_EXTRACTED = new Relationship.Builder().name("extracted")
|
||||
.description("Success for extracted text FlowFiles").build();
|
||||
|
||||
public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
|
||||
.description("Content extraction failed").build();
|
||||
|
||||
private static final Set<Relationship> RELATIONSHIPS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(REL_ORIGINAL, REL_EXTRACTED, REL_FAILURE)));
|
||||
|
||||
@Override
|
||||
public Set<Relationship> getRelationships() {
|
||||
return RELATIONSHIPS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
||||
FlowFile flowFile = session.get();
|
||||
if (flowFile == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
FlowFile extracted = session.create(flowFile);
|
||||
boolean error = false;
|
||||
try (InputStream is = session.read(flowFile);
|
||||
Reader tikaReader = new Tika().parse(is);
|
||||
OutputStream os = session.write(extracted);
|
||||
OutputStreamWriter writer = new OutputStreamWriter(os)) {
|
||||
IOUtils.copy(tikaReader, writer);
|
||||
} catch (final Throwable t) {
|
||||
error = true;
|
||||
getLogger().error("Extraction Failed {}", flowFile, t);
|
||||
session.remove(extracted);
|
||||
session.transfer(flowFile, REL_FAILURE);
|
||||
} finally {
|
||||
if (!error) {
|
||||
final Map<String, String> attributes = new HashMap<>();
|
||||
attributes.put(CoreAttributes.MIME_TYPE.key(), TEXT_PLAIN);
|
||||
extracted = session.putAllAttributes(extracted, attributes);
|
||||
session.transfer(extracted, REL_EXTRACTED);
|
||||
session.transfer(flowFile, REL_ORIGINAL);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -12,6 +12,7 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
org.apache.nifi.processors.document.ExtractDocumentText
|
||||
org.apache.nifi.processors.image.ExtractImageMetadata
|
||||
org.apache.nifi.processors.image.ResizeImage
|
||||
org.apache.nifi.processors.media.ExtractMediaMetadata
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.nifi.processors.document;
|
||||
|
||||
import org.apache.nifi.util.MockFlowFile;
|
||||
import org.apache.nifi.util.TestRunner;
|
||||
import org.apache.nifi.util.TestRunners;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class ExtractDocumentTextTest {
|
||||
private TestRunner testRunner;
|
||||
|
||||
@BeforeEach
|
||||
public void setTestRunner() {
|
||||
testRunner = TestRunners.newTestRunner(ExtractDocumentText.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRunPdf() throws Exception {
|
||||
final String filename = "simple.pdf";
|
||||
testRunner.enqueue(getFileInputStream(filename));
|
||||
testRunner.run();
|
||||
testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
|
||||
|
||||
List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
|
||||
for (MockFlowFile mockFile : successFiles) {
|
||||
String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
|
||||
String trimmedResult = result.trim();
|
||||
assertTrue(trimmedResult.startsWith("A Simple PDF File"));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRunDoc() throws Exception {
|
||||
final String filename = "simple.doc";
|
||||
testRunner.enqueue(getFileInputStream(filename));
|
||||
testRunner.run();
|
||||
testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
|
||||
|
||||
List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
|
||||
for (MockFlowFile mockFile : successFiles) {
|
||||
String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
|
||||
String trimmedResult = result.trim();
|
||||
assertTrue(trimmedResult.startsWith("A Simple WORD DOC File"));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRunDocx() throws Exception {
|
||||
final String filename = "simple.docx";
|
||||
testRunner.enqueue(getFileInputStream(filename));
|
||||
testRunner.run();
|
||||
testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
|
||||
|
||||
List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
|
||||
for (MockFlowFile mockFile : successFiles) {
|
||||
String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
|
||||
String trimmedResult = result.trim();
|
||||
assertTrue(trimmedResult.startsWith("A Simple WORD DOCX File"));
|
||||
}
|
||||
}
|
||||
|
||||
private FileInputStream getFileInputStream(final String filename) throws FileNotFoundException {
|
||||
return new FileInputStream("src/test/resources/" + filename);
|
||||
}
|
||||
}
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,198 @@
|
|||
%PDF-1.3
|
||||
%âãÏÓ
|
||||
|
||||
1 0 obj
|
||||
<<
|
||||
/Type /Catalog
|
||||
/Outlines 2 0 R
|
||||
/Pages 3 0 R
|
||||
>>
|
||||
endobj
|
||||
|
||||
2 0 obj
|
||||
<<
|
||||
/Type /Outlines
|
||||
/Count 0
|
||||
>>
|
||||
endobj
|
||||
|
||||
3 0 obj
|
||||
<<
|
||||
/Type /Pages
|
||||
/Count 2
|
||||
/Kids [ 4 0 R 6 0 R ]
|
||||
>>
|
||||
endobj
|
||||
|
||||
4 0 obj
|
||||
<<
|
||||
/Type /Page
|
||||
/Parent 3 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 9 0 R
|
||||
>>
|
||||
/ProcSet 8 0 R
|
||||
>>
|
||||
/MediaBox [0 0 612.0000 792.0000]
|
||||
/Contents 5 0 R
|
||||
>>
|
||||
endobj
|
||||
|
||||
5 0 obj
|
||||
<< /Length 1074 >>
|
||||
stream
|
||||
2 J
|
||||
BT
|
||||
0 0 0 rg
|
||||
/F1 0027 Tf
|
||||
57.3750 722.2800 Td
|
||||
( A Simple PDF File ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 688.6080 Td
|
||||
( This is a small demonstration .pdf file - ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 664.7040 Td
|
||||
( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 652.7520 Td
|
||||
( text. And more text. And more text. And more text. ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 628.8480 Td
|
||||
( And more text. And more text. And more text. And more text. And more ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 616.8960 Td
|
||||
( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 604.9440 Td
|
||||
( more text. And more text. And more text. And more text. And more text. ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 592.9920 Td
|
||||
( And more text. And more text. ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 569.0880 Td
|
||||
( And more text. And more text. And more text. And more text. And more ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 557.1360 Td
|
||||
( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
|
||||
6 0 obj
|
||||
<<
|
||||
/Type /Page
|
||||
/Parent 3 0 R
|
||||
/Resources <<
|
||||
/Font <<
|
||||
/F1 9 0 R
|
||||
>>
|
||||
/ProcSet 8 0 R
|
||||
>>
|
||||
/MediaBox [0 0 612.0000 792.0000]
|
||||
/Contents 7 0 R
|
||||
>>
|
||||
endobj
|
||||
|
||||
7 0 obj
|
||||
<< /Length 676 >>
|
||||
stream
|
||||
2 J
|
||||
BT
|
||||
0 0 0 rg
|
||||
/F1 0027 Tf
|
||||
57.3750 722.2800 Td
|
||||
( Simple PDF File 2 ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 688.6080 Td
|
||||
( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 676.6560 Td
|
||||
( And more text. And more text. And more text. And more text. And more ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 664.7040 Td
|
||||
( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 652.7520 Td
|
||||
( paint dry. And more text. And more text. And more text. And more text. ) Tj
|
||||
ET
|
||||
BT
|
||||
/F1 0010 Tf
|
||||
69.2500 640.8000 Td
|
||||
( Boring. More, a little more text. The end, and just as well. ) Tj
|
||||
ET
|
||||
endstream
|
||||
endobj
|
||||
|
||||
8 0 obj
|
||||
[/PDF /Text]
|
||||
endobj
|
||||
|
||||
9 0 obj
|
||||
<<
|
||||
/Type /Font
|
||||
/Subtype /Type1
|
||||
/Name /F1
|
||||
/BaseFont /Helvetica
|
||||
/Encoding /WinAnsiEncoding
|
||||
>>
|
||||
endobj
|
||||
|
||||
10 0 obj
|
||||
<<
|
||||
/Creator (Rave \(http://www.nevrona.com/rave\))
|
||||
/Producer (Nevrona Designs)
|
||||
/CreationDate (D:20060301072826)
|
||||
>>
|
||||
endobj
|
||||
|
||||
xref
|
||||
0 11
|
||||
0000000000 65535 f
|
||||
0000000019 00000 n
|
||||
0000000093 00000 n
|
||||
0000000147 00000 n
|
||||
0000000222 00000 n
|
||||
0000000390 00000 n
|
||||
0000001522 00000 n
|
||||
0000001690 00000 n
|
||||
0000002423 00000 n
|
||||
0000002456 00000 n
|
||||
0000002574 00000 n
|
||||
|
||||
trailer
|
||||
<<
|
||||
/Size 11
|
||||
/Root 1 0 R
|
||||
/Info 10 0 R
|
||||
>>
|
||||
|
||||
startxref
|
||||
2714
|
||||
%%EOF
|
Loading…
Reference in New Issue