NIFI-9647 Added ExtractDocumentText Processor

- Based on https://github.com/tspannhw/nifi-extracttext-processor

This closes #5732

Signed-off-by: David Handermann <exceptionfactory@apache.org>
This commit is contained in:
Mike Thomsen 2022-02-01 12:34:31 -05:00 committed by exceptionfactory
parent 635824904d
commit 4141ed29ec
No known key found for this signature in database
GPG Key ID: 29B6A52D2AAE8DBA
7 changed files with 395 additions and 0 deletions

View File

@ -954,6 +954,22 @@ language governing permissions and limitations under the License. -->
</dependency>
</dependencies>
</profile>
<profile>
<id>include-media</id>
<!-- This profile includes the NiFi Media Bundle, a large package that exposes Apache Tika functionality
through multiple processors. It is not included with the convenience binary due to its size.
The profile is inactive by default, so it has to be enabled explicitly (for example with -Pinclude-media)
for the nifi-media-nar dependency below to be pulled in. -->
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-media-nar</artifactId>
<version>1.16.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
</dependencies>
</profile>
<profile>
<id>include-rules</id>
<!-- This profile handles includes of rules related artifacts. -->

View File

@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.document;
import org.apache.commons.io.IOUtils;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.tika.Tika;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
@Tags({"extract", "document", "text"})
@CapabilityDescription("Extract text contents from supported binary document formats using Apache Tika")
/**
 * Processor that reads the content of an incoming FlowFile through Apache Tika and writes the
 * extracted text to a new child FlowFile.
 *
 * <p>On success the child FlowFile is routed to {@link #REL_EXTRACTED} with its
 * {@code mime.type} attribute set to {@code text/plain}, and the unmodified input FlowFile is
 * routed to {@link #REL_ORIGINAL}. On any failure the child FlowFile is removed and the input
 * FlowFile is routed to {@link #REL_FAILURE}.
 */
public class ExtractDocumentText extends AbstractProcessor {
    private static final String TEXT_PLAIN = "text/plain";

    public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
            .description("Success for original input FlowFiles").build();

    public static final Relationship REL_EXTRACTED = new Relationship.Builder().name("extracted")
            .description("Success for extracted text FlowFiles").build();

    public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
            .description("Content extraction failed").build();

    private static final Set<Relationship> RELATIONSHIPS =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(REL_ORIGINAL, REL_EXTRACTED, REL_FAILURE)));

    /**
     * @return the unmodifiable set of relationships supported by this processor
     */
    @Override
    public Set<Relationship> getRelationships() {
        return RELATIONSHIPS;
    }

    /**
     * Extracts the text of one queued FlowFile, if any, and routes the results.
     *
     * @param context the process context (not used by this processor)
     * @param session the session used to read the input and create, write and transfer FlowFiles
     * @throws ProcessException declared by the overridden method; extraction failures are logged
     *         and routed to {@link #REL_FAILURE} instead of being rethrown
     */
    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        final FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        FlowFile extracted = session.create(flowFile);

        // The writer encoding is pinned to UTF-8 so the extracted text does not depend on the
        // default charset of the JVM that happens to run the processor.
        try (InputStream is = session.read(flowFile);
             Reader tikaReader = new Tika().parse(is);
             OutputStream os = session.write(extracted);
             OutputStreamWriter writer = new OutputStreamWriter(os, StandardCharsets.UTF_8)) {
            IOUtils.copy(tikaReader, writer);
        } catch (final Throwable t) {
            // try-with-resources has already closed every stream by the time this block runs,
            // so the child FlowFile can be dropped and the input routed to failure.
            getLogger().error("Extraction Failed {}", flowFile, t);
            session.remove(extracted);
            session.transfer(flowFile, REL_FAILURE);
            return;
        }

        final Map<String, String> attributes = new HashMap<>();
        attributes.put(CoreAttributes.MIME_TYPE.key(), TEXT_PLAIN);
        extracted = session.putAllAttributes(extracted, attributes);

        session.transfer(extracted, REL_EXTRACTED);
        session.transfer(flowFile, REL_ORIGINAL);
    }
}

View File

@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.nifi.processors.document.ExtractDocumentText
org.apache.nifi.processors.image.ExtractImageMetadata
org.apache.nifi.processors.image.ResizeImage
org.apache.nifi.processors.media.ExtractMediaMetadata

View File

@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.document;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Tests for {@link ExtractDocumentText} that run sample documents from
 * {@code src/test/resources} through the processor and check the extracted text.
 */
public class ExtractDocumentTextTest {
    private TestRunner testRunner;

    @BeforeEach
    public void setTestRunner() {
        testRunner = TestRunners.newTestRunner(ExtractDocumentText.class);
    }

    @Test
    public void testRunPdf() throws Exception {
        assertExtractedTextStartsWith("simple.pdf", "A Simple PDF File");
    }

    @Test
    public void testRunDoc() throws Exception {
        assertExtractedTextStartsWith("simple.doc", "A Simple WORD DOC File");
    }

    @Test
    public void testRunDocx() throws Exception {
        assertExtractedTextStartsWith("simple.docx", "A Simple WORD DOCX File");
    }

    /**
     * Runs the processor once against the named test resource and verifies the routing and the
     * beginning of the extracted text.
     *
     * @param filename name of the document under {@code src/test/resources}
     * @param expectedPrefix text the trimmed extraction result must start with
     * @throws Exception if the resource cannot be opened or read
     */
    private void assertExtractedTextStartsWith(final String filename, final String expectedPrefix) throws Exception {
        // Close the fixture stream once it has been handed to the runner.
        try (FileInputStream inputStream = getFileInputStream(filename)) {
            testRunner.enqueue(inputStream);
        }
        testRunner.run();

        testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
        testRunner.assertTransferCount(ExtractDocumentText.REL_ORIGINAL, 1);
        // Without this count check the loop below would pass vacuously when nothing was extracted.
        testRunner.assertTransferCount(ExtractDocumentText.REL_EXTRACTED, 1);

        final List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
        for (final MockFlowFile mockFile : successFiles) {
            mockFile.assertAttributeEquals("mime.type", "text/plain");

            final String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
            final String trimmedResult = result.trim();
            assertTrue(trimmedResult.startsWith(expectedPrefix),
                    "Extracted text of " + filename + " does not start with: " + expectedPrefix);
        }
    }

    private FileInputStream getFileInputStream(final String filename) throws FileNotFoundException {
        return new FileInputStream("src/test/resources/" + filename);
    }
}

View File

@ -0,0 +1,198 @@
%PDF-1.3
%âãÏÓ
1 0 obj
<<
/Type /Catalog
/Outlines 2 0 R
/Pages 3 0 R
>>
endobj
2 0 obj
<<
/Type /Outlines
/Count 0
>>
endobj
3 0 obj
<<
/Type /Pages
/Count 2
/Kids [ 4 0 R 6 0 R ]
>>
endobj
4 0 obj
<<
/Type /Page
/Parent 3 0 R
/Resources <<
/Font <<
/F1 9 0 R
>>
/ProcSet 8 0 R
>>
/MediaBox [0 0 612.0000 792.0000]
/Contents 5 0 R
>>
endobj
5 0 obj
<< /Length 1074 >>
stream
2 J
BT
0 0 0 rg
/F1 0027 Tf
57.3750 722.2800 Td
( A Simple PDF File ) Tj
ET
BT
/F1 0010 Tf
69.2500 688.6080 Td
( This is a small demonstration .pdf file - ) Tj
ET
BT
/F1 0010 Tf
69.2500 664.7040 Td
( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
ET
BT
/F1 0010 Tf
69.2500 652.7520 Td
( text. And more text. And more text. And more text. ) Tj
ET
BT
/F1 0010 Tf
69.2500 628.8480 Td
( And more text. And more text. And more text. And more text. And more ) Tj
ET
BT
/F1 0010 Tf
69.2500 616.8960 Td
( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
ET
BT
/F1 0010 Tf
69.2500 604.9440 Td
( more text. And more text. And more text. And more text. And more text. ) Tj
ET
BT
/F1 0010 Tf
69.2500 592.9920 Td
( And more text. And more text. ) Tj
ET
BT
/F1 0010 Tf
69.2500 569.0880 Td
( And more text. And more text. And more text. And more text. And more ) Tj
ET
BT
/F1 0010 Tf
69.2500 557.1360 Td
( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
ET
endstream
endobj
6 0 obj
<<
/Type /Page
/Parent 3 0 R
/Resources <<
/Font <<
/F1 9 0 R
>>
/ProcSet 8 0 R
>>
/MediaBox [0 0 612.0000 792.0000]
/Contents 7 0 R
>>
endobj
7 0 obj
<< /Length 676 >>
stream
2 J
BT
0 0 0 rg
/F1 0027 Tf
57.3750 722.2800 Td
( Simple PDF File 2 ) Tj
ET
BT
/F1 0010 Tf
69.2500 688.6080 Td
( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
ET
BT
/F1 0010 Tf
69.2500 676.6560 Td
( And more text. And more text. And more text. And more text. And more ) Tj
ET
BT
/F1 0010 Tf
69.2500 664.7040 Td
( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
ET
BT
/F1 0010 Tf
69.2500 652.7520 Td
( paint dry. And more text. And more text. And more text. And more text. ) Tj
ET
BT
/F1 0010 Tf
69.2500 640.8000 Td
( Boring. More, a little more text. The end, and just as well. ) Tj
ET
endstream
endobj
8 0 obj
[/PDF /Text]
endobj
9 0 obj
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
>>
endobj
10 0 obj
<<
/Creator (Rave \(http://www.nevrona.com/rave\))
/Producer (Nevrona Designs)
/CreationDate (D:20060301072826)
>>
endobj
xref
0 11
0000000000 65535 f
0000000019 00000 n
0000000093 00000 n
0000000147 00000 n
0000000222 00000 n
0000000390 00000 n
0000001522 00000 n
0000001690 00000 n
0000002423 00000 n
0000002456 00000 n
0000002574 00000 n
trailer
<<
/Size 11
/Root 1 0 R
/Info 10 0 R
>>
startxref
2714
%%EOF