mirror of https://github.com/apache/nifi.git
NIFI-9647 Added ExtractDocumentText Processor
- Based on https://github.com/tspannhw/nifi-extracttext-processor This closes #5732 Signed-off-by: David Handermann <exceptionfactory@apache.org>
This commit is contained in:
parent
635824904d
commit
4141ed29ec
|
@ -954,6 +954,22 @@ language governing permissions and limitations under the License. -->
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</profile>
|
</profile>
|
||||||
|
<profile>
|
||||||
|
<id>include-media</id>
|
||||||
|
<!-- This profile includes the NiFi Media Bundle which is a large package that exposes Apache Tika functionality
|
||||||
|
through multiple processors. It is not included with the convenience binary due to its size. -->
|
||||||
|
<activation>
|
||||||
|
<activeByDefault>false</activeByDefault>
|
||||||
|
</activation>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.nifi</groupId>
|
||||||
|
<artifactId>nifi-media-nar</artifactId>
|
||||||
|
<version>1.16.0-SNAPSHOT</version>
|
||||||
|
<type>nar</type>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
<profile>
|
<profile>
|
||||||
<id>include-rules</id>
|
<id>include-rules</id>
|
||||||
<!-- This profile handles includes of rules related artifacts. -->
|
<!-- This profile handles includes of rules related artifacts. -->
|
||||||
|
|
|
@ -0,0 +1,92 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.nifi.processors.document;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.nifi.annotation.documentation.CapabilityDescription;
|
||||||
|
import org.apache.nifi.annotation.documentation.Tags;
|
||||||
|
import org.apache.nifi.flowfile.FlowFile;
|
||||||
|
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||||
|
import org.apache.nifi.processor.AbstractProcessor;
|
||||||
|
import org.apache.nifi.processor.ProcessContext;
|
||||||
|
import org.apache.nifi.processor.ProcessSession;
|
||||||
|
import org.apache.nifi.processor.Relationship;
|
||||||
|
import org.apache.nifi.processor.exception.ProcessException;
|
||||||
|
import org.apache.tika.Tika;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
@Tags({"extract, document, text"})
|
||||||
|
@CapabilityDescription("Extract text contents from supported binary document formats using Apache Tika")
|
||||||
|
public class ExtractDocumentText extends AbstractProcessor {
|
||||||
|
private static final String TEXT_PLAIN = "text/plain";
|
||||||
|
|
||||||
|
public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
|
||||||
|
.description("Success for original input FlowFiles").build();
|
||||||
|
|
||||||
|
public static final Relationship REL_EXTRACTED = new Relationship.Builder().name("extracted")
|
||||||
|
.description("Success for extracted text FlowFiles").build();
|
||||||
|
|
||||||
|
public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
|
||||||
|
.description("Content extraction failed").build();
|
||||||
|
|
||||||
|
private static final Set<Relationship> RELATIONSHIPS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(REL_ORIGINAL, REL_EXTRACTED, REL_FAILURE)));
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Set<Relationship> getRelationships() {
|
||||||
|
return RELATIONSHIPS;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
|
||||||
|
FlowFile flowFile = session.get();
|
||||||
|
if (flowFile == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
FlowFile extracted = session.create(flowFile);
|
||||||
|
boolean error = false;
|
||||||
|
try (InputStream is = session.read(flowFile);
|
||||||
|
Reader tikaReader = new Tika().parse(is);
|
||||||
|
OutputStream os = session.write(extracted);
|
||||||
|
OutputStreamWriter writer = new OutputStreamWriter(os)) {
|
||||||
|
IOUtils.copy(tikaReader, writer);
|
||||||
|
} catch (final Throwable t) {
|
||||||
|
error = true;
|
||||||
|
getLogger().error("Extraction Failed {}", flowFile, t);
|
||||||
|
session.remove(extracted);
|
||||||
|
session.transfer(flowFile, REL_FAILURE);
|
||||||
|
} finally {
|
||||||
|
if (!error) {
|
||||||
|
final Map<String, String> attributes = new HashMap<>();
|
||||||
|
attributes.put(CoreAttributes.MIME_TYPE.key(), TEXT_PLAIN);
|
||||||
|
extracted = session.putAllAttributes(extracted, attributes);
|
||||||
|
session.transfer(extracted, REL_EXTRACTED);
|
||||||
|
session.transfer(flowFile, REL_ORIGINAL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -12,6 +12,7 @@
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
org.apache.nifi.processors.document.ExtractDocumentText
|
||||||
org.apache.nifi.processors.image.ExtractImageMetadata
|
org.apache.nifi.processors.image.ExtractImageMetadata
|
||||||
org.apache.nifi.processors.image.ResizeImage
|
org.apache.nifi.processors.image.ResizeImage
|
||||||
org.apache.nifi.processors.media.ExtractMediaMetadata
|
org.apache.nifi.processors.media.ExtractMediaMetadata
|
||||||
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.nifi.processors.document;
|
||||||
|
|
||||||
|
import org.apache.nifi.util.MockFlowFile;
|
||||||
|
import org.apache.nifi.util.TestRunner;
|
||||||
|
import org.apache.nifi.util.TestRunners;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
public class ExtractDocumentTextTest {
|
||||||
|
private TestRunner testRunner;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setTestRunner() {
|
||||||
|
testRunner = TestRunners.newTestRunner(ExtractDocumentText.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRunPdf() throws Exception {
|
||||||
|
final String filename = "simple.pdf";
|
||||||
|
testRunner.enqueue(getFileInputStream(filename));
|
||||||
|
testRunner.run();
|
||||||
|
testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
|
||||||
|
|
||||||
|
List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
|
||||||
|
for (MockFlowFile mockFile : successFiles) {
|
||||||
|
String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
|
||||||
|
String trimmedResult = result.trim();
|
||||||
|
assertTrue(trimmedResult.startsWith("A Simple PDF File"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRunDoc() throws Exception {
|
||||||
|
final String filename = "simple.doc";
|
||||||
|
testRunner.enqueue(getFileInputStream(filename));
|
||||||
|
testRunner.run();
|
||||||
|
testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
|
||||||
|
|
||||||
|
List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
|
||||||
|
for (MockFlowFile mockFile : successFiles) {
|
||||||
|
String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
|
||||||
|
String trimmedResult = result.trim();
|
||||||
|
assertTrue(trimmedResult.startsWith("A Simple WORD DOC File"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRunDocx() throws Exception {
|
||||||
|
final String filename = "simple.docx";
|
||||||
|
testRunner.enqueue(getFileInputStream(filename));
|
||||||
|
testRunner.run();
|
||||||
|
testRunner.assertTransferCount(ExtractDocumentText.REL_FAILURE, 0);
|
||||||
|
|
||||||
|
List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ExtractDocumentText.REL_EXTRACTED);
|
||||||
|
for (MockFlowFile mockFile : successFiles) {
|
||||||
|
String result = new String(mockFile.toByteArray(), StandardCharsets.UTF_8);
|
||||||
|
String trimmedResult = result.trim();
|
||||||
|
assertTrue(trimmedResult.startsWith("A Simple WORD DOCX File"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private FileInputStream getFileInputStream(final String filename) throws FileNotFoundException {
|
||||||
|
return new FileInputStream("src/test/resources/" + filename);
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,198 @@
|
||||||
|
%PDF-1.3
|
||||||
|
%âãÏÓ
|
||||||
|
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Catalog
|
||||||
|
/Outlines 2 0 R
|
||||||
|
/Pages 3 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Outlines
|
||||||
|
/Count 0
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Pages
|
||||||
|
/Count 2
|
||||||
|
/Kids [ 4 0 R 6 0 R ]
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Page
|
||||||
|
/Parent 3 0 R
|
||||||
|
/Resources <<
|
||||||
|
/Font <<
|
||||||
|
/F1 9 0 R
|
||||||
|
>>
|
||||||
|
/ProcSet 8 0 R
|
||||||
|
>>
|
||||||
|
/MediaBox [0 0 612.0000 792.0000]
|
||||||
|
/Contents 5 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
5 0 obj
|
||||||
|
<< /Length 1074 >>
|
||||||
|
stream
|
||||||
|
2 J
|
||||||
|
BT
|
||||||
|
0 0 0 rg
|
||||||
|
/F1 0027 Tf
|
||||||
|
57.3750 722.2800 Td
|
||||||
|
( A Simple PDF File ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 688.6080 Td
|
||||||
|
( This is a small demonstration .pdf file - ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 664.7040 Td
|
||||||
|
( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 652.7520 Td
|
||||||
|
( text. And more text. And more text. And more text. ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 628.8480 Td
|
||||||
|
( And more text. And more text. And more text. And more text. And more ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 616.8960 Td
|
||||||
|
( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 604.9440 Td
|
||||||
|
( more text. And more text. And more text. And more text. And more text. ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 592.9920 Td
|
||||||
|
( And more text. And more text. ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 569.0880 Td
|
||||||
|
( And more text. And more text. And more text. And more text. And more ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 557.1360 Td
|
||||||
|
( text. And more text. And more text. Even more. Continued on page 2 ...) Tj
|
||||||
|
ET
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
6 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Page
|
||||||
|
/Parent 3 0 R
|
||||||
|
/Resources <<
|
||||||
|
/Font <<
|
||||||
|
/F1 9 0 R
|
||||||
|
>>
|
||||||
|
/ProcSet 8 0 R
|
||||||
|
>>
|
||||||
|
/MediaBox [0 0 612.0000 792.0000]
|
||||||
|
/Contents 7 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
7 0 obj
|
||||||
|
<< /Length 676 >>
|
||||||
|
stream
|
||||||
|
2 J
|
||||||
|
BT
|
||||||
|
0 0 0 rg
|
||||||
|
/F1 0027 Tf
|
||||||
|
57.3750 722.2800 Td
|
||||||
|
( Simple PDF File 2 ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 688.6080 Td
|
||||||
|
( ...continued from page 1. Yet more text. And more text. And more text. ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 676.6560 Td
|
||||||
|
( And more text. And more text. And more text. And more text. And more ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 664.7040 Td
|
||||||
|
( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 652.7520 Td
|
||||||
|
( paint dry. And more text. And more text. And more text. And more text. ) Tj
|
||||||
|
ET
|
||||||
|
BT
|
||||||
|
/F1 0010 Tf
|
||||||
|
69.2500 640.8000 Td
|
||||||
|
( Boring. More, a little more text. The end, and just as well. ) Tj
|
||||||
|
ET
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
8 0 obj
|
||||||
|
[/PDF /Text]
|
||||||
|
endobj
|
||||||
|
|
||||||
|
9 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Font
|
||||||
|
/Subtype /Type1
|
||||||
|
/Name /F1
|
||||||
|
/BaseFont /Helvetica
|
||||||
|
/Encoding /WinAnsiEncoding
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
10 0 obj
|
||||||
|
<<
|
||||||
|
/Creator (Rave \(http://www.nevrona.com/rave\))
|
||||||
|
/Producer (Nevrona Designs)
|
||||||
|
/CreationDate (D:20060301072826)
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
xref
|
||||||
|
0 11
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000019 00000 n
|
||||||
|
0000000093 00000 n
|
||||||
|
0000000147 00000 n
|
||||||
|
0000000222 00000 n
|
||||||
|
0000000390 00000 n
|
||||||
|
0000001522 00000 n
|
||||||
|
0000001690 00000 n
|
||||||
|
0000002423 00000 n
|
||||||
|
0000002456 00000 n
|
||||||
|
0000002574 00000 n
|
||||||
|
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/Size 11
|
||||||
|
/Root 1 0 R
|
||||||
|
/Info 10 0 R
|
||||||
|
>>
|
||||||
|
|
||||||
|
startxref
|
||||||
|
2714
|
||||||
|
%%EOF
|
Loading…
Reference in New Issue