Merge branch 'fix/22077-ingest-attachment'
This commit is contained in:
commit
76675229c7
|
@ -74,9 +74,11 @@ dependencyLicenses {
|
|||
}
|
||||
|
||||
// Paths matching these extension globs are excluded from the forbiddenPatterns check.
// NOTE(review): presumably these are the binary sample documents used by the tests — confirm.
forbiddenPatterns {
|
||||
exclude '**/*.doc'
|
||||
exclude '**/*.docx'
|
||||
exclude '**/*.pdf'
|
||||
exclude '**/*.epub'
|
||||
exclude '**/*.vsdx'
|
||||
}
|
||||
|
||||
thirdPartyAudit.excludes = [
|
||||
|
|
|
@ -22,8 +22,10 @@ package org.elasticsearch.ingest.attachment;
|
|||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.ParserDecorator;
|
||||
import org.elasticsearch.SpecialPermission;
|
||||
import org.elasticsearch.bootstrap.JarHell;
|
||||
import org.elasticsearch.common.SuppressForbidden;
|
||||
|
@ -45,7 +47,9 @@ import java.security.PrivilegedActionException;
|
|||
import java.security.PrivilegedExceptionAction;
|
||||
import java.security.ProtectionDomain;
|
||||
import java.security.SecurityPermission;
|
||||
import java.util.Collections;
|
||||
import java.util.PropertyPermission;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Runs tika with limited parsers and limited permissions.
|
||||
|
@ -54,6 +58,9 @@ import java.util.PropertyPermission;
|
|||
*/
|
||||
final class TikaImpl {
|
||||
|
||||
/**
 * Media types to exclude: currently a single entry, the {@code x-tika-ooxml} application type.
 * This set is handed to {@code ParserDecorator.withoutTypes} where the {@code OOXMLParser} is registered in this file.
 */
|
||||
private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
|
||||
|
||||
/** subset of parsers for types we support */
|
||||
private static final Parser PARSERS[] = new Parser[] {
|
||||
// documents
|
||||
|
@ -63,7 +70,7 @@ final class TikaImpl {
|
|||
new org.apache.tika.parser.txt.TXTParser(),
|
||||
new org.apache.tika.parser.microsoft.OfficeParser(),
|
||||
new org.apache.tika.parser.microsoft.OldExcelParser(),
|
||||
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
|
||||
ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
|
||||
new org.apache.tika.parser.odf.OpenDocumentParser(),
|
||||
new org.apache.tika.parser.iwork.IWorkPackageParser(),
|
||||
new org.apache.tika.parser.xml.DcXMLParser(),
|
||||
|
|
|
@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.hasSize;
|
|||
import static org.hamcrest.Matchers.is;
|
||||
import static org.hamcrest.Matchers.not;
|
||||
import static org.hamcrest.Matchers.notNullValue;
|
||||
import static org.hamcrest.Matchers.nullValue;
|
||||
import static org.hamcrest.core.IsCollectionContaining.hasItem;
|
||||
|
||||
public class AttachmentProcessorTests extends ESTestCase {
|
||||
|
@ -130,6 +131,34 @@ public class AttachmentProcessorTests extends ESTestCase {
|
|||
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
|
||||
}
|
||||
|
||||
/**
 * Parses the {@code issue-22077.docx} fixture and checks the extracted attachment fields:
 * the exact key set, that the content contains "Table of Contents", language {@code en},
 * the expected date, non-null author and content length, and the OOXML word-processing content type.
 * NOTE(review): the fixture name suggests a regression test for issue 22077 — confirm.
 */
public void testWordDocumentWithVisioSchema() throws Exception {
|
||||
Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);
|
||||
|
||||
assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
|
||||
"content_length"));
|
||||
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
|
||||
assertThat(attachmentData.get("language"), is("en"));
|
||||
assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
|
||||
assertThat(attachmentData.get("author"), is(notNullValue()));
|
||||
assertThat(attachmentData.get("content_length"), is(notNullValue()));
|
||||
assertThat(attachmentData.get("content_type").toString(),
|
||||
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
|
||||
}
|
||||
|
||||
/**
 * Parses the {@code issue-22077.doc} fixture and checks the extracted attachment fields:
 * the exact key set, that the content contains "Table of Contents", language {@code en},
 * the expected date, non-null author and content length, and content type {@code application/msword}.
 * NOTE(review): the fixture name suggests a regression test for issue 22077 — confirm.
 */
public void testLegacyWordDocumentWithVisioSchema() throws Exception {
|
||||
Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);
|
||||
|
||||
assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
|
||||
"content_length"));
|
||||
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
|
||||
assertThat(attachmentData.get("language"), is("en"));
|
||||
assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
|
||||
assertThat(attachmentData.get("author"), is(notNullValue()));
|
||||
assertThat(attachmentData.get("content_length"), is(notNullValue()));
|
||||
assertThat(attachmentData.get("content_type").toString(),
|
||||
is("application/msword"));
|
||||
}
|
||||
|
||||
public void testPdf() throws Exception {
|
||||
Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
|
||||
assertThat(attachmentData.get("content"),
|
||||
|
@ -138,6 +167,13 @@ public class AttachmentProcessorTests extends ESTestCase {
|
|||
assertThat(attachmentData.get("content_length"), is(notNullValue()));
|
||||
}
|
||||
|
||||
/**
 * Parses the {@code issue-22077.vsdx} fixture and expects no extracted text: the content is null and
 * the content length is {@code 0L}, while the content type is still reported as
 * {@code application/vnd.ms-visio.drawing}.
 * NOTE(review): presumably a consequence of the OOXML parser being registered without the
 * {@code x-tika-ooxml} type — confirm.
 */
public void testVisioIsExcluded() throws Exception {
|
||||
Map<String, Object> attachmentData = parseDocument("issue-22077.vsdx", processor);
|
||||
assertThat(attachmentData.get("content"), nullValue());
|
||||
assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing"));
|
||||
assertThat(attachmentData.get("content_length"), is(0L));
|
||||
}
|
||||
|
||||
public void testEncryptedPdf() throws Exception {
|
||||
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
|
||||
assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue