diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle
index 97b5a23f116..8a4f038b4c3 100644
--- a/plugins/ingest-attachment/build.gradle
+++ b/plugins/ingest-attachment/build.gradle
@@ -74,9 +74,11 @@ dependencyLicenses {
 }
 
 forbiddenPatterns {
+  exclude '**/*.doc'
   exclude '**/*.docx'
   exclude '**/*.pdf'
   exclude '**/*.epub'
+  exclude '**/*.vsdx'
 }
 
 thirdPartyAudit.excludes = [
diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
index f99a2a630ab..c7ffe4f287f 100644
--- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
+++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
@@ -22,8 +22,10 @@ package org.elasticsearch.ingest.attachment;
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.elasticsearch.SpecialPermission;
 import org.elasticsearch.bootstrap.JarHell;
 import org.elasticsearch.common.SuppressForbidden;
@@ -45,7 +47,9 @@ import java.security.PrivilegedActionException;
 import java.security.PrivilegedExceptionAction;
 import java.security.ProtectionDomain;
 import java.security.SecurityPermission;
+import java.util.Collections;
 import java.util.PropertyPermission;
+import java.util.Set;
 
 /**
  * Runs tika with limited parsers and limited permissions.
@@ -54,6 +58,9 @@ import java.util.PropertyPermission;
  */
 final class TikaImpl {
 
+    /** Media types withheld from the OOXML parser via ParserDecorator.withoutTypes; only application/x-tika-ooxml. */
+    private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
+
     /** subset of parsers for types we support */
     private static final Parser PARSERS[] = new Parser[] {
         // documents
@@ -63,7 +70,7 @@ final class TikaImpl {
         new org.apache.tika.parser.txt.TXTParser(),
         new org.apache.tika.parser.microsoft.OfficeParser(),
         new org.apache.tika.parser.microsoft.OldExcelParser(),
-        new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
+        ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
         new org.apache.tika.parser.odf.OpenDocumentParser(),
         new org.apache.tika.parser.iwork.IWorkPackageParser(),
         new org.apache.tika.parser.xml.DcXMLParser(),
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
index b59457b5b01..e5b9d72017f 100644
--- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
@@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notNullValue;
+import static org.hamcrest.Matchers.nullValue;
 import static org.hamcrest.core.IsCollectionContaining.hasItem;
 
 public class AttachmentProcessorTests extends ESTestCase {
@@ -130,6 +131,34 @@ public class AttachmentProcessorTests extends ESTestCase {
             is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
     }
 
+    public void testWordDocumentWithVisioSchema() throws Exception {
+        Map attachmentData = parseDocument("issue-22077.docx", processor);
+
+        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
+            "content_length"));
+        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
+        assertThat(attachmentData.get("language"), is("en"));
+        assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
+        assertThat(attachmentData.get("author"), is(notNullValue()));
+        assertThat(attachmentData.get("content_length"), is(notNullValue()));
+        assertThat(attachmentData.get("content_type").toString(),
+            is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+    }
+
+    public void testLegacyWordDocumentWithVisioSchema() throws Exception {
+        Map attachmentData = parseDocument("issue-22077.doc", processor);
+
+        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
+            "content_length"));
+        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
+        assertThat(attachmentData.get("language"), is("en"));
+        assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
+        assertThat(attachmentData.get("author"), is(notNullValue()));
+        assertThat(attachmentData.get("content_length"), is(notNullValue()));
+        assertThat(attachmentData.get("content_type").toString(),
+            is("application/msword"));
+    }
+
     public void testPdf() throws Exception {
         Map attachmentData = parseDocument("test.pdf", processor);
         assertThat(attachmentData.get("content"),
@@ -138,6 +167,13 @@ public class AttachmentProcessorTests extends ESTestCase {
         assertThat(attachmentData.get("content_length"), is(notNullValue()));
     }
 
+    public void testVisioIsExcluded() throws Exception {
+        Map attachmentData = parseDocument("issue-22077.vsdx", processor);
+        assertThat(attachmentData.get("content"), nullValue());
+        assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing"));
+        assertThat(attachmentData.get("content_length"), is(0L));
+    }
+
     public void testEncryptedPdf() throws Exception {
         ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
         assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc
new file mode 100644
index 00000000000..10badd5809b
Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx
new file mode 100644
index 00000000000..bab550607a9
Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx
new file mode 100644
index 00000000000..fb9cde51b4b
Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip
index cfc2e54b79b..67d1316cb4c 100644
Binary files a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip differ