Merge branch 'fix/22077-ingest-attachment'

2017-02-16 15:49:04 +01:00 · 2017-02-16 15:49:04 +01:00 · 76675229c7
parent c9cde11a5e 6b66e29435
commit 76675229c7
7 changed files with 46 additions and 1 deletions
--- a/plugins/ingest-attachment/build.gradle
+++ b/plugins/ingest-attachment/build.gradle
@ -74,9 +74,11 @@ dependencyLicenses {
 }

 forbiddenPatterns {
+  exclude '**/*.doc' // NOTE(review): presumably needed for the binary issue-22077.doc test fixture added in this commit — confirm
  exclude '**/*.docx'
  exclude '**/*.pdf'
  exclude '**/*.epub'
+  exclude '**/*.vsdx' // NOTE(review): presumably needed for the binary issue-22077.vsdx test fixture added in this commit — confirm
 }

 thirdPartyAudit.excludes = [
--- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
+++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
@ -22,8 +22,10 @@ package org.elasticsearch.ingest.attachment;
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.elasticsearch.SpecialPermission;
 import org.elasticsearch.bootstrap.JarHell;
 import org.elasticsearch.common.SuppressForbidden;
@ -45,7 +47,9 @@ import java.security.PrivilegedActionException;
 import java.security.PrivilegedExceptionAction;
 import java.security.ProtectionDomain;
 import java.security.SecurityPermission;
+import java.util.Collections;
 import java.util.PropertyPermission;
+import java.util.Set;

 /**
 * Runs tika with limited parsers and limited permissions.
@ -54,6 +58,9 @@ import java.util.PropertyPermission;
 */
 final class TikaImpl {

+    /** Media types withheld from the OOXML parser via {@code ParserDecorator.withoutTypes}; currently only {@code application/x-tika-ooxml}. */
+    private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
+
    /** subset of parsers for types we support */
    private static final Parser PARSERS[] = new Parser[] {
        // documents
@ -63,7 +70,7 @@ final class TikaImpl {
        new org.apache.tika.parser.txt.TXTParser(),
        new org.apache.tika.parser.microsoft.OfficeParser(),
        new org.apache.tika.parser.microsoft.OldExcelParser(),
-        new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
+        ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
        new org.apache.tika.parser.odf.OpenDocumentParser(),
        new org.apache.tika.parser.iwork.IWorkPackageParser(),
        new org.apache.tika.parser.xml.DcXMLParser(),
--- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notNullValue;
+import static org.hamcrest.Matchers.nullValue;
 import static org.hamcrest.core.IsCollectionContaining.hasItem;

 public class AttachmentProcessorTests extends ESTestCase {
@ -130,6 +131,34 @@ public class AttachmentProcessorTests extends ESTestCase {
            is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
    }

+    public void testWordDocumentWithVisioSchema() throws Exception { // runs the processor on the issue-22077.docx fixture; presumably a regression test for issue 22077 — confirm
+        Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);
+
+        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", // exactly these six fields, order irrelevant
+            "content_length"));
+        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); // substring check only; the full text is not pinned
+        assertThat(attachmentData.get("language"), is("en"));
+        assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
+        assertThat(attachmentData.get("author"), is(notNullValue())); // must be present; the value itself is not pinned
+        assertThat(attachmentData.get("content_length"), is(notNullValue())); // must be present; the value itself is not pinned
+        assertThat(attachmentData.get("content_type").toString(),
+            is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+    }
+
+    public void testLegacyWordDocumentWithVisioSchema() throws Exception { // same checks as the .docx variant, against the issue-22077.doc fixture
+        Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);
+
+        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", // exactly these six fields, order irrelevant
+            "content_length"));
+        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); // substring check only; the full text is not pinned
+        assertThat(attachmentData.get("language"), is("en"));
+        assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
+        assertThat(attachmentData.get("author"), is(notNullValue())); // must be present; the value itself is not pinned
+        assertThat(attachmentData.get("content_length"), is(notNullValue())); // must be present; the value itself is not pinned
+        assertThat(attachmentData.get("content_type").toString(),
+            is("application/msword")); // expected content type for the .doc fixture
+    }
+
    public void testPdf() throws Exception {
        Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
        assertThat(attachmentData.get("content"),
@ -138,6 +167,13 @@ public class AttachmentProcessorTests extends ESTestCase {
        assertThat(attachmentData.get("content_length"), is(notNullValue()));
    }

+    public void testVisioIsExcluded() throws Exception { // a .vsdx drawing must yield no extracted text
+        Map<String, Object> attachmentData = parseDocument("issue-22077.vsdx", processor);
+        assertThat(attachmentData.get("content"), nullValue()); // no "content" entry is produced
+        assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing")); // the content type is still reported
+        assertThat(attachmentData.get("content_length"), is(0L)); // reported length is a Long zero
+    }
+
    public void testEncryptedPdf() throws Exception {
        ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
        assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
--- a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc
--- a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx
--- a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx
--- a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files/testPPT.potm.zip
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files/testPPT.potm.zip