Merge branch 'fix/22077-ingest-attachment'

This commit is contained in:
David Pilato 2017-02-16 15:49:04 +01:00
commit 76675229c7
7 changed files with 46 additions and 1 deletions

View File

@ -74,9 +74,11 @@ dependencyLicenses {
}
// Glob patterns that the forbiddenPatterns configuration skips. Each entry
// excludes every file with the given extension, in any directory.
// NOTE(review): presumably these are binary sample documents used by the
// plugin's tests (the tests load .doc, .docx, .pdf and .vsdx fixtures) —
// confirm before adding or removing entries.
forbiddenPatterns {
exclude '**/*.doc'
exclude '**/*.docx'
exclude '**/*.pdf'
exclude '**/*.epub'
exclude '**/*.vsdx'
}
thirdPartyAudit.excludes = [

View File

@ -22,8 +22,10 @@ package org.elasticsearch.ingest.attachment;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.bootstrap.JarHell;
import org.elasticsearch.common.SuppressForbidden;
@ -45,7 +47,9 @@ import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.security.ProtectionDomain;
import java.security.SecurityPermission;
import java.util.Collections;
import java.util.PropertyPermission;
import java.util.Set;
/**
* Runs tika with limited parsers and limited permissions.
@ -54,6 +58,9 @@ import java.util.PropertyPermission;
*/
final class TikaImpl {
/** Media types the OOXML parser is decorated not to claim: the generic {@code application/x-tika-ooxml} type */
private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
/** subset of parsers for types we support */
private static final Parser PARSERS[] = new Parser[] {
// documents
@ -63,7 +70,7 @@ final class TikaImpl {
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),

View File

@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.hamcrest.core.IsCollectionContaining.hasItem;
public class AttachmentProcessorTests extends ESTestCase {
@ -130,6 +131,34 @@ public class AttachmentProcessorTests extends ESTestCase {
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
}
/**
 * Parses the {@code issue-22077.docx} fixture and verifies that the full set of
 * attachment fields is extracted and that the document is still reported as a
 * Word (wordprocessingml) document.
 */
public void testWordDocumentWithVisioSchema() throws Exception {
    final Map<String, Object> fields = parseDocument("issue-22077.docx", processor);

    // Exactly these keys are expected, in any order.
    assertThat(fields.keySet(),
        containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));

    final String extractedText = fields.get("content").toString();
    assertThat(extractedText, containsString("Table of Contents"));

    final Object language = fields.get("language");
    assertThat(language, is("en"));

    final Object date = fields.get("date");
    assertThat(date, is("2015-01-06T18:07:00Z"));

    // Author and length only need to be present; their values are not pinned.
    assertThat(fields.get("author"), is(notNullValue()));
    assertThat(fields.get("content_length"), is(notNullValue()));

    final String contentType = fields.get("content_type").toString();
    assertThat(contentType, is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
}
/**
 * Parses the {@code issue-22077.doc} fixture (legacy Word format) and verifies
 * that the full set of attachment fields is extracted and that the content type
 * is reported as {@code application/msword}.
 */
public void testLegacyWordDocumentWithVisioSchema() throws Exception {
    final Map<String, Object> fields = parseDocument("issue-22077.doc", processor);

    // Exactly these keys are expected, in any order.
    assertThat(fields.keySet(),
        containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));

    final String extractedText = fields.get("content").toString();
    assertThat(extractedText, containsString("Table of Contents"));

    final Object language = fields.get("language");
    assertThat(language, is("en"));

    final Object date = fields.get("date");
    assertThat(date, is("2016-12-16T15:04:00Z"));

    // Author and length only need to be present; their values are not pinned.
    assertThat(fields.get("author"), is(notNullValue()));
    assertThat(fields.get("content_length"), is(notNullValue()));

    final String contentType = fields.get("content_type").toString();
    assertThat(contentType, is("application/msword"));
}
public void testPdf() throws Exception {
Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
assertThat(attachmentData.get("content"),
@ -138,6 +167,13 @@ public class AttachmentProcessorTests extends ESTestCase {
assertThat(attachmentData.get("content_length"), is(notNullValue()));
}
/**
 * Parses the {@code issue-22077.vsdx} fixture and verifies that no content is
 * extracted from it: the content field is absent and the content length is
 * zero, while the content type is still detected as a Visio drawing.
 */
public void testVisioIsExcluded() throws Exception {
    final Map<String, Object> fields = parseDocument("issue-22077.vsdx", processor);

    final Object extractedText = fields.get("content");
    assertThat(extractedText, nullValue());

    final Object contentType = fields.get("content_type");
    assertThat(contentType, is("application/vnd.ms-visio.drawing"));

    final Object contentLength = fields.get("content_length");
    assertThat(contentLength, is(0L));
}
public void testEncryptedPdf() throws Exception {
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
assertThat(e.getDetailedMessage(), containsString("document is encrypted"));