diff --git a/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java b/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java
index 68c3958dd02..265ba830cbb 100644
--- a/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java
+++ b/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java
@@ -29,15 +29,6 @@ final class TikaImpl {
         new org.apache.tika.parser.odf.OpenDocumentParser(),
         new org.apache.tika.parser.iwork.IWorkPackageParser(),
         new org.apache.tika.parser.xml.DcXMLParser(),
-        // images:
-        new org.apache.tika.parser.image.BPGParser(),
-        new org.apache.tika.parser.image.ImageParser(),
-        new org.apache.tika.parser.image.TiffParser(),
-        new org.apache.tika.parser.image.WebPParser(),
-        new org.apache.tika.parser.jpeg.JpegParser(),
-        // compression / packaging:
-        new org.apache.tika.parser.pkg.CompressorParser(),
-        new org.apache.tika.parser.pkg.PackageParser(),
     };
 
     /** autodetector based on this subset */
diff --git a/src/test/java/org/elasticsearch/mapper/attachments/TikaDocTests.java b/src/test/java/org/elasticsearch/mapper/attachments/TikaDocTests.java
new file mode 100644
index 00000000000..a5e3ec9c17c
--- /dev/null
+++ b/src/test/java/org/elasticsearch/mapper/attachments/TikaDocTests.java
@@ -0,0 +1,73 @@
+package org.elasticsearch.mapper.attachments;
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.lucene.util.LuceneTestCase.SuppressFileSystems;
+import org.apache.lucene.util.TestUtil;
+import org.apache.tika.metadata.Metadata;
+
+import org.elasticsearch.test.ESTestCase;
+
+/**
+ * Evil test-coverage cheat: we parse a grab-bag variety of documents taken
+ * from the tika test suite and assert that each one gives back some content
+ * without throwing an exception.
+ */
+@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
+public class TikaDocTests extends ESTestCase {
+
+    /** some test files from tika test suite, zipped up */
+    static final String TIKA_FILES = "/org/elasticsearch/index/mapper/attachment/test/tika-files.zip";
+
+    /** unzips the bundled tika documents into a temp dir and checks that every one of them parses */
+    public void testFiles() throws Exception {
+        Path tmp = createTempDir();
+        TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES), tmp);
+
+        // typed as DirectoryStream<Path> so that the for-each below yields Path elements
+        try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
+            for (Path doc : stream) {
+                logger.debug("parsing: {}", doc);
+                assertParseable(doc);
+            }
+        }
+    }
+
+    /**
+     * Parses the given file with TikaImpl and asserts the extracted content is neither null
+     * nor empty; any failure is rethrown as a RuntimeException naming the file.
+     */
+    void assertParseable(Path fileName) throws Exception {
+        try {
+            byte[] bytes = Files.readAllBytes(fileName);
+            String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
+            assertNotNull(parsedContent);
+            assertFalse(parsedContent.isEmpty());
+            logger.debug("extracted content: {}", parsedContent);
+        } catch (Throwable e) {
+            // Throwable (not Exception) so that assertion failures also get the file name attached
+            throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
+        }
+    }
+}
diff --git a/src/test/resources/org/elasticsearch/index/mapper/attachment/test/tika-files.zip b/src/test/resources/org/elasticsearch/index/mapper/attachment/test/tika-files.zip
new file mode 100644
index 00000000000..10f5d507677
Binary files /dev/null and b/src/test/resources/org/elasticsearch/index/mapper/attachment/test/tika-files.zip differ