Add test documents from tika test suite

This patch adds a zip of about 200 files from tika's test suite,
and we assert some content comes back from each. This is a good exercise
of the various formats.

I removed any huge files to keep the size reasonable, but we still want
some variety so we know things are working.

Running this test uncovered issues with the parser config, which I fixed.
This commit is contained in:
Robert Muir 2015-11-08 00:22:36 -05:00
parent 997fac15f4
commit 6b91e53ab5
3 changed files with 66 additions and 9 deletions

View File

@ -29,15 +29,6 @@ final class TikaImpl {
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
// images:
new org.apache.tika.parser.image.BPGParser(),
new org.apache.tika.parser.image.ImageParser(),
new org.apache.tika.parser.image.TiffParser(),
new org.apache.tika.parser.image.WebPParser(),
new org.apache.tika.parser.jpeg.JpegParser(),
// compression / packaging:
new org.apache.tika.parser.pkg.CompressorParser(),
new org.apache.tika.parser.pkg.PackageParser(),
};
/** autodetector based on this subset */

View File

@ -0,0 +1,66 @@
package org.elasticsearch.mapper.attachments;
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.util.LuceneTestCase.SuppressFileSystems;
import org.apache.lucene.util.TestUtil;
import org.apache.tika.metadata.Metadata;
import org.elasticsearch.test.ESTestCase;
/**
 * Evil test-coverage cheat: we parse a bunch of docs from the tika test suite
 * so that we have a nice grab-bag variety, and assert that some content
 * comes back and no exception is thrown.
 */
@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
public class TikaDocTests extends ESTestCase {

    /** some test files from tika test suite, zipped up */
    static final String TIKA_FILES = "/org/elasticsearch/index/mapper/attachment/test/tika-files.zip";

    /**
     * Unzips the bundled documents into a fresh temp directory and checks that
     * every entry found directly in that directory can be parsed.
     *
     * @throws Exception if the archive cannot be extracted or any document fails to parse
     */
    public void testFiles() throws Exception {
        // Fail with a readable message when the archive is not on the classpath,
        // rather than passing a null stream on to the unzip helper.
        assertNotNull("test resource not found: " + TIKA_FILES, getClass().getResource(TIKA_FILES));

        Path tmp = createTempDir();
        TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES), tmp);

        int parsed = 0;
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
            for (Path doc : stream) {
                logger.debug("parsing: {}", doc);
                assertParseable(doc);
                parsed++;
            }
        }
        // Guard against a vacuous pass: an empty extraction would otherwise skip
        // the loop entirely and report success without parsing anything.
        assertTrue("no documents were extracted from " + TIKA_FILES, parsed > 0);
    }

    /**
     * Reads the given file fully and asserts that parsing it yields non-null,
     * non-empty content.
     *
     * @param fileName path of the document to parse
     * @throws Exception declared for callers; any failure is rethrown as a
     *         {@link RuntimeException} naming the offending file
     */
    void assertParseable(Path fileName) throws Exception {
        try {
            byte[] bytes = Files.readAllBytes(fileName);
            // NOTE(review): -1 presumably means "no limit" on extracted content; confirm against TikaImpl.parse
            String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
            assertNotNull(parsedContent);
            assertFalse(parsedContent.isEmpty());
            logger.debug("extracted content: {}", parsedContent);
        } catch (Throwable e) {
            // Deliberately broad: assertion failures and parser errors alike get
            // wrapped so the failing file name shows up in the test report.
            throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
        }
    }
}