Add test documents from tika test suite
This patch adds a zip of about 200 files from Tika's test suite, and we assert that some content comes back from each. This is a good exercise of the various formats. I removed any huge files to keep the size reasonable, but we still want some variety so we know things are working. Running this test uncovered issues with the parser config, which I fixed.
This commit is contained in:
parent
997fac15f4
commit
6b91e53ab5
|
@ -29,15 +29,6 @@ final class TikaImpl {
|
|||
new org.apache.tika.parser.odf.OpenDocumentParser(),
|
||||
new org.apache.tika.parser.iwork.IWorkPackageParser(),
|
||||
new org.apache.tika.parser.xml.DcXMLParser(),
|
||||
// images:
|
||||
new org.apache.tika.parser.image.BPGParser(),
|
||||
new org.apache.tika.parser.image.ImageParser(),
|
||||
new org.apache.tika.parser.image.TiffParser(),
|
||||
new org.apache.tika.parser.image.WebPParser(),
|
||||
new org.apache.tika.parser.jpeg.JpegParser(),
|
||||
// compression / packaging:
|
||||
new org.apache.tika.parser.pkg.CompressorParser(),
|
||||
new org.apache.tika.parser.pkg.PackageParser(),
|
||||
};
|
||||
|
||||
/** autodetector based on this subset */
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
package org.elasticsearch.mapper.attachments;
|
||||
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
import java.nio.file.DirectoryStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressFileSystems;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
/**
|
||||
* Evil test-coverage cheat, we parse a bunch of docs from tika
|
||||
* so that we have a nice grab-bag variety, and assert some content
|
||||
* comes back and no exception.
|
||||
*/
|
||||
@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
|
||||
public class TikaDocTests extends ESTestCase {
|
||||
|
||||
/** some test files from tika test suite, zipped up */
|
||||
static final String TIKA_FILES = "/org/elasticsearch/index/mapper/attachment/test/tika-files.zip";
|
||||
|
||||
public void testFiles() throws Exception {
|
||||
Path tmp = createTempDir();
|
||||
TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES), tmp);
|
||||
|
||||
try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
|
||||
for (Path doc : stream) {
|
||||
logger.debug("parsing: {}", doc);
|
||||
assertParseable(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void assertParseable(Path fileName) throws Exception {
|
||||
try {
|
||||
byte bytes[] = Files.readAllBytes(fileName);
|
||||
String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
|
||||
assertNotNull(parsedContent);
|
||||
assertFalse(parsedContent.isEmpty());
|
||||
logger.debug("extracted content: {}", parsedContent);
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
Loading…
Reference in New Issue