mirror of https://github.com/apache/lucene.git
SOLR-7076: TikaEntityProcessor should have support for onError=skip
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1658664 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ac50da1613
commit
9a77ceee6a
|
@ -140,6 +140,10 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
|||
}
|
||||
tikaParser.parse(is, contentHandler, metadata , context);
|
||||
} catch (Exception e) {
|
||||
if(SKIP.equals(onError)) {
|
||||
throw new DataImportHandlerException(DataImportHandlerException.SKIP_ROW,
|
||||
"Document skipped :" + e.getMessage());
|
||||
}
|
||||
wrapAndThrow(SEVERE, e, "Unable to read content");
|
||||
}
|
||||
IOUtils.closeQuietly(is);
|
||||
|
|
Binary file not shown.
|
@ -49,6 +49,19 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
" </document>" +
|
||||
"</dataConfig>";
|
||||
|
||||
// Data-import config used by testSkip. It declares a BinFileDataSource and two entities,
// both named "Tika" and both using TikaEntityProcessor:
//   1. url = dihextras/bad.doc, with onError="skip", mapping column "content" to field "text";
//   2. url = dihextras/solr-word.pdf, with no onError attribute, declaring column "text".
// NOTE(review): bad.doc is presumably a file Tika cannot parse — confirm against the test resources.
private String skipOnErrConf =
|
||||
"<dataConfig>" +
|
||||
" <dataSource type=\"BinFileDataSource\"/>" +
|
||||
" <document>" +
|
||||
" <entity name=\"Tika\" onError=\"skip\" processor=\"TikaEntityProcessor\" url=\"" + getFile("dihextras/bad.doc").getAbsolutePath() + "\" >" +
|
||||
"<field column=\"content\" name=\"text\"/>" +
|
||||
" </entity>" +
|
||||
" <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" + getFile("dihextras/solr-word.pdf").getAbsolutePath() + "\" >" +
|
||||
" <field column=\"text\"/>" +
|
||||
"</entity>" +
|
||||
" </document>" +
|
||||
"</dataConfig>";
|
||||
|
||||
private String[] tests = {
|
||||
"//*[@numFound='1']"
|
||||
,"//str[@name='author'][.='Grant Ingersoll']"
|
||||
|
@ -85,6 +98,12 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
assertQ(req("*:*"), tests );
|
||||
}
|
||||
|
||||
/**
 * Runs a full import using {@code skipOnErrConf} and asserts that a match-all query
 * ({@code *:*}) reports exactly one hit.
 *
 * NOTE(review): the single hit is presumably the document from the solr-word.pdf entity,
 * with the onError="skip" entity for bad.doc contributing nothing instead of aborting the
 * import — the assertion only checks the count, so confirm if the distinction matters.
 */
@Test
|
||||
public void testSkip() throws Exception {
|
||||
runFullImport(skipOnErrConf);
|
||||
assertQ(req("*:*"), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTikaHTMLMapperEmpty() throws Exception {
|
||||
runFullImport(getConfigHTML(null));
|
||||
|
|
Loading…
Reference in New Issue