mirror of https://github.com/apache/lucene.git
Tika's default was switched to "process embedded" -- need to send EmptyParser
into ParseContext now to turn off parsing of embedded documents. Also add curvesapi for vsdx and a unit test.
This commit is contained in:
parent
4c7ff73c98
commit
605304d927
|
@ -22,6 +22,7 @@ com.fasterxml.jackson.core.version = 2.5.4
|
|||
/com.fasterxml.jackson.dataformat/jackson-dataformat-smile = ${com.fasterxml.jackson.core.version}
|
||||
|
||||
/com.github.ben-manes.caffeine/caffeine = 2.4.0
|
||||
/com.github.virtuald/curvesapi = 1.04
|
||||
|
||||
/com.google.guava/guava = 14.0.1
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.EmptyParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.html.HtmlMapper;
|
||||
|
@ -62,6 +63,7 @@ import static org.apache.solr.handler.dataimport.XPathEntityProcessor.URL;
|
|||
* @since solr 3.1
|
||||
*/
|
||||
public class TikaEntityProcessor extends EntityProcessorBase {
|
||||
// Shared parser instance registered in the ParseContext when embedded-document
// extraction is disabled; declared final because it is never reassigned.
private static final Parser EMPTY_PARSER = new EmptyParser();
|
||||
private TikaConfig tikaConfig;
|
||||
private String format = "text";
|
||||
private boolean done = false;
|
||||
|
@ -155,6 +157,8 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
|||
}
|
||||
if (extractEmbedded) {
|
||||
context.set(Parser.class, tikaParser);
|
||||
} else {
|
||||
context.set(Parser.class, EMPTY_PARSER);
|
||||
}
|
||||
tikaParser.parse(is, contentHandler, metadata , context);
|
||||
} catch (Exception e) {
|
||||
|
|
Binary file not shown.
|
@ -62,6 +62,15 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
" </document>" +
|
||||
"</dataConfig>";
|
||||
|
||||
private String vsdxConf =
|
||||
"<dataConfig>" +
|
||||
" <dataSource type=\"BinFileDataSource\"/>" +
|
||||
" <document>" +
|
||||
" <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" + getFile("dihextras/test_vsdx.vsdx").getAbsolutePath() + "\" >" +
|
||||
" <field column=\"text\"/>" +
|
||||
" </entity>" +
|
||||
" </document>" +
|
||||
"</dataConfig>";
|
||||
|
||||
private String[] tests = {
|
||||
"//*[@numFound='1']"
|
||||
|
@ -100,6 +109,11 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
"//str[@name='text'][not(contains(.,'When in the Course'))]"
|
||||
};
|
||||
|
||||
private String[] testsVSDX = {
|
||||
"//*[@numFound='1']",
|
||||
"//str[@name='text'][contains(.,'Arrears')]"
|
||||
};
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
|
||||
|
@ -119,6 +133,14 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
assertQ(req("*:*"), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVSDX() throws Exception {
|
||||
//this ensures that we've included the curvesapi dependency
|
||||
//and that the ConnectsType class is bundled with poi-ooxml-schemas.
|
||||
runFullImport(vsdxConf);
|
||||
assertQ(req("*:*"), testsVSDX);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTikaHTMLMapperEmpty() throws Exception {
|
||||
runFullImport(getConfigHTML(null));
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
<dependency org="org.apache.poi" name="poi-ooxml-schemas" rev="${/org.apache.poi/poi-ooxml-schemas}" conf="compile"/>
|
||||
<dependency org="org.apache.xmlbeans" name="xmlbeans" rev="${/org.apache.xmlbeans/xmlbeans}" conf="compile"/>
|
||||
<dependency org="org.apache.commons" name="commons-collections4" rev="${/org.apache.commons/commons-collections4}" conf="compile"/>
|
||||
<dependency org="com.github.virtuald" name="curvesapi" rev="${/com.github.virtuald/curvesapi}" conf="compile"/>
|
||||
<dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="${/org.ccil.cowan.tagsoup/tagsoup}" conf="compile"/>
|
||||
<dependency org="com.googlecode.mp4parser" name="isoparser" rev="${/com.googlecode.mp4parser/isoparser}" conf="compile"/>
|
||||
<dependency org="org.aspectj" name="aspectjrt" rev="${/org.aspectj/aspectjrt}" conf="compile"/>
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
3386abf821719bc89c7685f9eaafaf4a842f0199
|
|
@ -0,0 +1,28 @@
|
|||
Copyright (c) 2005, Graph Builder
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
-Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
-Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
-Neither the name of Graph Builder nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,2 @@
|
|||
Copyright (c) 2005, Graph Builder
|
||||
All rights reserved.
|
Loading…
Reference in New Issue