mirror of
https://github.com/apache/lucene.git
synced 2025-02-20 17:07:09 +00:00
SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465879 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1a9de4ba0a
commit
ccf99d3e6c
@ -132,6 +132,9 @@ New Features
|
||||
* SOLR-3755: A new collections api to add additional shards dynamically by splitting
|
||||
existing shards. (yonik, Anshum Gupta, shalin)
|
||||
|
||||
* SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper
|
||||
(Alexandre Rafalovitch via shalin)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
||||
|
@ -22,6 +22,8 @@ import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.html.HtmlMapper;
|
||||
import org.apache.tika.parser.html.IdentityHtmlMapper;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.sax.ContentHandlerDecorator;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
@ -63,7 +65,7 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
||||
private boolean done = false;
|
||||
private String parser;
|
||||
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
|
||||
|
||||
private String htmlMapper;
|
||||
|
||||
@Override
|
||||
protected void firstInit(Context context) {
|
||||
@ -88,6 +90,13 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
||||
format = "text";
|
||||
if (!"html".equals(format) && !"xml".equals(format) && !"text".equals(format)&& !"none".equals(format) )
|
||||
throw new DataImportHandlerException(SEVERE, "'format' can be one of text|html|xml|none");
|
||||
|
||||
htmlMapper = context.getResolvedEntityAttribute("htmlMapper");
|
||||
if (htmlMapper == null)
|
||||
htmlMapper = "default";
|
||||
if (!"default".equals(htmlMapper) && !"identity".equals(htmlMapper))
|
||||
throw new DataImportHandlerException(SEVERE, "'htmlMapper', if present, must be 'default' or 'identity'");
|
||||
|
||||
parser = context.getResolvedEntityAttribute("parser");
|
||||
if(parser == null) {
|
||||
parser = AUTO_PARSER;
|
||||
@ -124,7 +133,11 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
||||
tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
|
||||
}
|
||||
try {
|
||||
tikaParser.parse(is, contentHandler, metadata , new ParseContext());
|
||||
ParseContext context = new ParseContext();
|
||||
if ("identity".equals(htmlMapper)){
|
||||
context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
|
||||
}
|
||||
tikaParser.parse(is, contentHandler, metadata , context);
|
||||
} catch (Exception e) {
|
||||
wrapAndThrow(SEVERE, e, "Unable to read content");
|
||||
}
|
||||
|
@ -0,0 +1,29 @@
|
||||
<!DOCTYPE html>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Title in the header</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>H1 Header</h1>
|
||||
<div>Basic div</div>
|
||||
<div class="classAttribute">Div with attribute</div>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -55,6 +55,21 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
||||
,"//str[@name='text']"
|
||||
};
|
||||
|
||||
private String[] testsHTMLDefault = {
|
||||
"//*[@numFound='1']"
|
||||
, "//str[@name='text'][contains(.,'Basic div')]"
|
||||
, "//str[@name='text'][contains(.,'<h1>')]"
|
||||
, "//str[@name='text'][not(contains(.,'<div>'))]" //default mapper lower-cases elements as it maps
|
||||
, "//str[@name='text'][not(contains(.,'<DIV>'))]"
|
||||
};
|
||||
|
||||
private String[] testsHTMLIdentity = {
|
||||
"//*[@numFound='1']"
|
||||
, "//str[@name='text'][contains(.,'Basic div')]"
|
||||
, "//str[@name='text'][contains(.,'<h1>')]"
|
||||
, "//str[@name='text'][contains(.,'<div>')]"
|
||||
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
|
||||
};
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
@ -67,4 +82,36 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
||||
assertQ(req("*:*"), tests );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTikaHTMLMapperEmpty() throws Exception {
|
||||
runFullImport(getConfigHTML(null));
|
||||
assertQ(req("*:*"), testsHTMLDefault);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTikaHTMLMapperDefault() throws Exception {
|
||||
runFullImport(getConfigHTML("default"));
|
||||
assertQ(req("*:*"), testsHTMLDefault);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTikaHTMLMapperIdentity() throws Exception {
|
||||
runFullImport(getConfigHTML("identity"));
|
||||
assertQ(req("*:*"), testsHTMLIdentity);
|
||||
}
|
||||
|
||||
private String getConfigHTML(String htmlMapper) {
|
||||
return
|
||||
"<dataConfig>" +
|
||||
" <dataSource type='BinFileDataSource'/>" +
|
||||
" <document>" +
|
||||
" <entity name='Tika' format='xml' processor='TikaEntityProcessor' " +
|
||||
" url='" + getFile("dihextras/structured.html").getAbsolutePath() + "' " +
|
||||
((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper + "'")) + ">" +
|
||||
" <field column='text'/>" +
|
||||
" </entity>" +
|
||||
" </document>" +
|
||||
"</dataConfig>";
|
||||
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user