SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465879 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2013-04-09 04:40:12 +00:00
parent 1a9de4ba0a
commit ccf99d3e6c
4 changed files with 94 additions and 2 deletions

View File

@ -132,6 +132,9 @@ New Features
* SOLR-3755: A new collections api to add additional shards dynamically by splitting
existing shards. (yonik, Anshum Gupta, shalin)
* SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper
(Alexandre Rafalovitch via shalin)
Bug Fixes
----------------------

View File

@ -22,6 +22,8 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.XHTMLContentHandler;
@ -63,7 +65,7 @@ public class TikaEntityProcessor extends EntityProcessorBase {
private boolean done = false;
private String parser;
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
private String htmlMapper;
@Override
protected void firstInit(Context context) {
@ -88,6 +90,13 @@ public class TikaEntityProcessor extends EntityProcessorBase {
format = "text";
if (!"html".equals(format) && !"xml".equals(format) && !"text".equals(format)&& !"none".equals(format) )
throw new DataImportHandlerException(SEVERE, "'format' can be one of text|html|xml|none");
htmlMapper = context.getResolvedEntityAttribute("htmlMapper");
if (htmlMapper == null)
htmlMapper = "default";
if (!"default".equals(htmlMapper) && !"identity".equals(htmlMapper))
throw new DataImportHandlerException(SEVERE, "'htmlMapper', if present, must be 'default' or 'identity'");
parser = context.getResolvedEntityAttribute("parser");
if(parser == null) {
parser = AUTO_PARSER;
@ -124,7 +133,11 @@ public class TikaEntityProcessor extends EntityProcessorBase {
tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
}
try {
tikaParser.parse(is, contentHandler, metadata , new ParseContext());
ParseContext context = new ParseContext();
if ("identity".equals(htmlMapper)){
context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
}
tikaParser.parse(is, contentHandler, metadata , context);
} catch (Exception e) {
wrapAndThrow(SEVERE, e, "Unable to read content");
}

View File

@ -0,0 +1,29 @@
<!DOCTYPE html>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<title>Title in the header</title>
</head>
<body>
<h1>H1 Header</h1>
<div>Basic div</div>
<div class="classAttribute">Div with attribute</div>
</body>
</html>

View File

@ -55,6 +55,21 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
,"//str[@name='text']"
};
private String[] testsHTMLDefault = {
"//*[@numFound='1']"
, "//str[@name='text'][contains(.,'Basic div')]"
, "//str[@name='text'][contains(.,'<h1>')]"
, "//str[@name='text'][not(contains(.,'<div>'))]" //default mapper lower-cases elements as it maps
, "//str[@name='text'][not(contains(.,'<DIV>'))]"
};
private String[] testsHTMLIdentity = {
"//*[@numFound='1']"
, "//str[@name='text'][contains(.,'Basic div')]"
, "//str[@name='text'][contains(.,'<h1>')]"
, "//str[@name='text'][contains(.,'<div>')]"
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
};
@BeforeClass
public static void beforeClass() throws Exception {
@ -67,4 +82,36 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
assertQ(req("*:*"), tests );
}
@Test
public void testTikaHTMLMapperEmpty() throws Exception {
runFullImport(getConfigHTML(null));
assertQ(req("*:*"), testsHTMLDefault);
}
@Test
public void testTikaHTMLMapperDefault() throws Exception {
runFullImport(getConfigHTML("default"));
assertQ(req("*:*"), testsHTMLDefault);
}
@Test
public void testTikaHTMLMapperIdentity() throws Exception {
runFullImport(getConfigHTML("identity"));
assertQ(req("*:*"), testsHTMLIdentity);
}
private String getConfigHTML(String htmlMapper) {
return
"<dataConfig>" +
" <dataSource type='BinFileDataSource'/>" +
" <document>" +
" <entity name='Tika' format='xml' processor='TikaEntityProcessor' " +
" url='" + getFile("dihextras/structured.html").getAbsolutePath() + "' " +
((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper + "'")) + ">" +
" <field column='text'/>" +
" </entity>" +
" </document>" +
"</dataConfig>";
}
}