SOLR-6856: Restore ExtractingRequestHandler's ability to capture all HTML tags when parsing (X)HTML.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1654431 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2015-01-23 23:44:45 +00:00
parent ff4e2c66e0
commit 6dcfa17adc
4 changed files with 85 additions and 14 deletions

View File

@ -536,6 +536,9 @@ Bug Fixes
* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
(shalin)
* SOLR-6856: Restore ExtractingRequestHandler's ability to capture all HTML tags when
parsing (X)HTML. (hossman, ehatcher, Steve Rowe)
Optimizations
----------------------

View File

@ -45,6 +45,7 @@ import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
@ -199,6 +200,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
try{
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
// Password handling
RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
@ -250,4 +252,34 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
}
}
}
/**
 * An {@link HtmlMapper} that lets every HTML element and attribute through,
 * lowercasing their names, with the single exception of &lt;BR&gt; elements.
 * <p>
 * The loader registers {@link #INSTANCE} in the Tika {@code ParseContext} so that
 * elements requested for capture are not dropped before Solr sees them.
 */
public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
  /** Shared instance; the mapper declares no instance fields, so one is enough. */
  public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();

  /**
   * Keep all elements and their content.
   * <p>
   * Apparently &lt;SCRIPT&gt; and &lt;STYLE&gt; elements are blocked elsewhere.
   *
   * @param name the element name as reported by the parser
   * @return always {@code false}
   */
  @Override
  public boolean isDiscardElement(String name) {
    return false;
  }

  /**
   * Lowercases the attribute name.
   *
   * @param elementName the name of the element owning the attribute (unused)
   * @param attributeName the attribute name as reported by the parser
   * @return the attribute name lowercased with {@link Locale#ROOT}, matching
   *         the casing rule used by {@link #mapSafeElement(String)}
   */
  @Override
  public String mapSafeAttribute(String elementName, String attributeName) {
    return attributeName.toLowerCase(Locale.ROOT);
  }

  /**
   * Lowercases the element name, but returns null for &lt;BR&gt;,
   * which suppresses the start-element event for &lt;BR&gt; tags.
   *
   * @param name the element name as reported by the parser
   * @return the lowercased element name, or {@code null} when it is {@code "br"}
   */
  @Override
  public String mapSafeElement(String name) {
    String lowerName = name.toLowerCase(Locale.ROOT);
    return "br".equals(lowerName) ? null : lowerName;
  }
}
}

View File

@ -1,6 +1,9 @@
<html>
<head>
<title>Welcome to Solr</title>
<style type="text/css">
body { font-family: serif; }
</style>
</head>
<body>
<p>
@ -10,4 +13,7 @@
<div>Here is some text in a div</div>
<div>This has a <a href="http://www.apache.org">link</a>.</div>
</body>
<script>
document.getElementById("div").blur();
</script>
</html>

View File

@ -111,6 +111,8 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
//assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded
// load again in the exact same way, but boost one field
loadLocal("extraction/simple.html",
@ -127,16 +129,6 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']");
assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix
// test capture
loadLocal("extraction/simple.html",
"literal.id","simple4",
"uprefix", "t_",
"capture","p", // capture only what is in the title element
"commit", "true"
);
assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']");
assertQ(req("+id:simple4 +t_p:\"here is some text\""), "//*[@numFound='1']");
loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
@ -197,6 +189,33 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
);
}
@Test
public void testCapture() throws Exception {
  // Both scenarios index the same fixture and expect exactly one hit per query.
  final String fixture = "extraction/simple.html";
  final String exactlyOne = "//*[@numFound='1']";

  // Scenario 1: capture a single element type and remap it with fmap.div.
  loadLocal(fixture,
      "literal.id", "capture1",
      "uprefix", "t_",
      "capture", "div",
      "fmap.div", "foo_t",
      "commit", "true");
  final String firstDoc = "+id:capture1 ";
  assertQ(req(firstDoc + "+t_content:Solr"), exactlyOne);
  assertQ(req(firstDoc + "+foo_t:\"here is some text in a div\""), exactlyOne);

  // Scenario 2: capture two element types with captureAttr enabled,
  // each remapped to its own field.
  loadLocal(fixture,
      "literal.id", "capture2",
      "captureAttr", "true",
      "defaultField", "text",
      "fmap.div", "div_t",
      "fmap.a", "anchor_t",
      "capture", "div",
      "capture", "a",
      "commit", "true");
  final String secondDoc = "+id:capture2 ";
  assertQ(req(secondDoc + "+text:Solr"), exactlyOne);
  assertQ(req(secondDoc + "+div_t:\"here is some text in a div\""), exactlyOne);
  // The anchor field is expected to match both the href value and the link text.
  assertQ(req(secondDoc + "+anchor_t:http\\://www.apache.org"), exactlyOne);
  assertQ(req(secondDoc + "+anchor_t:link"), exactlyOne);
}
@Test
public void testDefaultField() throws Exception {
@ -476,14 +495,25 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
SolrQueryResponse rsp = loadLocal("extraction/example.html",
ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant:node()",
ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
ExtractingParams.EXTRACT_ONLY, "true"
);
assertTrue("rsp is null and it shouldn't be", rsp != null);
NamedList list = rsp.getValues();
String val = (String) list.get("example.html");
val = val.trim();
assertTrue(val + " is not equal to " + "linkNews", val.equals("linkNews") == true);//there are two <a> tags, and they get collapesd
assertEquals("News", val.trim()); //there is only one matching <a> tag
loadLocal("extraction/example.html",
"literal.id", "example1",
"captureAttr", "true",
"defaultField", "text",
"capture", "div",
"fmap.div", "foo_t",
"boost.foo_t", "3",
"xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
"commit", "true"
);
assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
}
/** test arabic PDF extraction is functional */