mirror of https://github.com/apache/lucene.git
SOLR-6856: Restore ExtractingRequestHandler's ability to capture all HTML tags when parsing (X)HTML.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1654431 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ff4e2c66e0
commit
6dcfa17adc
|
@ -536,6 +536,9 @@ Bug Fixes
|
|||
|
||||
* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
|
||||
(shalin)
|
||||
|
||||
* SOLR-6856: Restore ExtractingRequestHandler's ability to capture all HTML tags when
|
||||
parsing (X)HTML. (hossman, ehatcher, Steve Rowe)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
|
|
@ -45,6 +45,7 @@ import org.apache.tika.parser.DefaultParser;
|
|||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.PasswordProvider;
|
||||
import org.apache.tika.parser.html.HtmlMapper;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
import org.apache.tika.sax.xpath.Matcher;
|
||||
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||
|
@ -199,6 +200,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
try{
|
||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
||||
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
|
||||
|
||||
// Password handling
|
||||
RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
|
||||
|
@ -250,4 +252,34 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
|
||||
public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();
|
||||
|
||||
/**
|
||||
* Keep all elements and their content.
|
||||
*
|
||||
* Apparently <SCRIPT> and <STYLE> elements are blocked elsewhere
|
||||
*/
|
||||
@Override
|
||||
public boolean isDiscardElement(String name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Lowercases the attribute name */
|
||||
@Override
|
||||
public String mapSafeAttribute(String elementName, String attributeName) {
|
||||
return attributeName.toLowerCase(Locale.ENGLISH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Lowercases the element name, but returns null for <BR>,
|
||||
* which suppresses the start-element event for lt;BR> tags.
|
||||
*/
|
||||
@Override
|
||||
public String mapSafeElement(String name) {
|
||||
String lowerName = name.toLowerCase(Locale.ROOT);
|
||||
return lowerName.equals("br") ? null : lowerName;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,6 +1,9 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>Welcome to Solr</title>
|
||||
<style type="text/css">
|
||||
body { font-family: serif; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
|
@ -10,4 +13,7 @@
|
|||
<div>Here is some text in a div</div>
|
||||
<div>This has a <a href="http://www.apache.org">link</a>.</div>
|
||||
</body>
|
||||
<script>
|
||||
document.getElementById("div").blur();
|
||||
</script>
|
||||
</html>
|
||||
|
|
|
@ -111,6 +111,8 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
//assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
|
||||
assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded
|
||||
|
||||
// load again in the exact same way, but boost one field
|
||||
loadLocal("extraction/simple.html",
|
||||
|
@ -127,16 +129,6 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']");
|
||||
assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix
|
||||
|
||||
// test capture
|
||||
loadLocal("extraction/simple.html",
|
||||
"literal.id","simple4",
|
||||
"uprefix", "t_",
|
||||
"capture","p", // capture only what is in the title element
|
||||
"commit", "true"
|
||||
);
|
||||
assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:simple4 +t_p:\"here is some text\""), "//*[@numFound='1']");
|
||||
|
||||
loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
|
@ -197,6 +189,33 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCapture() throws Exception {
|
||||
loadLocal("extraction/simple.html",
|
||||
"literal.id","capture1",
|
||||
"uprefix","t_",
|
||||
"capture","div",
|
||||
"fmap.div", "foo_t",
|
||||
"commit", "true"
|
||||
);
|
||||
assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
|
||||
|
||||
loadLocal("extraction/simple.html",
|
||||
"literal.id", "capture2",
|
||||
"captureAttr", "true",
|
||||
"defaultField", "text",
|
||||
"fmap.div", "div_t",
|
||||
"fmap.a", "anchor_t",
|
||||
"capture", "div",
|
||||
"capture", "a",
|
||||
"commit", "true"
|
||||
);
|
||||
assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), "//*[@numFound='1']");
|
||||
assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:capture2 +anchor_t:link"), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDefaultField() throws Exception {
|
||||
|
@ -476,14 +495,25 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
|
||||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
SolrQueryResponse rsp = loadLocal("extraction/example.html",
|
||||
ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant:node()",
|
||||
ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
|
||||
ExtractingParams.EXTRACT_ONLY, "true"
|
||||
);
|
||||
assertTrue("rsp is null and it shouldn't be", rsp != null);
|
||||
NamedList list = rsp.getValues();
|
||||
String val = (String) list.get("example.html");
|
||||
val = val.trim();
|
||||
assertTrue(val + " is not equal to " + "linkNews", val.equals("linkNews") == true);//there are two <a> tags, and they get collapesd
|
||||
assertEquals("News", val.trim()); //there is only one matching <a> tag
|
||||
|
||||
loadLocal("extraction/example.html",
|
||||
"literal.id", "example1",
|
||||
"captureAttr", "true",
|
||||
"defaultField", "text",
|
||||
"capture", "div",
|
||||
"fmap.div", "foo_t",
|
||||
"boost.foo_t", "3",
|
||||
"xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
|
||||
"commit", "true"
|
||||
);
|
||||
assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
/** test arabic PDF extraction is functional */
|
||||
|
|
Loading…
Reference in New Issue