SOLR-6856: Restore ExtractingRequestHandler's ability to capture all HTML tags when parsing (X)HTML.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1654431 13f79535-47bb-0310-9956-ffa450edef68
2015-01-23 23:44:45 +00:00 · 2015-01-23 23:44:45 +00:00 · 6dcfa17adc
parent ff4e2c66e0
commit 6dcfa17adc
4 changed files with 85 additions and 14 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -536,6 +536,9 @@ Bug Fixes

 * SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
  (shalin)
+  
+* SOLR-6856: Restore ExtractingRequestHandler's ability to capture all HTML tags when
+  parsing (X)HTML. (hossman, ehatcher, Steve Rowe)

 Optimizations
 ----------------------
--- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
+++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
@ -45,6 +45,7 @@ import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.html.HtmlMapper;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
@ -199,6 +200,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
        try{
          //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
@ -250,4 +252,34 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
  }
-}
+
+  /**
+   * {@link HtmlMapper} that discards no elements and maps element and attribute names to
+   * lowercase. Apparently &lt;SCRIPT&gt; and &lt;STYLE&gt; elements are blocked elsewhere.
+   */
+  public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
+    public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();
+
+    /** Keeps all elements and their content: always returns {@code false}. */
+    @Override
+    public boolean isDiscardElement(String name) {
+      return false;
+    }
+
+    /** Lowercases the attribute name (locale-independent, matching {@link #mapSafeElement}). */
+    @Override
+    public String mapSafeAttribute(String elementName, String attributeName) {
+      return attributeName.toLowerCase(Locale.ROOT);
+    }
+
+    /**
+     * Lowercases the element name, but returns {@code null} for &lt;BR&gt;,
+     * which suppresses the start-element event for &lt;BR&gt; tags.
+     */
+    @Override
+    public String mapSafeElement(String name) {
+      String lowerName = name.toLowerCase(Locale.ROOT);
+      return lowerName.equals("br") ? null : lowerName;
+    }
+  }
+ }
--- a/solr/contrib/extraction/src/test-files/extraction/simple.html
+++ b/solr/contrib/extraction/src/test-files/extraction/simple.html
@ -1,6 +1,9 @@
 <html>
 <head>
  <title>Welcome to Solr</title>
+  <style type="text/css">
+    body { font-family: serif; }
+  </style>
 </head>
 <body>
 <p>
@ -10,4 +13,7 @@
 <div>Here is some text in a div</div>
 <div>This has a <a href="http://www.apache.org">link</a>.</div>
 </body>
+<script>
+  document.getElementById("div").blur();
+</script>
 </html>
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
+++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
@ -111,6 +111,8 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
    //assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
    assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
    assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
+    assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded
+    assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded

    // load again in the exact same way, but boost one field
    loadLocal("extraction/simple.html",
@ -127,16 +129,6 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
    assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']");
    assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix

-    // test capture
-     loadLocal("extraction/simple.html",
-      "literal.id","simple4",
-      "uprefix", "t_",
-      "capture","p",     // capture only what is in the title element
-      "commit", "true"
-    );
-    assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']");
-    assertQ(req("+id:simple4 +t_p:\"here is some text\""), "//*[@numFound='1']");
-
    loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
            "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
            "fmap.Author", "extractedAuthor",
@ -197,6 +189,33 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
            );
  }

+  /**
+   * Loads simple.html twice with "capture" params and checks the fmap-mapped fields are searchable.
+   */
+  @Test
+  public void testCapture() throws Exception {
+    final String html = "extraction/simple.html";
+    final String oneHit = "//*[@numFound='1']";
+
+    // First load: capture <div>, mapped to foo_t via fmap.div; uprefix is t_.
+    loadLocal(html,
+        "literal.id", "capture1", "uprefix", "t_",
+        "capture", "div", "fmap.div", "foo_t",
+        "commit", "true");
+    assertQ(req("+id:capture1 +t_content:Solr"), oneHit);
+    assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), oneHit);
+
+    // Second load: capture both <div> and <a> with captureAttr enabled and a default field.
+    loadLocal(html,
+        "literal.id", "capture2", "captureAttr", "true", "defaultField", "text",
+        "fmap.div", "div_t", "fmap.a", "anchor_t",
+        "capture", "div", "capture", "a",
+        "commit", "true");
+    assertQ(req("+id:capture2 +text:Solr"), oneHit);
+    assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), oneHit);
+    assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), oneHit);
+    assertQ(req("+id:capture2 +anchor_t:link"), oneHit);
+  }

  @Test
  public void testDefaultField() throws Exception {
@ -476,14 +495,25 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
    assertTrue("handler is null and it shouldn't be", handler != null);
    SolrQueryResponse rsp = loadLocal("extraction/example.html",
-            ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant:node()",
+            ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()",
            ExtractingParams.EXTRACT_ONLY, "true"
    );
    assertTrue("rsp is null and it shouldn't be", rsp != null);
    NamedList list = rsp.getValues();
    String val = (String) list.get("example.html");
-    val = val.trim();
-    assertTrue(val + " is not equal to " + "linkNews", val.equals("linkNews") == true);//there are two <a> tags, and they get collapesd
+    assertEquals("News", val.trim()); //there is only one matching <a> tag
+
+    loadLocal("extraction/example.html",
+        "literal.id", "example1",
+        "captureAttr", "true",
+        "defaultField", "text",
+        "capture", "div",
+        "fmap.div", "foo_t",
+        "boost.foo_t", "3",
+        "xpath", "/xhtml:html/xhtml:body/xhtml:div//node()",
+        "commit", "true"
+    );
+    assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']");
  }

  /** test arabic PDF extraction is functional */