SOLR-4679, SOLR-4908, SOLR-5124: Text extracted from HTML or PDF files using Solr Cell was missing ignorable whitespace, which is inserted by TIKA for convenience to support plain text extraction without using the HTML elements. This bug resulted in glued words.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512296 13f79535-47bb-0310-9956-ffa450edef68
2013-08-09 13:26:55 +00:00 · 2013-08-09 13:26:55 +00:00 · 57acbcfd00
parent 25bd977a3c
commit 57acbcfd00
4 changed files with 17 additions and 0 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -106,6 +106,11 @@ Bug Fixes
 * SOLR-5107: Fixed NPE when using numTerms=0 in LukeRequestHandler
  (Ahmet Arslan, hossman)

+* SOLR-4679, SOLR-4908, SOLR-5124: Text extracted from HTML or PDF files
+  using Solr Cell was missing ignorable whitespace, which is inserted by
+  TIKA for convenience to support plain text extraction without using the
+  HTML elements. This bug resulted in glued words.  (hossman, Uwe Schindler)
+
 Optimizations
 ----------------------

--- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
+++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
@ -303,6 +303,13 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
    bldrStack.getLast().append(chars, offset, length);
  }

+  /**
+   * Treat the same as any other characters
+   */
+  @Override
+  public void ignorableWhitespace(char[] chars, int offset, int length) throws SAXException {
+    characters(chars, offset, length);
+  }

  /**
   * Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField}
--- a/solr/contrib/extraction/src/test-files/extraction/simple.html
+++ b/solr/contrib/extraction/src/test-files/extraction/simple.html
@ -6,6 +6,7 @@
 <p>
  Here is some text
 </p>
+<p>distinct<br/>words</p>
 <div>Here is some text in a div</div>
 <div>This has a <a href="http://www.apache.org">link</a>.</div>
 </body>
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
+++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
@ -88,6 +88,10 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
    assertU(commit());
    assertQ(req("title:Welcome"), "//*[@numFound='1']");

+    assertQ(req("extractedContent:distinctwords"),      "//*[@numFound='0']");
+    assertQ(req("extractedContent:distinct"),           "//*[@numFound='1']");
+    assertQ(req("extractedContent:words"),              "//*[@numFound='2']");
+    assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");

    loadLocal("extraction/simple.html",
      "literal.id","simple2",