diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 0951352a0fb..4e0e6af2ab8 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -106,6 +106,11 @@ Bug Fixes * SOLR-5107: Fixed NPE when using numTerms=0 in LukeRequestHandler (Ahmet Arslan, hossman) +* SOLR-4679, SOLR-4908, SOLR-5124: Text extracted from HTML or PDF files + using Solr Cell was missing ignorable whitespace, which is inserted by + TIKA for convenience to support plain text extraction without using the + HTML elements. This bug resulted in glued words. (hossman, Uwe Schindler) + Optimizations ---------------------- diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java index ad84ad54d88..f0cc5d0a57e 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -303,6 +303,13 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara bldrStack.getLast().append(chars, offset, length); } + /** + * Treats ignorable whitespace the same as any other characters, so that the whitespace Tika inserts between elements is kept in the extracted text and adjacent words are not glued together. + */ + @Override + public void ignorableWhitespace(char[] chars, int offset, int length) throws SAXException { + characters(chars, offset, length); + } /** * Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField} diff --git a/solr/contrib/extraction/src/test-files/extraction/simple.html b/solr/contrib/extraction/src/test-files/extraction/simple.html index f33cf92677e..656b656b6ab 100644 --- a/solr/contrib/extraction/src/test-files/extraction/simple.html +++ b/solr/contrib/extraction/src/test-files/extraction/simple.html @@ -6,6 +6,7 @@

Here is some text

+

distinct
words

Here is some text in a div
This has a link.
diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index dbdb370348d..eabddd6b398 100644 --- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -88,6 +88,10 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { assertU(commit()); assertQ(req("title:Welcome"), "//*[@numFound='1']"); + assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']"); + assertQ(req("extractedContent:distinct"), "//*[@numFound='1']"); + assertQ(req("extractedContent:words"), "//*[@numFound='2']"); + assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']"); loadLocal("extraction/simple.html", "literal.id","simple2",