diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 0951352a0fb..4e0e6af2ab8 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -106,6 +106,11 @@ Bug Fixes * SOLR-5107: Fixed NPE when using numTerms=0 in LukeRequestHandler (Ahmet Arslan, hossman) +* SOLR-4679, SOLR-4908, SOLR-5124: Text extracted from HTML or PDF files + using Solr Cell was missing ignorable whitespace, which is inserted by + TIKA for convenience to support plain text extraction without using the + HTML elements. This bug resulted in glued words. (hossman, Uwe Schindler) + Optimizations ---------------------- diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java index ad84ad54d88..f0cc5d0a57e 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -303,6 +303,13 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara bldrStack.getLast().append(chars, offset, length); } + /** + * Treats ignorable whitespace the same as any other characters by delegating to {@code characters(chars, offset, length)} with the same arguments + */ + @Override + public void ignorableWhitespace(char[] chars, int offset, int length) throws SAXException { + characters(chars, offset, length); + } /** * Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField} diff --git a/solr/contrib/extraction/src/test-files/extraction/simple.html b/solr/contrib/extraction/src/test-files/extraction/simple.html index f33cf92677e..656b656b6ab 100644 --- a/solr/contrib/extraction/src/test-files/extraction/simple.html +++ b/solr/contrib/extraction/src/test-files/extraction/simple.html @@ -6,6 +6,7 @@
Here is some text
+distinct
words