mirror of https://github.com/apache/lucene.git
SOLR-4679, SOLR-4908, SOLR-5124: Text extracted from HTML or PDF files using Solr Cell was missing ignorable whitespace, which is inserted by TIKA for convenience to support plain text extraction without using the HTML elements. This bug resulted in glued words.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512296 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
25bd977a3c
commit
57acbcfd00
|
@ -106,6 +106,11 @@ Bug Fixes
|
|||
* SOLR-5107: Fixed NPE when using numTerms=0 in LukeRequestHandler
|
||||
(Ahmet Arslan, hossman)
|
||||
|
||||
* SOLR-4679, SOLR-4908, SOLR-5124: Text extracted from HTML or PDF files
|
||||
using Solr Cell was missing ignorable whitespace, which is inserted by
|
||||
TIKA for convenience to support plain text extraction without using the
|
||||
HTML elements. This bug resulted in glued words. (hossman, Uwe Schindler)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -303,6 +303,13 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
bldrStack.getLast().append(chars, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Treat ignorable whitespace the same as any other characters by forwarding it to characters(), so that words separated only by such whitespace are not glued together in the extracted text
|
||||
*/
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] chars, int offset, int length) throws SAXException {
|
||||
characters(chars, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField}
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
<p>
|
||||
Here is some text
|
||||
</p>
|
||||
<p>distinct<br/>words</p>
|
||||
<div>Here is some text in a div</div>
|
||||
<div>This has a <a href="http://www.apache.org">link</a>.</div>
|
||||
</body>
|
||||
|
|
|
@ -88,6 +88,10 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
assertU(commit());
|
||||
assertQ(req("title:Welcome"), "//*[@numFound='1']");
|
||||
|
||||
assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']");
|
||||
assertQ(req("extractedContent:distinct"), "//*[@numFound='1']");
|
||||
assertQ(req("extractedContent:words"), "//*[@numFound='2']");
|
||||
assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
|
||||
|
||||
loadLocal("extraction/simple.html",
|
||||
"literal.id","simple2",
|
||||
|
|
Loading…
Reference in New Issue