SOLR-1813: Added ICU4J to extraction and test for Arabic

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@921425 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2010-03-10 16:18:09 +00:00
parent 4f5166fdae
commit ca50eef4ca
4 changed files with 19 additions and 0 deletions

View File

@ -29,6 +29,7 @@ $Id:$
* SOLR-1738: Upgrade to Tika 0.6 (gsingers)
* SOLR-1813: Add ICU4J to libs and add tests for Arabic extraction (Robert Muir via gsingers)
================== Release 1.4.0 ==================

View File

@ -0,0 +1,2 @@
AnyObjectId[bf0d532cb19e6ce3972f370a13a1940d1a8d1db8] was removed in git history.
Apache SVN contains full history.

View File

@ -322,6 +322,22 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
assertTrue(val + " is not equal to " + "linkNews", val.equals("linkNews") == true);//there are two <a> tags, and they get collapesd
}
/**
 * Checks that Arabic PDF extraction is functional.
 * <p>
 * Loads {@code arabic.pdf} via {@code loadLocal} with {@code fmap.content} set to
 * {@code wdf_nocase}, then queries that field for an Arabic term: the query must match
 * no documents before the commit and exactly one document after it.
 *
 * @throws Exception if looking up the handler, loading the file, or querying fails
 */
public void testArabicPDF() throws Exception {
  // The handler itself is not used below; the lookup only proves "/update/extract" resolves.
  ExtractingRequestHandler extractor =
      (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
  assertTrue("handler is null and it shouldn't be", null != extractor);

  // Name/value pairs handed to loadLocal in the same order as before.
  // NOTE(review): both fmap.created and fmap.Last-Modified target "extractedDate" -- confirm intended.
  final String[] extractParams = {
      "fmap.created", "extractedDate",
      "fmap.producer", "extractedProducer",
      "fmap.creator", "extractedCreator",
      "fmap.Keywords", "extractedKeywords",
      "fmap.Author", "extractedAuthor",
      "fmap.content", "wdf_nocase",
      "literal.id", "one",
      "fmap.Last-Modified", "extractedDate"
  };
  loadLocal("arabic.pdf", extractParams);

  final String arabicTermQuery = "wdf_nocase:السلم";

  // Nothing should match until the commit has been issued.
  assertQ(req(arabicTermQuery), "//result[@numFound=0]");
  assertU(commit());
  assertQ(req(arabicTermQuery), "//result[@numFound=1]");
}
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);

Binary file not shown.