SOLR-2241: upgrade to Tika 0.8

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1040815 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2010-11-30 22:33:30 +00:00
parent 0511306b53
commit ef762e6046
26 changed files with 42 additions and 28 deletions

View File

@ -20,13 +20,13 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ
Tika Dependency Tika Dependency
--------------- ---------------
Current Version: Tika 0.8-SNAPSHOT (rev 942725) Current Version: Tika 0.8 (released 11/07/2010)
$Id:$ $Id:$
================== Release 1.5-dev ================== ================== Release 3.1-dev ==================
* SOLR-2241: Upgraded to Tika 0.8 and replaced the deprecated three-argument parse call with the variant that takes a ParseContext
* SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that * SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller) parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)

View File

@ -0,0 +1,2 @@
AnyObjectId[b64b033af70609338c07e2a88a5f7efcd1a84ddb] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[78d832c11c42023d4bc12077a1d9b7b5025217bc] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[51baf91a2df10184a8cca5cb43f11418576743a1] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[3bc5a7691d234751986dbeeca353f9ee390f1ffb] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[194e1f0c6e458db0b840b1530534a199306c07d2] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[d2c9a0514c1c4123c815851a5643eccd3ca884c8] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[371c2537fc26548ca8187f426900b34d9ab8b435] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[d93af7f1688eba78bb8580e010adf1ee66ac1d40] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[ed19b45098b326c42f625db2613c21a03a3ff79e] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[c3cce64a366865316dd1e579a53e6db858166619] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[9972d973277def35e3749d39cf39dfa37d61f75c] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[a08d953500f508864bb22ff1306f396d8b634c22] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[c986646e69bef4e3cd9086eabfc67f6a200fa3d9] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[5f36eb4e9b23409c8b266b196140975de6da3a80] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[5b79f0246f6b9b599767586fc426b26cf28c960a] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[82282b542613378e3bd46c6850c6ac1e715b5f11] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[1a01b2b895b560d94dd12b3fd5e46a39724e16d1] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[6fd02d419c0653c0127773ad3f22e186c03764cb] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[ba482aecb0d9b5a1b74d038c37a8cdde821b0258] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[934d3a7a0c87fc25ffe6bdfa2774fc7ae8e5cbd8] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[809e47cc4fb901a2fd67c99b70952c36243a0cd2] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[6aba6dca7d96e30dd3c411cd0a2e28033b219767] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[25d23ac5cf511587131a9e9ee58ad384ccf6f57c] was removed in git history.
Apache SVN contains full history.

View File

@ -31,6 +31,7 @@ import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.Matcher;
@ -190,7 +191,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
} //else leave it as is } //else leave it as is
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document. //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
parser.parse(inputStream, parsingHandler, metadata); ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
parser.parse(inputStream, parsingHandler, metadata, context);
if (extractOnly == false) { if (extractOnly == false) {
addDoc(handler); addDoc(handler);
} else { } else {

View File

@ -58,13 +58,15 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
@Test @Test
public void testExtraction() throws Exception { public void testExtraction() throws Exception {
// broken for turkish: https://issues.apache.org/jira/browse/SOLR-2088
String defLang = Locale.getDefault().getLanguage();
assumeFalse("Known bugs under Turkish locale: https://issues.apache.org/jira/browse/SOLR-2088", defLang.equals("tr") || defLang.equals("az"));
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null); assertTrue("handler is null and it shouldn't be", handler != null);
loadLocal("solr-word.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", loadLocal("solr-word.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"fmap.AAPL:Keywords", "ignored_a",
"fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor", "fmap.Author", "extractedAuthor",
"fmap.content", "extractedContent", "fmap.content", "extractedContent",
"literal.id", "one", "literal.id", "one",
@ -146,6 +148,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
} }
@Test @Test
public void testDefaultField() throws Exception { public void testDefaultField() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
@ -349,6 +352,9 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"fmap.AAPL:Keywords", "ignored_a",
"fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor", "fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase", "fmap.content", "wdf_nocase",
"literal.id", "one", "literal.id", "one",