From ef762e604669b52d82638871ad2fb3ad214ce24c Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Tue, 30 Nov 2010 22:33:30 +0000 Subject: [PATCH] SOLR-2241: upgrade to Tika 0.8 git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1040815 13f79535-47bb-0310-9956-ffa450edef68 --- solr/contrib/extraction/CHANGES.txt | 6 +++--- solr/contrib/extraction/lib/boilerpipe-1.1.0.jar | 2 ++ .../extraction/lib/commons-compress-1.0.jar | 2 -- .../extraction/lib/commons-compress-1.1.jar | 2 ++ solr/contrib/extraction/lib/fontbox-1.1.0.jar | 2 -- solr/contrib/extraction/lib/fontbox-1.3.1.jar | 2 ++ solr/contrib/extraction/lib/jempbox-1.1.0.jar | 2 -- solr/contrib/extraction/lib/jempbox-1.3.1.jar | 2 ++ solr/contrib/extraction/lib/netcdf-4.2.jar | 2 ++ solr/contrib/extraction/lib/pdfbox-1.1.0.jar | 2 -- solr/contrib/extraction/lib/pdfbox-1.3.1.jar | 2 ++ solr/contrib/extraction/lib/poi-3.6.jar | 2 -- solr/contrib/extraction/lib/poi-3.7.jar | 2 ++ solr/contrib/extraction/lib/poi-ooxml-3.6.jar | 2 -- solr/contrib/extraction/lib/poi-ooxml-3.7.jar | 2 ++ .../extraction/lib/poi-ooxml-schemas-3.6.jar | 2 -- .../extraction/lib/poi-ooxml-schemas-3.7.jar | 2 ++ solr/contrib/extraction/lib/poi-scratchpad-3.6.jar | 2 -- solr/contrib/extraction/lib/poi-scratchpad-3.7.jar | 2 ++ solr/contrib/extraction/lib/rome-0.9.jar | 2 ++ .../extraction/lib/tika-core-0.8-SNAPSHOT.jar | 2 -- solr/contrib/extraction/lib/tika-core-0.8.jar | 2 ++ .../extraction/lib/tika-parsers-0.8-SNAPSHOT.jar | 2 -- solr/contrib/extraction/lib/tika-parsers-0.8.jar | 2 ++ .../extraction/ExtractingDocumentLoader.java | 4 +++- .../solr/handler/ExtractingRequestHandlerTest.java | 14 ++++++++++---- 26 files changed, 42 insertions(+), 28 deletions(-) create mode 100644 solr/contrib/extraction/lib/boilerpipe-1.1.0.jar delete mode 100644 solr/contrib/extraction/lib/commons-compress-1.0.jar create mode 100644 solr/contrib/extraction/lib/commons-compress-1.1.jar delete mode 100644 solr/contrib/extraction/lib/fontbox-1.1.0.jar create mode 100644 solr/contrib/extraction/lib/fontbox-1.3.1.jar delete mode 100644 solr/contrib/extraction/lib/jempbox-1.1.0.jar create mode 100644 solr/contrib/extraction/lib/jempbox-1.3.1.jar create mode 100644 solr/contrib/extraction/lib/netcdf-4.2.jar delete mode 100644 solr/contrib/extraction/lib/pdfbox-1.1.0.jar create mode 100644 solr/contrib/extraction/lib/pdfbox-1.3.1.jar delete mode 100644 solr/contrib/extraction/lib/poi-3.6.jar create mode 100644 solr/contrib/extraction/lib/poi-3.7.jar delete mode 100644 solr/contrib/extraction/lib/poi-ooxml-3.6.jar create mode 100644 solr/contrib/extraction/lib/poi-ooxml-3.7.jar delete mode 100644 solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar create mode 100644 solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar delete mode 100644 solr/contrib/extraction/lib/poi-scratchpad-3.6.jar create mode 100644 solr/contrib/extraction/lib/poi-scratchpad-3.7.jar create mode 100644 solr/contrib/extraction/lib/rome-0.9.jar delete mode 100644 solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar create mode 100644 solr/contrib/extraction/lib/tika-core-0.8.jar delete mode 100644 solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar create mode 100644 solr/contrib/extraction/lib/tika-parsers-0.8.jar diff --git a/solr/contrib/extraction/CHANGES.txt b/solr/contrib/extraction/CHANGES.txt index 0a404691e3b..86d23bda34c 100644 --- a/solr/contrib/extraction/CHANGES.txt +++ b/solr/contrib/extraction/CHANGES.txt @@ -20,13 +20,13 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ Tika Dependency --------------- -Current Version: Tika 0.8-SNAPSHOT (rev 942725) +Current Version: Tika 0.8 (released 11/07/2010) $Id:$ -================== Release 1.5-dev ================== - +================== Release 3.1-dev ================== +* Upgraded to Tika 0.8 and changed deprecated parse call * SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller) diff --git a/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar b/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar new file mode 100644 index 00000000000..ee17c0a37f8 --- /dev/null +++ b/solr/contrib/extraction/lib/boilerpipe-1.1.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[b64b033af70609338c07e2a88a5f7efcd1a84ddb] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/commons-compress-1.0.jar b/solr/contrib/extraction/lib/commons-compress-1.0.jar deleted file mode 100644 index 473e2bfa654..00000000000 --- a/solr/contrib/extraction/lib/commons-compress-1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[78d832c11c42023d4bc12077a1d9b7b5025217bc] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/commons-compress-1.1.jar b/solr/contrib/extraction/lib/commons-compress-1.1.jar new file mode 100644 index 00000000000..c47e3962b1c --- /dev/null +++ b/solr/contrib/extraction/lib/commons-compress-1.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[51baf91a2df10184a8cca5cb43f11418576743a1] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/fontbox-1.1.0.jar b/solr/contrib/extraction/lib/fontbox-1.1.0.jar deleted file mode 100644 index 96095bd61e1..00000000000 --- a/solr/contrib/extraction/lib/fontbox-1.1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[3bc5a7691d234751986dbeeca353f9ee390f1ffb] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/fontbox-1.3.1.jar b/solr/contrib/extraction/lib/fontbox-1.3.1.jar new file mode 100644 index 00000000000..9ef07f971e5 --- /dev/null +++ b/solr/contrib/extraction/lib/fontbox-1.3.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[194e1f0c6e458db0b840b1530534a199306c07d2] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/jempbox-1.1.0.jar b/solr/contrib/extraction/lib/jempbox-1.1.0.jar deleted file mode 100644 index f0db763fefe..00000000000 --- a/solr/contrib/extraction/lib/jempbox-1.1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[d2c9a0514c1c4123c815851a5643eccd3ca884c8] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/jempbox-1.3.1.jar b/solr/contrib/extraction/lib/jempbox-1.3.1.jar new file mode 100644 index 00000000000..989f389fcf3 --- /dev/null +++ b/solr/contrib/extraction/lib/jempbox-1.3.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[371c2537fc26548ca8187f426900b34d9ab8b435] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/netcdf-4.2.jar b/solr/contrib/extraction/lib/netcdf-4.2.jar new file mode 100644 index 00000000000..a26a8890e09 --- /dev/null +++ b/solr/contrib/extraction/lib/netcdf-4.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[d93af7f1688eba78bb8580e010adf1ee66ac1d40] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/pdfbox-1.1.0.jar b/solr/contrib/extraction/lib/pdfbox-1.1.0.jar deleted file mode 100644 index 11b995f7a5f..00000000000 --- a/solr/contrib/extraction/lib/pdfbox-1.1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[ed19b45098b326c42f625db2613c21a03a3ff79e] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/pdfbox-1.3.1.jar b/solr/contrib/extraction/lib/pdfbox-1.3.1.jar new file mode 100644 index 00000000000..3d016a980d9 --- /dev/null +++ b/solr/contrib/extraction/lib/pdfbox-1.3.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[c3cce64a366865316dd1e579a53e6db858166619] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-3.6.jar b/solr/contrib/extraction/lib/poi-3.6.jar deleted file mode 100644 index 804f9d9737d..00000000000 --- a/solr/contrib/extraction/lib/poi-3.6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[9972d973277def35e3749d39cf39dfa37d61f75c] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-3.7.jar b/solr/contrib/extraction/lib/poi-3.7.jar new file mode 100644 index 00000000000..f45329703f5 --- /dev/null +++ b/solr/contrib/extraction/lib/poi-3.7.jar @@ -0,0 +1,2 @@ +AnyObjectId[a08d953500f508864bb22ff1306f396d8b634c22] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-ooxml-3.6.jar b/solr/contrib/extraction/lib/poi-ooxml-3.6.jar deleted file mode 100644 index 343f2c569f2..00000000000 --- a/solr/contrib/extraction/lib/poi-ooxml-3.6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[c986646e69bef4e3cd9086eabfc67f6a200fa3d9] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-ooxml-3.7.jar b/solr/contrib/extraction/lib/poi-ooxml-3.7.jar new file mode 100644 index 00000000000..bad71afaf64 --- /dev/null +++ b/solr/contrib/extraction/lib/poi-ooxml-3.7.jar @@ -0,0 +1,2 @@ +AnyObjectId[5f36eb4e9b23409c8b266b196140975de6da3a80] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar b/solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar deleted file mode 100644 index a4a66f3976d..00000000000 --- a/solr/contrib/extraction/lib/poi-ooxml-schemas-3.6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[5b79f0246f6b9b599767586fc426b26cf28c960a] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar b/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar new file mode 100644 index 00000000000..8f593a9f7d6 --- /dev/null +++ b/solr/contrib/extraction/lib/poi-ooxml-schemas-3.7.jar @@ -0,0 +1,2 @@ +AnyObjectId[82282b542613378e3bd46c6850c6ac1e715b5f11] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-scratchpad-3.6.jar b/solr/contrib/extraction/lib/poi-scratchpad-3.6.jar deleted file mode 100644 index f261b6ad666..00000000000 --- a/solr/contrib/extraction/lib/poi-scratchpad-3.6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[1a01b2b895b560d94dd12b3fd5e46a39724e16d1] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar b/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar new file mode 100644 index 00000000000..c3415432e75 --- /dev/null +++ b/solr/contrib/extraction/lib/poi-scratchpad-3.7.jar @@ -0,0 +1,2 @@ +AnyObjectId[6fd02d419c0653c0127773ad3f22e186c03764cb] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/rome-0.9.jar b/solr/contrib/extraction/lib/rome-0.9.jar new file mode 100644 index 00000000000..796717c03be --- /dev/null +++ b/solr/contrib/extraction/lib/rome-0.9.jar @@ -0,0 +1,2 @@ +AnyObjectId[ba482aecb0d9b5a1b74d038c37a8cdde821b0258] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar b/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar deleted file mode 100644 index 99d39fe372b..00000000000 --- a/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[934d3a7a0c87fc25ffe6bdfa2774fc7ae8e5cbd8] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-core-0.8.jar b/solr/contrib/extraction/lib/tika-core-0.8.jar new file mode 100644 index 00000000000..56c34842918 --- /dev/null +++ b/solr/contrib/extraction/lib/tika-core-0.8.jar @@ -0,0 +1,2 @@ +AnyObjectId[809e47cc4fb901a2fd67c99b70952c36243a0cd2] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar b/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar deleted file mode 100644 index 7f4d6fb2c06..00000000000 --- a/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[6aba6dca7d96e30dd3c411cd0a2e28033b219767] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-parsers-0.8.jar b/solr/contrib/extraction/lib/tika-parsers-0.8.jar new file mode 100644 index 00000000000..b36aaaaf3e5 --- /dev/null +++ b/solr/contrib/extraction/lib/tika-parsers-0.8.jar @@ -0,0 +1,2 @@ +AnyObjectId[25d23ac5cf511587131a9e9ee58ad384ccf6f57c] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index b79aeff17e4..f97918be1f8 100644 --- a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -31,6 +31,7 @@ import org.apache.solr.handler.ContentStreamLoader; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.sax.xpath.Matcher; @@ -190,7 +191,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { } //else leave it as is //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document. - parser.parse(inputStream, parsingHandler, metadata); + ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context? + parser.parse(inputStream, parsingHandler, metadata, context); if (extractOnly == false) { addDoc(handler); } else { diff --git a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java index b7ef604ef61..64f6767fce7 100644 --- a/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java +++ b/solr/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java @@ -58,13 +58,15 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { @Test public void testExtraction() throws Exception { - // broken for turkish: https://issues.apache.org/jira/browse/SOLR-2088 - String defLang = Locale.getDefault().getLanguage(); - assumeFalse("Known bugs under Turkish locale: https://issues.apache.org/jira/browse/SOLR-2088", defLang.equals("tr") || defLang.equals("az")); ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); - loadLocal("solr-word.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", + loadLocal("solr-word.pdf", + "fmap.created", "extractedDate", + "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", + "fmap.Creation-Date", "extractedDate", + "fmap.AAPL:Keywords", "ignored_a", + "fmap.xmpTPg:NPages", "ignored_a", "fmap.Author", "extractedAuthor", "fmap.content", "extractedContent", "literal.id", "one", @@ -146,6 +148,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { } + @Test public void testDefaultField() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); @@ -349,6 +352,9 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", + "fmap.Creation-Date", "extractedDate", + "fmap.AAPL:Keywords", "ignored_a", + "fmap.xmpTPg:NPages", "ignored_a", "fmap.Author", "extractedAuthor", "fmap.content", "wdf_nocase", "literal.id", "one",