From 4da546cc291a9994ec5c1d2de87d4dcb79555e2d Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Sun, 22 Nov 2009 16:18:49 +0000 Subject: [PATCH] SOLR-1567: Upgrade to Tika 0.5 git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@883095 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/extraction/CHANGES.txt | 8 ++++++-- contrib/extraction/lib/bcmail-jdk14-136.jar | 2 -- contrib/extraction/lib/bcprov-jdk14-136.jar | 2 -- contrib/extraction/lib/commons-codec-1.3.jar | 2 -- contrib/extraction/lib/commons-io-1.4.jar | 2 -- contrib/extraction/lib/commons-lang-2.1.jar | 2 -- contrib/extraction/lib/fontbox-0.1.0.jar | 2 -- contrib/extraction/lib/fontbox-0.8.0-incubator.jar | 2 ++ .../lib/geronimo-stax-api_1.0_spec-1.0.1.jar | 2 ++ .../lib/geronimo-stax-api_1.0_spec-1.0.jar | 2 -- contrib/extraction/lib/icu4j-3.8.jar | 2 -- contrib/extraction/lib/jempbox-0.2.0.jar | 2 -- contrib/extraction/lib/jempbox-0.8.0-incubator.jar | 2 ++ .../lib/metadata-extractor-2.4.0-beta-1.jar | 2 ++ contrib/extraction/lib/nekohtml-1.9.9.jar | 2 -- contrib/extraction/lib/pdfbox-0.7.3.jar | 2 -- contrib/extraction/lib/pdfbox-0.8.0-incubating.jar | 2 ++ contrib/extraction/lib/poi-3.5-FINAL.jar | 2 ++ contrib/extraction/lib/poi-3.5-beta6.jar | 2 -- contrib/extraction/lib/poi-ooxml-3.5-FINAL.jar | 2 ++ contrib/extraction/lib/poi-ooxml-3.5-beta6.jar | 2 -- contrib/extraction/lib/poi-scratchpad-3.5-FINAL.jar | 2 ++ contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar | 2 -- contrib/extraction/lib/tagsoup-1.2.jar | 2 ++ contrib/extraction/lib/tika-core-0.4.jar | 2 -- contrib/extraction/lib/tika-core-0.5.jar | 2 ++ contrib/extraction/lib/tika-parsers-0.4.jar | 2 -- contrib/extraction/lib/tika-parsers-0.5.jar | 2 ++ .../handler/extraction/ExtractingRequestHandler.java | 12 ++---------- 29 files changed, 30 insertions(+), 44 deletions(-) delete mode 100644 contrib/extraction/lib/bcmail-jdk14-136.jar delete mode 100644 contrib/extraction/lib/bcprov-jdk14-136.jar delete mode 100644 contrib/extraction/lib/commons-codec-1.3.jar delete mode 100644 contrib/extraction/lib/commons-io-1.4.jar delete mode 100644 contrib/extraction/lib/commons-lang-2.1.jar delete mode 100644 contrib/extraction/lib/fontbox-0.1.0.jar create mode 100644 contrib/extraction/lib/fontbox-0.8.0-incubator.jar create mode 100644 contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar delete mode 100644 contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar delete mode 100644 contrib/extraction/lib/icu4j-3.8.jar delete mode 100644 contrib/extraction/lib/jempbox-0.2.0.jar create mode 100644 contrib/extraction/lib/jempbox-0.8.0-incubator.jar create mode 100644 contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar delete mode 100644 contrib/extraction/lib/nekohtml-1.9.9.jar delete mode 100644 contrib/extraction/lib/pdfbox-0.7.3.jar create mode 100644 contrib/extraction/lib/pdfbox-0.8.0-incubating.jar create mode 100644 contrib/extraction/lib/poi-3.5-FINAL.jar delete mode 100644 contrib/extraction/lib/poi-3.5-beta6.jar create mode 100644 contrib/extraction/lib/poi-ooxml-3.5-FINAL.jar delete mode 100644 contrib/extraction/lib/poi-ooxml-3.5-beta6.jar create mode 100644 contrib/extraction/lib/poi-scratchpad-3.5-FINAL.jar delete mode 100644 contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar create mode 100644 contrib/extraction/lib/tagsoup-1.2.jar delete mode 100644 contrib/extraction/lib/tika-core-0.4.jar create mode 100644 contrib/extraction/lib/tika-core-0.5.jar delete mode 100644 contrib/extraction/lib/tika-parsers-0.4.jar create mode 100644 contrib/extraction/lib/tika-parsers-0.5.jar diff --git a/contrib/extraction/CHANGES.txt b/contrib/extraction/CHANGES.txt index 5b2729bd3dd..1609b2da955 100644 --- a/contrib/extraction/CHANGES.txt +++ b/contrib/extraction/CHANGES.txt @@ -17,8 +17,12 @@ You will need Solr up and running. Then, simply add the extraction JAR file, pl to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in and configuring. - $Id:$ + +================== Release 1.5-dev ================== + +* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers) + ================== Release 1.4.0 ================== 1. SOLR-284: Added in support for extraction. (Eric Pugh, Chris Harris, gsingers) @@ -34,4 +38,4 @@ $Id:$ for discussion on language detection. See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers) -6. SOLR-1274: Added text serialization output for extractOnly (Peter Wolanin, gsingers) \ No newline at end of file +6. SOLR-1274: Added text serialization output for extractOnly (Peter Wolanin, gsingers) diff --git a/contrib/extraction/lib/bcmail-jdk14-136.jar b/contrib/extraction/lib/bcmail-jdk14-136.jar deleted file mode 100644 index 17c87e6170e..00000000000 --- a/contrib/extraction/lib/bcmail-jdk14-136.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[3f78d2a6b7154d80e06bb96441dcf4faa60518c5] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/bcprov-jdk14-136.jar b/contrib/extraction/lib/bcprov-jdk14-136.jar deleted file mode 100644 index 8e97af41f0c..00000000000 --- a/contrib/extraction/lib/bcprov-jdk14-136.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[c1cf5b3c233fabfc9765d0cb641dcb7e903ec179] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/commons-codec-1.3.jar b/contrib/extraction/lib/commons-codec-1.3.jar deleted file mode 100644 index 41a0921bf9f..00000000000 --- a/contrib/extraction/lib/commons-codec-1.3.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[957b6752af9a60c1bb2a4f65db0e90e5ce00f521] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/commons-io-1.4.jar b/contrib/extraction/lib/commons-io-1.4.jar deleted file mode 100644 index 6052eb538ab..00000000000 --- a/contrib/extraction/lib/commons-io-1.4.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[133dc6cb35f5ca2c5920fd0933a557c2def88680] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/commons-lang-2.1.jar b/contrib/extraction/lib/commons-lang-2.1.jar deleted file mode 100644 index c56ef0cba61..00000000000 --- a/contrib/extraction/lib/commons-lang-2.1.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[87b80ab5db1729662ccf3439e147430a28c36d03] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/fontbox-0.1.0.jar b/contrib/extraction/lib/fontbox-0.1.0.jar deleted file mode 100644 index 91ca9cca647..00000000000 --- a/contrib/extraction/lib/fontbox-0.1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[acb1d7b1d4e00241adba2188235bc85e51ab010c] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/fontbox-0.8.0-incubator.jar b/contrib/extraction/lib/fontbox-0.8.0-incubator.jar new file mode 100644 index 00000000000..e0b3470ce1a --- /dev/null +++ b/contrib/extraction/lib/fontbox-0.8.0-incubator.jar @@ -0,0 +1,2 @@ +AnyObjectId[91a496ac1164c08522c1e622fc39b8e991dd2d0b] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar b/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar new file mode 100644 index 00000000000..77da03b5bd6 --- /dev/null +++ b/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[ab1ee3ba605df11b3075677c808d092845dad123] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar b/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar deleted file mode 100644 index a560a688df5..00000000000 --- a/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[0d6d37423b2e2b53aba04bb09845233502619cb6] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/icu4j-3.8.jar b/contrib/extraction/lib/icu4j-3.8.jar deleted file mode 100644 index 61523bbc913..00000000000 --- a/contrib/extraction/lib/icu4j-3.8.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[26654862dc6a6ed6b9c8c4766660150d6dd0efd6] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/jempbox-0.2.0.jar b/contrib/extraction/lib/jempbox-0.2.0.jar deleted file mode 100644 index 76a0bd2eaf0..00000000000 --- a/contrib/extraction/lib/jempbox-0.2.0.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[953613cba49cc76b0562116af797c68ccf2a0a52] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/jempbox-0.8.0-incubator.jar b/contrib/extraction/lib/jempbox-0.8.0-incubator.jar new file mode 100644 index 00000000000..f01f9529a03 --- /dev/null +++ b/contrib/extraction/lib/jempbox-0.8.0-incubator.jar @@ -0,0 +1,2 @@ +AnyObjectId[adcead7737700efc2c77f7f16a83ce0c0547381a] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar b/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar new file mode 100644 index 00000000000..a396b86ca0b --- /dev/null +++ b/contrib/extraction/lib/metadata-extractor-2.4.0-beta-1.jar @@ -0,0 +1,2 @@ +AnyObjectId[3720d649dd56d96f9351435dea7c2c921a0be050] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/nekohtml-1.9.9.jar b/contrib/extraction/lib/nekohtml-1.9.9.jar deleted file mode 100644 index 00195ba2b18..00000000000 --- a/contrib/extraction/lib/nekohtml-1.9.9.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[2e0627173abfce31e41a5c6b9b06f0d6a911ae6a] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/pdfbox-0.7.3.jar b/contrib/extraction/lib/pdfbox-0.7.3.jar deleted file mode 100644 index 88d36fee995..00000000000 --- a/contrib/extraction/lib/pdfbox-0.7.3.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[9bfe670f69cbf240764b6afd74c166afc7d62256] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/pdfbox-0.8.0-incubating.jar b/contrib/extraction/lib/pdfbox-0.8.0-incubating.jar new file mode 100644 index 00000000000..f0bffd53e8c --- /dev/null +++ b/contrib/extraction/lib/pdfbox-0.8.0-incubating.jar @@ -0,0 +1,2 @@ +AnyObjectId[637324e4666d5fbf2fe29cb8151a790b1ccabcec] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-3.5-FINAL.jar b/contrib/extraction/lib/poi-3.5-FINAL.jar new file mode 100644 index 00000000000..928ae65dc1a --- /dev/null +++ b/contrib/extraction/lib/poi-3.5-FINAL.jar @@ -0,0 +1,2 @@ +AnyObjectId[7c5c5343469ce7f60f5bff6db1f3c2de51933b70] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-3.5-beta6.jar b/contrib/extraction/lib/poi-3.5-beta6.jar deleted file mode 100644 index d847acd2d71..00000000000 --- a/contrib/extraction/lib/poi-3.5-beta6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[3281c6e43309fc39dda416d47f632e10f2367c1b] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-ooxml-3.5-FINAL.jar b/contrib/extraction/lib/poi-ooxml-3.5-FINAL.jar new file mode 100644 index 00000000000..d7d72b4045f --- /dev/null +++ b/contrib/extraction/lib/poi-ooxml-3.5-FINAL.jar @@ -0,0 +1,2 @@ +AnyObjectId[2fe4979a8d8ee91a5b517a84c405fc1fcf1560cb] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar b/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar deleted file mode 100644 index 92354c8c8ea..00000000000 --- a/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[206d9920d36cbe51e1cd7c801294734ae1401505] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-scratchpad-3.5-FINAL.jar b/contrib/extraction/lib/poi-scratchpad-3.5-FINAL.jar new file mode 100644 index 00000000000..05896f003e9 --- /dev/null +++ b/contrib/extraction/lib/poi-scratchpad-3.5-FINAL.jar @@ -0,0 +1,2 @@ +AnyObjectId[b220b35818d54fa7c6efae42821754c54c358293] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar b/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar deleted file mode 100644 index cd9ed02aa04..00000000000 --- a/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[c1a27b5fef037f0d81a115a8ef377b95356bbbed] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tagsoup-1.2.jar b/contrib/extraction/lib/tagsoup-1.2.jar new file mode 100644 index 00000000000..95267dcb1d0 --- /dev/null +++ b/contrib/extraction/lib/tagsoup-1.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[af27803ec117e6ec643b8522e266481253b35fe3] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-core-0.4.jar b/contrib/extraction/lib/tika-core-0.4.jar deleted file mode 100644 index 6f8fb822caa..00000000000 --- a/contrib/extraction/lib/tika-core-0.4.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[32b269484b5fa5f7e695e66346df089cfb740780] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-core-0.5.jar b/contrib/extraction/lib/tika-core-0.5.jar new file mode 100644 index 00000000000..cd5e66b2169 --- /dev/null +++ b/contrib/extraction/lib/tika-core-0.5.jar @@ -0,0 +1,2 @@ +AnyObjectId[1adcb48b593b05ebdd77e99d79e9600f24cac99c] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-parsers-0.4.jar b/contrib/extraction/lib/tika-parsers-0.4.jar deleted file mode 100644 index aae708c2528..00000000000 --- a/contrib/extraction/lib/tika-parsers-0.4.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[8451a7cc91b2c3c25429446d0d3adf89af2e31c8] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-parsers-0.5.jar b/contrib/extraction/lib/tika-parsers-0.5.jar new file mode 100644 index 00000000000..0b7e57a13b8 --- /dev/null +++ b/contrib/extraction/lib/tika-parsers-0.5.jar @@ -0,0 +1,2 @@ +AnyObjectId[0928f3cdf4b2d077b1f1f874c2d15c796e9d73c4] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 0d870366d31..32cb7b3a13c 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -76,11 +76,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement throw new SolrException(ErrorCode.SERVER_ERROR, e); } } else { - try { - config = TikaConfig.getDefaultConfig(); - } catch (TikaException e) { - throw new SolrException(ErrorCode.SERVER_ERROR, e); - } + config = TikaConfig.getDefaultConfig(); } NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS); if (configDateFormats != null && configDateFormats.size() > 0) { @@ -92,11 +88,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement } } } else { - try { - config = TikaConfig.getDefaultConfig(); - } catch (TikaException e) { - throw new SolrException(ErrorCode.SERVER_ERROR, e); - } + config = TikaConfig.getDefaultConfig(); } factory = createFactory(); }