diff --git a/contrib/extraction/CHANGES.txt b/contrib/extraction/CHANGES.txt index 1862ba9aee0..0aced5f9384 100644 --- a/contrib/extraction/CHANGES.txt +++ b/contrib/extraction/CHANGES.txt @@ -28,4 +28,9 @@ $Id:$ 3. SOLR-1075: Upgrade to Tika 0.3. See http://www.apache.org/dist/lucene/tika/CHANGES-0.3.txt (gsingers) -4. SOLR-1128: Added metadata output to "extract only" option. (gsingers) \ No newline at end of file +4. SOLR-1128: Added metadata output to "extract only" option. (gsingers) + +5. SOLR-1310: Upgrade to Tika 0.4. Note there are some differences in detecting Languages now. + See http://www.lucidimagination.com/search/document/d6f1899a85b2a45c/vote_apache_tika_0_4_release_candidate_2#d6f1899a85b2a45c + for discussion on language detection. + See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers) \ No newline at end of file diff --git a/contrib/extraction/lib/bcmail-jdk14-132.jar b/contrib/extraction/lib/bcmail-jdk14-132.jar deleted file mode 100644 index 07bab8e5fe6..00000000000 --- a/contrib/extraction/lib/bcmail-jdk14-132.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[680f8c60c1f0393f7e56595e24b29b3ceb46e933] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/bcmail-jdk14-136.jar b/contrib/extraction/lib/bcmail-jdk14-136.jar new file mode 100644 index 00000000000..17c87e6170e --- /dev/null +++ b/contrib/extraction/lib/bcmail-jdk14-136.jar @@ -0,0 +1,2 @@ +AnyObjectId[3f78d2a6b7154d80e06bb96441dcf4faa60518c5] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/bcprov-jdk14-132.jar b/contrib/extraction/lib/bcprov-jdk14-132.jar deleted file mode 100644 index ddaa0b62fd6..00000000000 --- a/contrib/extraction/lib/bcprov-jdk14-132.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[552721d0e8deb28f2909cfc5ec900a5e35736795] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/bcprov-jdk14-136.jar b/contrib/extraction/lib/bcprov-jdk14-136.jar new file mode 100644 index 00000000000..8e97af41f0c --- /dev/null +++ b/contrib/extraction/lib/bcprov-jdk14-136.jar @@ -0,0 +1,2 @@ +AnyObjectId[c1cf5b3c233fabfc9765d0cb641dcb7e903ec179] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/commons-compress-1.0.jar b/contrib/extraction/lib/commons-compress-1.0.jar new file mode 100644 index 00000000000..473e2bfa654 --- /dev/null +++ b/contrib/extraction/lib/commons-compress-1.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[78d832c11c42023d4bc12077a1d9b7b5025217bc] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/commons-logging-1.0.4.jar b/contrib/extraction/lib/commons-logging-1.0.4.jar deleted file mode 100644 index f330fde2131..00000000000 --- a/contrib/extraction/lib/commons-logging-1.0.4.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[b73a80fab641131e6fbe3ae833549efb3c540d17] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/commons-logging-1.1.1.jar b/contrib/extraction/lib/commons-logging-1.1.1.jar new file mode 100644 index 00000000000..e537a05e956 --- /dev/null +++ b/contrib/extraction/lib/commons-logging-1.1.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[1deef144cb17ed2c11c6cdcdcb2d9530fa8d0b47] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/fontbox-0.1.0-dev.jar b/contrib/extraction/lib/fontbox-0.1.0-dev.jar deleted file mode 100644 index f2c6adebdf1..00000000000 --- a/contrib/extraction/lib/fontbox-0.1.0-dev.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[c9030febd2ae484532407db9ef98247cbe61b779] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/fontbox-0.1.0.jar b/contrib/extraction/lib/fontbox-0.1.0.jar new file mode 100644 index 00000000000..91ca9cca647 --- /dev/null +++ b/contrib/extraction/lib/fontbox-0.1.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[acb1d7b1d4e00241adba2188235bc85e51ab010c] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar b/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar new file mode 100644 index 00000000000..a560a688df5 --- /dev/null +++ b/contrib/extraction/lib/geronimo-stax-api_1.0_spec-1.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[0d6d37423b2e2b53aba04bb09845233502619cb6] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/jempbox-0.2.0.jar b/contrib/extraction/lib/jempbox-0.2.0.jar new file mode 100644 index 00000000000..76a0bd2eaf0 --- /dev/null +++ b/contrib/extraction/lib/jempbox-0.2.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[953613cba49cc76b0562116af797c68ccf2a0a52] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/pdfbox-0.7.3.jar b/contrib/extraction/lib/pdfbox-0.7.3.jar index 0d58df0a3e4..88d36fee995 100644 --- a/contrib/extraction/lib/pdfbox-0.7.3.jar +++ b/contrib/extraction/lib/pdfbox-0.7.3.jar @@ -1,2 +1,2 @@ -AnyObjectId[f821d644766c4d5c95e53db4b83cc6cb37b553f6] was removed in git history. +AnyObjectId[9bfe670f69cbf240764b6afd74c166afc7d62256] was removed in git history. Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-3.5-beta5.jar b/contrib/extraction/lib/poi-3.5-beta5.jar deleted file mode 100644 index ca9c3ec8b93..00000000000 --- a/contrib/extraction/lib/poi-3.5-beta5.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[0e3279b961c07d4d4ed909dade956aacfbf1e785] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-3.5-beta6.jar b/contrib/extraction/lib/poi-3.5-beta6.jar new file mode 100644 index 00000000000..d847acd2d71 --- /dev/null +++ b/contrib/extraction/lib/poi-3.5-beta6.jar @@ -0,0 +1,2 @@ +AnyObjectId[3281c6e43309fc39dda416d47f632e10f2367c1b] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar b/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar new file mode 100644 index 00000000000..92354c8c8ea --- /dev/null +++ b/contrib/extraction/lib/poi-ooxml-3.5-beta6.jar @@ -0,0 +1,2 @@ +AnyObjectId[206d9920d36cbe51e1cd7c801294734ae1401505] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-scratchpad-3.5-beta5.jar b/contrib/extraction/lib/poi-scratchpad-3.5-beta5.jar deleted file mode 100644 index b25ec0fa647..00000000000 --- a/contrib/extraction/lib/poi-scratchpad-3.5-beta5.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[6877c97853cafab42e5c0a88f3b39b6cd31165ca] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar b/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar new file mode 100644 index 00000000000..cd9ed02aa04 --- /dev/null +++ b/contrib/extraction/lib/poi-scratchpad-3.5-beta6.jar @@ -0,0 +1,2 @@ +AnyObjectId[c1a27b5fef037f0d81a115a8ef377b95356bbbed] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-0.3.jar b/contrib/extraction/lib/tika-0.3.jar deleted file mode 100644 index b668cdddf59..00000000000 --- a/contrib/extraction/lib/tika-0.3.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[4dd90139eda215b4489ae99c1eb6cb97b953c884] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-core-0.4.jar b/contrib/extraction/lib/tika-core-0.4.jar new file mode 100644 index 00000000000..6f8fb822caa --- /dev/null +++ b/contrib/extraction/lib/tika-core-0.4.jar @@ -0,0 +1,2 @@ +AnyObjectId[32b269484b5fa5f7e695e66346df089cfb740780] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-parsers-0.4.jar b/contrib/extraction/lib/tika-parsers-0.4.jar new file mode 100644 index 00000000000..aae708c2528 --- /dev/null +++ b/contrib/extraction/lib/tika-parsers-0.4.jar @@ -0,0 +1,2 @@ +AnyObjectId[8451a7cc91b2c3c25429446d0d3adf89af2e31c8] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java index 422fb1f2490..fef024133a6 100644 --- a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java +++ b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java @@ -79,8 +79,9 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { "literal.id","simple2", "uprefix", "t_", "lowernames", "true", - "captureAttr", "true", "map.a","t_href", - "map.content_language", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping + "captureAttr", "true", + "map.a","t_href", + "map.content_type", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping "commit", "true" // test immediate commit ); @@ -88,7 +89,7 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']"); // test both lowernames and unknown field mapping - assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']"); + //assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']"); assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']"); assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']"); @@ -98,7 +99,6 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { "uprefix", "t_", "lowernames", "true", "captureAttr", "true", "map.a","t_href", - "map.content_language", "abcxyz", "commit", "true" ,"boost.t_href", "100.0" @@ -106,6 +106,7 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { assertQ(req("t_href:http"), "//*[@numFound='2']"); assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']"); + assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix // test capture loadLocal("simple.html",