From 8f550126f745f7c01e6129c2ca860978d76db07f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 25 Jan 2012 14:18:06 +0000 Subject: [PATCH] SOLR-2901: Upgrade Solr to Tika 1.0 git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1235753 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/eclipse/dot.classpath | 6 ++--- dev-tools/maven/pom.xml.template | 2 +- solr/CHANGES.txt | 4 ++- .../dataimport/MailEntityProcessor.java | 13 ++++++--- .../dataimport/TikaEntityProcessor.java | 4 +-- solr/contrib/extraction/CHANGES.txt | 4 ++- .../extraction/lib/commons-compress-1.2.jar | 2 -- .../extraction/lib/commons-compress-1.3.jar | 2 ++ .../contrib/extraction/lib/tika-core-0.10.jar | 2 -- solr/contrib/extraction/lib/tika-core-1.0.jar | 2 ++ .../extraction/lib/tika-parsers-0.10.jar | 2 -- .../extraction/lib/tika-parsers-1.0.jar | 2 ++ .../extraction/ExtractingDocumentLoader.java | 7 ++++- .../ExtractingRequestHandlerTest.java | 27 ++++++++++++++++++- ...ntifierUpdateProcessorFactoryTestCase.java | 12 ++++++++- 15 files changed, 70 insertions(+), 21 deletions(-) delete mode 100755 solr/contrib/extraction/lib/commons-compress-1.2.jar create mode 100644 solr/contrib/extraction/lib/commons-compress-1.3.jar delete mode 100644 solr/contrib/extraction/lib/tika-core-0.10.jar create mode 100644 solr/contrib/extraction/lib/tika-core-1.0.jar delete mode 100644 solr/contrib/extraction/lib/tika-parsers-0.10.jar create mode 100644 solr/contrib/extraction/lib/tika-parsers-1.0.jar diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath index f95959a1ada..0e901c1dae2 100644 --- a/dev-tools/eclipse/dot.classpath +++ b/dev-tools/eclipse/dot.classpath @@ -96,7 +96,7 @@ - + @@ -149,8 +149,8 @@ - - + + diff --git a/dev-tools/maven/pom.xml.template b/dev-tools/maven/pom.xml.template index c1f3a0faf83..5c62f5f2289 100644 --- a/dev-tools/maven/pom.xml.template +++ b/dev-tools/maven/pom.xml.template @@ -45,7 +45,7 @@ 6.1.26 
6.1.26-patched-JETTY-1340 1.6.1 - 0.10 + 1.0 JIRA diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 997b79ebedf..0ef90c7a5ed 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -24,7 +24,7 @@ $Id$ ================== 4.0.0-dev ================== Versions of Major Components --------------------- -Apache Tika 0.10 +Apache Tika 1.0 Carrot2 3.5.0 Velocity 1.6.4 and Velocity Tools 2.0 Apache UIMA 2.3.1 @@ -556,6 +556,8 @@ Other Changes * SOLR-2718: Add ability to lazy load response writers, defined with startup="lazy". (ehatcher) +* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy) + Build ---------------------- * SOLR-2487: Add build target to package war without slf4j jars (janhoy) diff --git a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java index 8b4e6732659..b4d8231f46e 100644 --- a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java +++ b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java @@ -18,8 +18,8 @@ package org.apache.solr.handler.dataimport; import com.sun.mail.imap.IMAPMessage; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.utils.ParseUtils; +import org.apache.tika.Tika; +import org.apache.tika.metadata.Metadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -95,6 +95,8 @@ public class MailEntityProcessor extends EntityProcessorBase { getStringFromContext("processAttachment",null) == null ? 
"processAttachement":"processAttachment" , true); + tika = new Tika(); + logConfig(); } @@ -166,7 +168,10 @@ public class MailEntityProcessor extends EntityProcessorBase { if (!processAttachment || (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) return; InputStream is = part.getInputStream(); String fileName = part.getFileName(); - String content = ParseUtils.getStringContent(is, TikaConfig.getDefaultConfig(), ctype.getBaseType().toLowerCase(Locale.ENGLISH)); + Metadata md = new Metadata(); + md.set(Metadata.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ENGLISH)); + md.set(Metadata.RESOURCE_NAME_KEY, fileName); + String content = tika.parseToString(is, md); if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) { if (row.get(ATTACHMENT) == null) row.put(ATTACHMENT, new ArrayList()); @@ -529,6 +534,8 @@ public class MailEntityProcessor extends EntityProcessorBase { private boolean processAttachment = true; + private Tika tika; + // holds the current state private Store mailbox; private boolean connected = false; diff --git a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java index 6d42ec4dcdf..e3dd7ae2a75 100644 --- a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java +++ b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java @@ -118,9 +118,7 @@ public class TikaEntityProcessor extends EntityProcessorBase { } Parser tikaParser = null; if(parser.equals(AUTO_PARSER)){ - AutoDetectParser parser = new AutoDetectParser(); - parser.setConfig(tikaConfig); - tikaParser = parser; + tikaParser = new AutoDetectParser(tikaConfig); } else { tikaParser = (Parser) context.getSolrCore().getResourceLoader().newInstance(parser); } diff --git a/solr/contrib/extraction/CHANGES.txt 
b/solr/contrib/extraction/CHANGES.txt index daf04a76e56..bf81ac735d1 100644 --- a/solr/contrib/extraction/CHANGES.txt +++ b/solr/contrib/extraction/CHANGES.txt @@ -20,7 +20,7 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ Tika Dependency --------------- -Current Version: Tika 0.10 (released 2011-09-30) +Current Version: Tika 1.0 (released 2011-11-07) $Id$ @@ -34,6 +34,8 @@ $Id$ This is convenient when Tika's auto detector cannot detect encoding, especially the text file is too short to detect encoding. (koji) +* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy) + ================== Release 3.5.0 ================== * SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy) diff --git a/solr/contrib/extraction/lib/commons-compress-1.2.jar b/solr/contrib/extraction/lib/commons-compress-1.2.jar deleted file mode 100755 index 7a6af362823..00000000000 --- a/solr/contrib/extraction/lib/commons-compress-1.2.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[61753909c3f32306bf60d09e5345d47058ba2122] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/commons-compress-1.3.jar b/solr/contrib/extraction/lib/commons-compress-1.3.jar new file mode 100644 index 00000000000..0a27da8e395 --- /dev/null +++ b/solr/contrib/extraction/lib/commons-compress-1.3.jar @@ -0,0 +1,2 @@ +AnyObjectId[6c826c528b60bb1b25e9053b7f4c920292f6c343] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-core-0.10.jar b/solr/contrib/extraction/lib/tika-core-0.10.jar deleted file mode 100644 index fc3b33110f3..00000000000 --- a/solr/contrib/extraction/lib/tika-core-0.10.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[2259a3b83e63f6ad14b866bff7925b2a1b9f0c23] was removed in git history. -Apache SVN contains full history. 
\ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-core-1.0.jar b/solr/contrib/extraction/lib/tika-core-1.0.jar new file mode 100644 index 00000000000..b884eafe101 --- /dev/null +++ b/solr/contrib/extraction/lib/tika-core-1.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[5d6bc873cd8bd72dd426090f1b237f373f5fdc00] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-parsers-0.10.jar b/solr/contrib/extraction/lib/tika-parsers-0.10.jar deleted file mode 100644 index 9392674cad2..00000000000 --- a/solr/contrib/extraction/lib/tika-parsers-0.10.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[57368ebb26d4b6493e76efa5552b2e489bc09bbe] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-parsers-1.0.jar b/solr/contrib/extraction/lib/tika-parsers-1.0.jar new file mode 100644 index 00000000000..6ff300c7b86 --- /dev/null +++ b/solr/contrib/extraction/lib/tika-parsers-1.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[d5e5f8fa0ad29f21719ea9bc2a85c6a95cd8a205] was removed in git history. +Apache SVN contains full history. 
\ No newline at end of file diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index a731fe1603a..2d250da4c34 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -39,6 +39,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; @@ -138,7 +139,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { if (streamType != null) { //Cache? Parsers are lightweight to construct and thread-safe, so I'm told MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ENGLISH)); - parser = config.getParser(mt); + parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); } else { parser = autoDetectParser; } @@ -151,6 +152,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { if (resourceName != null) { metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName); } + // Provide stream's content type as hint for auto detection + if(stream.getContentType() != null) { + metadata.add(Metadata.CONTENT_TYPE, stream.getContentType()); + } InputStream inputStream = null; try { diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 799b51c148d..c7146c84e91 100644 --- 
a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -18,7 +18,6 @@ package org.apache.solr.handler.extraction; import java.util.ArrayList; import java.util.List; - import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.ContentStream; @@ -419,7 +418,33 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { assertU(commit()); assertQ(req("*:*"), "//result[@numFound=1]"); } + + @Test + public void testWrongStreamType() throws Exception { + ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); + assertTrue("handler is null and it shouldn't be", handler != null); + try{ + // Load plain text specifying another mime type, should fail + loadLocal("extraction/version_control.txt", + "literal.id", "one", + ExtractingParams.STREAM_TYPE, "application/pdf" + ); + fail("SolrException is expected because wrong parser specified for the file type"); + } + catch(Exception expected){} + + try{ + // Load plain text specifying a nonexistent MIME type, should fail + loadLocal("extraction/version_control.txt", + "literal.id", "one", + ExtractingParams.STREAM_TYPE, "foo/bar" + ); + fail("SolrException is expected because nonexsisting parser specified"); + } + catch(Exception expected){} + } + SolrQueryResponse loadLocal(String filename, String... 
args) throws Exception { LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args); try { diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java index 8d1761978b2..fdcaf1811e0 100644 --- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java @@ -67,7 +67,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S assertLang("no", "id", "1no", "name", "Lucene", "subject", "Lucene er et fri/åpen kildekode programvarebibliotek for informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av Doug Cutting. Lucene støttes av Apache Software Foundation og utgis under Apache-lisensen."); assertLang("en", "id", "2en", "name", "Lucene", "subject", "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."); assertLang("sv", "id", "3sv", "name", "Maven", "subject", "Apache Maven är ett verktyg utvecklat av Apache Software Foundation och används inom systemutveckling av datorprogram i programspråket Java. Maven används för att automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven används inom samma område som Apache Ant men dess byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade."); - assertLang("es", "id", "4es", "name", "Lucene", "subject", "Lucene es un API de código abierto para recuperación de información, originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache Software Foundation y se distribuye bajo la Apache Software License. 
Lucene tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y PHP."); + assertLang("es", "id", "4es", "name", "Español", "subject", "El español, como las otras lenguas romances, es una continuación moderna del latín hablado (denominado latín vulgar), desde el siglo III, que tras el desmembramiento del Imperio romano fue divergiendo de las otras variantes del latín que se hablaban en las distintas provincias del antiguo Imperio, dando lugar mediante una lenta evolución a las distintas lenguas romances. Debido a su propagación por América, el español es, con diferencia, la lengua romance que ha logrado mayor difusión."); assertLang("un", "id", "5un", "name", "a", "subject", "b"); assertLang("th", "id", "6th", "name", "บทความคัดสรรเดือนนี้", "subject", "อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์ เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์ ระหว่างที่ถูกเยอรมนีเข้าครอบครองในช่วงสงครามโลกครั้งที่สอง"); assertLang("ru", "id", "7ru", "name", "Lucene", "subject", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия)."); @@ -76,7 +76,17 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S assertLang("nl", "id", "10nl", "name", "Lucene", "subject", "Lucene is een gratis open source, tekst gebaseerde information retrieval API van origine geschreven in Java door Doug Cutting. Het wordt ondersteund door de Apache Software Foundation en is vrijgegeven onder de Apache Software Licentie. 
Lucene is ook beschikbaar in andere programeertalen zoals Perl, C#, C++, Python, Ruby en PHP."); assertLang("it", "id", "11it", "name", "Lucene", "subject", "Lucene è una API gratuita ed open source per il reperimento di informazioni inizialmente implementata in Java da Doug Cutting. È supportata dall'Apache Software Foundation ed è resa disponibile con l'Apache License. Lucene è stata successivamente reimplementata in Perl, C#, C++, Python, Ruby e PHP."); assertLang("pt", "id", "12pt", "name", "Lucene", "subject", "Apache Lucene, ou simplesmente Lucene, é um software de busca e uma API de indexação de documentos, escrito na linguagem de programação Java. É um software de código aberto da Apache Software Foundation licenciado através da licença Apache."); + // New in Tika1.0 + assertLang("ca", "id", "13ca", "name", "Catalan", "subject", "El català posseeix dos estàndards principals: el regulat per l'Institut d'Estudis Catalans, o estàndard general, que pren com a base l'ortografia establerta per Pompeu Fabra amb els trets gramaticals i ortogràfics característics del català central; i el regulat per l'Acadèmia Valenciana de la Llengua, estàndard d'àmbit restringit, centrat en l'estandardització del valencià i que pren com a base les Normes de Castelló, és a dir, l'ortografia de Pompeu Fabra però més adaptada a la pronúncia del català occidental i als trets que caracteritzen els dialectes valencians."); + assertLang("be", "id", "14be", "name", "Belarusian", "subject", "Наступнай буйной дзяржавай на беларускай зямлі было Вялікае княства Літоўскае, Рускае і Жамойцкае (ВКЛ). Падчас стварэння і пачатковага развіцця гэтай дзяржавы найбуйнейшым і асноўным яе цэнтрам быў Новагародак. 
Акрамя сучасных земляў Беларусі, у склад гэтай дзяржавы ўваходзілі таксама землі сучаснай Літвы, паўночная частка сучаснай Украіны і частка сучаснай Расіі."); + assertLang("eo", "id", "15eo", "name", "Esperanto", "subject", "La vortprovizo de Esperanto devenas plejparte el la okcidenteŭropaj lingvoj, dum ĝia sintakso kaj morfologio montras ankaŭ slavlingvan influon. La morfemoj ne ŝanĝiĝas kaj oni povas ilin preskaŭ senlime kombini, kreante diverssignifajn vortojn, Esperanto do havas multajn kunaĵojn kun la analizaj lingvoj, al kiuj apartenas ekzemple la ĉina; kontraŭe la interna strukturo de Esperanto certagrade respegulas la aglutinajn lingvojn, kiel la japanan, svahilan aŭ turkan."); + assertLang("gl", "id", "16gl", "name", "Galician", "subject", "A cifra de falantes medrou axiña durante as décadas seguintes, nun principio no Imperio ruso e na Europa oriental, logo na Europa occidental, América, China e no Xapón. Nos primeiros anos do movemento, os esperantistas mantiñan contacto por correspondencia, pero en 1905 o primeiro Congreso Universal de Esperanto levouse a cabo na cidade francesa de Boulogne-sur-Mer. Dende entón, os congresos mundiais organizáronse nos cinco continentes ano tras ano agás durante as dúas Guerras Mundiais."); + assertLang("ro", "id", "17ro", "name", "Romanian", "subject", "La momentul destrămării Uniunii Sovietice și a înlăturării regimului comunist instalat în România (1989), țara a inițiat o serie de reforme economice și politice. 
După un deceniu de probleme economice, România a introdus noi reforme economice de ordin general (precum cota unică de impozitare, în 2005) și a aderat la Uniunea Europeană la 1 ianuarie 2007."); + assertLang("sk", "id", "18sk", "name", "Slovakian", "subject", "Boli vytvorené dva národné parlamenty - Česká národná rada a Slovenská národná rada a spoločný jednokomorový česko-slovenský parlament bol premenovaný z Národného zhromaždenia na Federálne zhromaždenie s dvoma komorami - Snemovňou ľudu a Snemovňu národov."); + assertLang("sl", "id", "19sl", "name", "Slovenian", "subject", "Slovenska Wikipedija je različica spletne enciklopedije Wikipedije v slovenskem jeziku. Projekt slovenske Wikipedije se je začel 26. februarja 2002 z ustanovitvijo njene spletne strani, njen pobudnik pa je bil uporabnik Jani Melik."); + assertLang("uk", "id", "20uk", "name", "Ukrainian", "subject", "Народно-господарський комплекс країни включає такі види промисловості як важке машинобудування, чорна та кольорова металургія, суднобудування, виробництво автобусів, легкових та вантажних автомобілів, тракторів та іншої сільськогосподарської техніки, тепловозів, верстатів, турбін, авіаційних двигунів та літаків, обладнання для електростанцій, нафто-газової та хімічної промисловості тощо. Крім того, Україна є потужним виробником електроенергії. Україна має розвинуте сільське господарство і займає одне з провідних місць серед експортерів деяких видів сільськогосподарської продукції і продовольства (зокрема, соняшникової олії)."); } + @Test public void testMapFieldName() throws Exception {