diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 071060ff595..6fa4b1eccf5 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -285,6 +285,8 @@ Bug Fixes * SOLR-1706: fixed WordDelimiterFilter for certain combinations of options where it would output incorrect tokens. (Robert Muir, Chris Male) +* SOLR-1902: Exposed SolrResourceLoader's class loader for use by Tika + Other Changes ---------------------- diff --git a/solr/contrib/extraction/CHANGES.txt b/solr/contrib/extraction/CHANGES.txt index 23c3d69c7d5..0a404691e3b 100644 --- a/solr/contrib/extraction/CHANGES.txt +++ b/solr/contrib/extraction/CHANGES.txt @@ -17,21 +17,23 @@ You will need Solr up and running. Then, simply add the extraction JAR file, pl to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in and configuring. + Tika Dependency + --------------- + +Current Version: Tika 0.8-SNAPSHOT (rev 942725) + $Id:$ ================== Release 1.5-dev ================== -* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers) * SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller) -* SOLR-1738: Upgrade to Tika 0.6 (gsingers) - * SOLR-18913: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers) -* SOLR-1819: Upgraded to Tika 0.7 (gsingers) +* SOLR-1902: Upgraded to Tika 0.8-SNAPSHOT to incorporate passing in Solr's custom ClassLoader (gsingers) ================== Release 1.4.0 ================== diff --git a/solr/contrib/extraction/lib/tika-core-0.7.jar b/solr/contrib/extraction/lib/tika-core-0.7.jar deleted file mode 100644 index d2a791af618..00000000000 --- a/solr/contrib/extraction/lib/tika-core-0.7.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[522527a851848b18dc666e88b945d42a18075d58] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar b/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar new file mode 100644 index 00000000000..99d39fe372b --- /dev/null +++ b/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar @@ -0,0 +1,2 @@ +AnyObjectId[934d3a7a0c87fc25ffe6bdfa2774fc7ae8e5cbd8] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-parsers-0.7.jar b/solr/contrib/extraction/lib/tika-parsers-0.7.jar deleted file mode 100644 index d5333dd400f..00000000000 --- a/solr/contrib/extraction/lib/tika-parsers-0.7.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[8f0a259678e80ad7f7036b23407dcdad5c6a633d] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar b/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar new file mode 100644 index 00000000000..7f4d6fb2c06 --- /dev/null +++ b/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar @@ -0,0 +1,2 @@ +AnyObjectId[6aba6dca7d96e30dd3c411cd0a2e28033b219767] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index a4427d7c938..a887a6f29e7 100644 --- a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -37,6 +37,7 @@ import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MediaType; import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.BaseMarkupSerializer; import org.apache.xml.serialize.XMLSerializer; @@ -134,7 +135,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); if (streamType != null) { //Cache? Parsers are lightweight to construct and thread-safe, so I'm told - parser = config.getParser(streamType.trim().toLowerCase()); + MediaType mt = MediaType.parse(streamType.trim().toLowerCase()); + parser = config.getParser(mt); } else { parser = autoDetectParser; } diff --git a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 14cfc7efc14..3c507267d2f 100644 --- a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -29,10 +29,12 @@ import org.apache.solr.handler.ContentStreamHandlerBase; import org.apache.solr.handler.ContentStreamLoader; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MimeTypeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; +import java.io.IOException; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; @@ -77,8 +79,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement } catch (Exception e) { throw new SolrException(ErrorCode.SERVER_ERROR, e); } - } else { - config = TikaConfig.getDefaultConfig(); } NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS); if (configDateFormats != null && configDateFormats.size() > 0) { @@ -90,12 +90,23 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement dateFormats.add(format); } } - } else { - config = TikaConfig.getDefaultConfig(); + } + if (config == null) { + try { + config = getDefaultConfig(core.getResourceLoader().getClassLoader()); + } catch (MimeTypeException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, e); + } catch (IOException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, e); + } } factory = createFactory(); } + private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException { + return new TikaConfig(classLoader); + } + protected SolrContentHandlerFactory createFactory() { return new SolrContentHandlerFactory(dateFormats); } diff --git a/solr/src/java/org/apache/solr/core/SolrResourceLoader.java b/solr/src/java/org/apache/solr/core/SolrResourceLoader.java index 40b2b557728..e6b929565fe 100644 --- a/solr/src/java/org/apache/solr/core/SolrResourceLoader.java +++ b/solr/src/java/org/apache/solr/core/SolrResourceLoader.java @@ -214,6 +214,16 @@ public class SolrResourceLoader implements ResourceLoader return coreProperties; } + /** + * EXPERT + *
+ * The underlying class loader. Most applications will not need to use this. + * @return The {@link ClassLoader} + */ + public ClassLoader getClassLoader() { + return classLoader; + } + /** Opens a schema resource by its name. * Override this method to customize loading schema resources. *@return the stream for the named schema