SOLR-1902: fix Tika extraction issue

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942753 13f79535-47bb-0310-9956-ffa450edef68
2010-05-10 14:36:54 +00:00 · 2010-05-10 14:36:54 +00:00 · e85c2774f7
parent 57a8756a9e
commit e85c2774f7
9 changed files with 40 additions and 13 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -285,6 +285,8 @@ Bug Fixes
 * SOLR-1706: fixed WordDelimiterFilter for certain combinations of options
  where it would output incorrect tokens. (Robert Muir, Chris Male)

+* SOLR-1902: Exposed SolrResourceLoader's class loader for use by Tika  
+
 Other Changes
 ----------------------

--- a/solr/contrib/extraction/CHANGES.txt
+++ b/solr/contrib/extraction/CHANGES.txt
@ -17,21 +17,23 @@ You will need Solr up and running.  Then, simply add the extraction JAR file, pl
 to your Solr Home lib directory.  See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in
 and configuring.

+ Tika Dependency
+ ---------------
+
+Current Version: Tika 0.8-SNAPSHOT (rev 942725)
+
 $Id:$

 ================== Release 1.5-dev ==================


-* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers)

 * SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
  parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)

-* SOLR-1738: Upgrade to Tika 0.6 (gsingers)
-
 * SOLR-18913: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers)

-* SOLR-1819: Upgraded to Tika 0.7 (gsingers)
+* SOLR-1902: Upgraded to Tika 0.8-SNAPSHOT to incorporate passing in Solr's custom ClassLoader (gsingers)

 ================== Release 1.4.0 ==================

--- a/solr/contrib/extraction/lib/tika-core-0.7.jar
+++ b/solr/contrib/extraction/lib/tika-core-0.7.jar
@ -1,2 +0,0 @@
-AnyObjectId[522527a851848b18dc666e88b945d42a18075d58] was removed in git history.
-Apache SVN contains full history.
--- a/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
+++ b/solr/contrib/extraction/lib/tika-core-0.8-SNAPSHOT.jar
@ -0,0 +1,2 @@
+AnyObjectId[934d3a7a0c87fc25ffe6bdfa2774fc7ae8e5cbd8] was removed in git history.
+Apache SVN contains full history.
--- a/solr/contrib/extraction/lib/tika-parsers-0.7.jar
+++ b/solr/contrib/extraction/lib/tika-parsers-0.7.jar
@ -1,2 +0,0 @@
-AnyObjectId[8f0a259678e80ad7f7036b23407dcdad5c6a633d] was removed in git history.
-Apache SVN contains full history.
--- a/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
+++ b/solr/contrib/extraction/lib/tika-parsers-0.8-SNAPSHOT.jar
@ -0,0 +1,2 @@
+AnyObjectId[6aba6dca7d96e30dd3c411cd0a2e28033b219767] was removed in git history.
+Apache SVN contains full history.
--- a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
+++ b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
@ -37,6 +37,7 @@ import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
 import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.BaseMarkupSerializer;
 import org.apache.xml.serialize.XMLSerializer;
@ -134,7 +135,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
-      parser = config.getParser(streamType.trim().toLowerCase());
+      MediaType mt = MediaType.parse(streamType.trim().toLowerCase());
+      parser = config.getParser(mt);
    } else {
      parser = autoDetectParser;
    }
--- a/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
+++ b/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
@ -29,10 +29,12 @@ import org.apache.solr.handler.ContentStreamHandlerBase;
 import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MimeTypeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.File;
+import java.io.IOException;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
@ -77,8 +79,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
        } catch (Exception e) {
          throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
-      } else {
-        config = TikaConfig.getDefaultConfig();
      }
      NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
      if (configDateFormats != null && configDateFormats.size() > 0) {
@ -90,12 +90,23 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
          dateFormats.add(format);
        }
      }
-    } else {
-      config = TikaConfig.getDefaultConfig();
+    }
+    if (config == null) {
+      try {
+        config = getDefaultConfig(core.getResourceLoader().getClassLoader());
+      } catch (MimeTypeException e) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      } catch (IOException e) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      }
    }
    factory = createFactory();
  }

+  private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
+    return new TikaConfig(classLoader);
+  }
+
  protected SolrContentHandlerFactory createFactory() {
    return new SolrContentHandlerFactory(dateFormats);
  }
--- a/solr/src/java/org/apache/solr/core/SolrResourceLoader.java
+++ b/solr/src/java/org/apache/solr/core/SolrResourceLoader.java
@ -214,6 +214,16 @@ public class SolrResourceLoader implements ResourceLoader
    return coreProperties;
  }

+  /**
+   * EXPERT
+   * <p/>
+   * The underlying class loader.  Most applications will not need to use this.
+   * @return The {@link ClassLoader}
+   */
+  public ClassLoader getClassLoader() {
+    return classLoader;
+  }
+
  /** Opens a schema resource by its name.
   * Override this method to customize loading schema resources.
   *@return the stream for the named schema