SOLR-1902: fix Tika extraction issue

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942753 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2010-05-10 14:36:54 +00:00
parent 57a8756a9e
commit e85c2774f7
9 changed files with 40 additions and 13 deletions

View File

@ -285,6 +285,8 @@ Bug Fixes
* SOLR-1706: fixed WordDelimiterFilter for certain combinations of options
where it would output incorrect tokens. (Robert Muir, Chris Male)
* SOLR-1902: Exposed SolrResourceLoader's class loader for use by Tika
Other Changes
----------------------

View File

@ -17,21 +17,23 @@ You will need Solr up and running. Then, simply add the extraction JAR file, pl
to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in
and configuring.
Tika Dependency
---------------
Current Version: Tika 0.8-SNAPSHOT (rev 942725)
$Id:$
================== Release 1.5-dev ==================
* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers)
* SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)
* SOLR-1738: Upgrade to Tika 0.6 (gsingers)
* SOLR-1813: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers)
* SOLR-1819: Upgraded to Tika 0.7 (gsingers)
* SOLR-1902: Upgraded to Tika 0.8-SNAPSHOT to incorporate passing in Solr's custom ClassLoader (gsingers)
================== Release 1.4.0 ==================

View File

@ -1,2 +0,0 @@
AnyObjectId[522527a851848b18dc666e88b945d42a18075d58] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[934d3a7a0c87fc25ffe6bdfa2774fc7ae8e5cbd8] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[8f0a259678e80ad7f7036b23407dcdad5c6a633d] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[6aba6dca7d96e30dd3c411cd0a2e28033b219767] was removed in git history.
Apache SVN contains full history.

View File

@ -37,6 +37,7 @@ import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.XMLSerializer;
@ -134,7 +135,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
if (streamType != null) {
//Cache? Parsers are lightweight to construct and thread-safe, so I'm told
parser = config.getParser(streamType.trim().toLowerCase());
MediaType mt = MediaType.parse(streamType.trim().toLowerCase());
parser = config.getParser(mt);
} else {
parser = autoDetectParser;
}

View File

@ -29,10 +29,12 @@ import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MimeTypeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
@ -77,8 +79,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
} catch (Exception e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
} else {
config = TikaConfig.getDefaultConfig();
}
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
if (configDateFormats != null && configDateFormats.size() > 0) {
@ -90,12 +90,23 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
dateFormats.add(format);
}
}
} else {
config = TikaConfig.getDefaultConfig();
}
if (config == null) {
try {
config = getDefaultConfig(core.getResourceLoader().getClassLoader());
} catch (MimeTypeException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
factory = createFactory();
}
/**
 * Creates a {@link TikaConfig} from the supplied class loader.
 *
 * @param classLoader the class loader handed to the {@link TikaConfig} constructor
 * @return the newly constructed configuration
 * @throws MimeTypeException propagated from the {@link TikaConfig} constructor
 * @throws IOException propagated from the {@link TikaConfig} constructor
 */
private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
  final TikaConfig tikaConfig = new TikaConfig(classLoader);
  return tikaConfig;
}
/**
 * Creates the {@link SolrContentHandlerFactory} for this handler, passing it the
 * handler's {@code dateFormats}. Declared {@code protected} so subclasses can override it.
 *
 * @return a new factory instance
 */
protected SolrContentHandlerFactory createFactory() {
  final SolrContentHandlerFactory handlerFactory = new SolrContentHandlerFactory(dateFormats);
  return handlerFactory;
}

View File

@ -214,6 +214,16 @@ public class SolrResourceLoader implements ResourceLoader
return coreProperties;
}
/**
 * EXPERT: exposes the class loader held by this resource loader.
 * Most applications will not need to use this.
 *
 * @return the underlying {@link ClassLoader}
 */
public ClassLoader getClassLoader() {
  return this.classLoader;
}
/** Opens a schema resource by its name.
* Override this method to customize loading schema resources.
*@return the stream for the named schema