mirror of https://github.com/apache/lucene.git
SOLR-1902: fix Tika extraction issue
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@942753 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
57a8756a9e
commit
e85c2774f7
|
@ -285,6 +285,8 @@ Bug Fixes
|
|||
* SOLR-1706: fixed WordDelimiterFilter for certain combinations of options
|
||||
where it would output incorrect tokens. (Robert Muir, Chris Male)
|
||||
|
||||
* SOLR-1902: Exposed SolrResourceLoader's class loader for use by Tika
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -17,21 +17,23 @@ You will need Solr up and running. Then, simply add the extraction JAR file, pl
|
|||
to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequestHandler for more details on hooking it in
|
||||
and configuring.
|
||||
|
||||
Tika Dependency
|
||||
---------------
|
||||
|
||||
Current Version: Tika 0.8-SNAPSHOT (rev 942725)
|
||||
|
||||
$Id:$
|
||||
|
||||
================== Release 1.5-dev ==================
|
||||
|
||||
|
||||
* SOLR-1567: Upgrade to Tika 0.5, which upgrades many of the underlying libraries (PDFBox, for example) too (gsingers)
|
||||
|
||||
* SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
|
||||
parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)
|
||||
|
||||
* SOLR-1738: Upgrade to Tika 0.6 (gsingers)
|
||||
|
||||
* SOLR-18913: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers)
|
||||
|
||||
* SOLR-1819: Upgraded to Tika 0.7 (gsingers)
|
||||
* SOLR-1902: Upgraded to Tika 0.8-SNAPSHOT to incorporate passing in Solr's custom ClassLoader (gsingers)
|
||||
|
||||
================== Release 1.4.0 ==================
|
||||
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[522527a851848b18dc666e88b945d42a18075d58] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[934d3a7a0c87fc25ffe6bdfa2774fc7ae8e5cbd8] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[8f0a259678e80ad7f7036b23407dcdad5c6a633d] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[6aba6dca7d96e30dd3c411cd0a2e28033b219767] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -37,6 +37,7 @@ import org.apache.tika.sax.xpath.Matcher;
|
|||
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||
import org.apache.tika.sax.xpath.XPathParser;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.xml.serialize.OutputFormat;
|
||||
import org.apache.xml.serialize.BaseMarkupSerializer;
|
||||
import org.apache.xml.serialize.XMLSerializer;
|
||||
|
@ -134,7 +135,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
|
||||
if (streamType != null) {
|
||||
//Cache? Parsers are lightweight to construct and thread-safe, so I'm told
|
||||
parser = config.getParser(streamType.trim().toLowerCase());
|
||||
MediaType mt = MediaType.parse(streamType.trim().toLowerCase());
|
||||
parser = config.getParser(mt);
|
||||
} else {
|
||||
parser = autoDetectParser;
|
||||
}
|
||||
|
|
|
@ -29,10 +29,12 @@ import org.apache.solr.handler.ContentStreamHandlerBase;
|
|||
import org.apache.solr.handler.ContentStreamLoader;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.mime.MimeTypeException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -77,8 +79,6 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
} catch (Exception e) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||
}
|
||||
} else {
|
||||
config = TikaConfig.getDefaultConfig();
|
||||
}
|
||||
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
|
||||
if (configDateFormats != null && configDateFormats.size() > 0) {
|
||||
|
@ -90,12 +90,23 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
dateFormats.add(format);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
config = TikaConfig.getDefaultConfig();
|
||||
}
|
||||
if (config == null) {
|
||||
try {
|
||||
config = getDefaultConfig(core.getResourceLoader().getClassLoader());
|
||||
} catch (MimeTypeException e) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||
} catch (IOException e) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||
}
|
||||
}
|
||||
factory = createFactory();
|
||||
}
|
||||
|
||||
private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
|
||||
return new TikaConfig(classLoader);
|
||||
}
|
||||
|
||||
protected SolrContentHandlerFactory createFactory() {
|
||||
return new SolrContentHandlerFactory(dateFormats);
|
||||
}
|
||||
|
|
|
@ -214,6 +214,16 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
return coreProperties;
|
||||
}
|
||||
|
||||
/**
|
||||
* EXPERT
|
||||
* <p/>
|
||||
* The underlying class loader. Most applications will not need to use this.
|
||||
* @return The {@link ClassLoader}
|
||||
*/
|
||||
public ClassLoader getClassLoader() {
|
||||
return classLoader;
|
||||
}
|
||||
|
||||
/** Opens a schema resource by its name.
|
||||
* Override this method to customize loading schema resources.
|
||||
*@return the stream for the named schema
|
||||
|
|
Loading…
Reference in New Issue