mirror of https://github.com/apache/lucene.git
SOLR-2480: add ignoreTikaException flag
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103120 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4c85f799eb
commit
86faa47fa9
|
@ -22,7 +22,7 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ
|
|||
|
||||
Current Version: Tika 0.8 (released 11/07/2010)
|
||||
|
||||
$Id:$
|
||||
$Id$
|
||||
|
||||
================== Release 4.0-dev ==================
|
||||
|
||||
|
@ -30,7 +30,8 @@ $Id:$
|
|||
|
||||
================== Release 3.2-dev ==================
|
||||
|
||||
(No Changes)
|
||||
* SOLR-2480: Add ignoreTikaException flag so that users can ignore a TikaException and
|
||||
still index the document's metadata. (Shinichiro Abe, koji)
|
||||
|
||||
================== Release 3.1-dev ==================
|
||||
|
||||
|
|
|
@ -16,20 +16,27 @@
|
|||
*/
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.params.UpdateParams;
|
||||
import org.apache.solr.common.util.ContentStream;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.handler.ContentStreamLoader;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.update.AddUpdateCommand;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
||||
import org.apache.solr.handler.ContentStreamLoader;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
|
@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentHandler;
|
|||
import org.apache.tika.sax.xpath.Matcher;
|
||||
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||
import org.apache.tika.sax.xpath.XPathParser;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.xml.serialize.OutputFormat;
|
||||
import org.apache.xml.serialize.BaseMarkupSerializer;
|
||||
import org.apache.xml.serialize.XMLSerializer;
|
||||
import org.apache.xml.serialize.OutputFormat;
|
||||
import org.apache.xml.serialize.TextSerializer;
|
||||
import org.apache.xml.serialize.XMLSerializer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.util.Locale;
|
||||
|
||||
|
||||
/**
|
||||
* The class responsible for loading extracted content into Solr.
|
||||
*
|
||||
**/
|
||||
public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
|
||||
|
||||
/**
|
||||
* Extract Only supported format
|
||||
*/
|
||||
|
@ -74,6 +79,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
final IndexSchema schema;
|
||||
final SolrParams params;
|
||||
final UpdateRequestProcessor processor;
|
||||
final boolean ignoreTikaException;
|
||||
protected AutoDetectParser autoDetectParser;
|
||||
|
||||
private final AddUpdateCommand templateAdd;
|
||||
|
@ -95,6 +101,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
//this is lightweight
|
||||
autoDetectParser = new AutoDetectParser(config);
|
||||
this.factory = factory;
|
||||
|
||||
ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
|
||||
}
|
||||
|
||||
|
||||
|
@ -180,9 +188,17 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
parsingHandler = new MatchingContentHandler(handler, matcher);
|
||||
} //else leave it as is
|
||||
|
||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
||||
parser.parse(inputStream, parsingHandler, metadata, context);
|
||||
try{
|
||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
||||
parser.parse(inputStream, parsingHandler, metadata, context);
|
||||
} catch (TikaException e) {
|
||||
if(ignoreTikaException)
|
||||
log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
|
||||
.append(". metadata=").append(metadata.toString()).toString());
|
||||
else
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||
}
|
||||
if (extractOnly == false) {
|
||||
addDoc(handler);
|
||||
} else {
|
||||
|
@ -202,8 +218,6 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
}
|
||||
} catch (SAXException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||
} catch (TikaException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(inputStream);
|
||||
}
|
||||
|
|
|
@ -28,6 +28,11 @@ public interface ExtractingParams {
|
|||
*/
|
||||
public static final String LOWERNAMES = "lowernames";
|
||||
|
||||
/**
|
||||
* If true, ignore TikaException (give up on extracting the text, but still index the metadata).
|
||||
*/
|
||||
public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
|
||||
|
||||
|
||||
/**
|
||||
* The param prefix for mapping Tika metadata to Solr fields.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.handler;
|
||||
package org.apache.solr.handler.extraction;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -16,25 +16,23 @@ package org.apache.solr.handler;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.util.ContentStream;
|
||||
import org.apache.solr.common.util.ContentStreamBase;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
|
||||
import org.apache.solr.handler.extraction.ExtractingParams;
|
||||
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
|
||||
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
|
||||
|
||||
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -363,6 +361,40 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTikaExceptionHandling() throws Exception {
|
||||
ExtractingRequestHandler handler = (ExtractingRequestHandler)
|
||||
h.getCore().getRequestHandler("/update/extract");
|
||||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
|
||||
try{
|
||||
loadLocal("password-is-solrcell.docx",
|
||||
"literal.id", "one");
|
||||
fail("TikaException is expected because of trying to extract text from password protected word file.");
|
||||
}
|
||||
catch(Exception expected){}
|
||||
assertU(commit());
|
||||
assertQ(req("*:*"), "//result[@numFound=0]");
|
||||
|
||||
try{
|
||||
loadLocal("password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"fmap.AAPL:Keywords", "ignored_a",
|
||||
"fmap.xmpTPg:NPages", "ignored_a",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "wdf_nocase",
|
||||
"literal.id", "one",
|
||||
"ignoreTikaException", "true", // set ignore flag
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
}
|
||||
catch(Exception e){
|
||||
fail("TikaException should be ignored.");
|
||||
}
|
||||
assertU(commit());
|
||||
assertQ(req("*:*"), "//result[@numFound=1]");
|
||||
}
|
||||
|
||||
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
|
||||
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
|
||||
try {
|
Binary file not shown.
Loading…
Reference in New Issue