SOLR-2480: add ignoreTikaException flag

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103120 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-05-14 15:01:12 +00:00
parent 4c85f799eb
commit 86faa47fa9
5 changed files with 79 additions and 27 deletions

View File

@ -22,7 +22,7 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ
Current Version: Tika 0.8 (released 11/07/2010)
$Id:$
$Id$
================== Release 4.0-dev ==================
@ -30,7 +30,8 @@ $Id:$
================== Release 3.2-dev ==================
(No Changes)
* SOLR-2480: Add ignoreTikaException flag so that users can ignore a TikaException thrown
during text extraction and still index the document's metadata. (Shinichiro Abe, koji)
================== Release 3.1-dev ==================

View File

@ -16,20 +16,27 @@
*/
package org.apache.solr.handler.extraction;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.XMLSerializer;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.TextSerializer;
import org.apache.xml.serialize.XMLSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Locale;
/**
* The class responsible for loading extracted content into Solr.
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {
private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
/**
* Extract Only supported format
*/
@ -74,6 +79,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
final IndexSchema schema;
final SolrParams params;
final UpdateRequestProcessor processor;
final boolean ignoreTikaException;
protected AutoDetectParser autoDetectParser;
private final AddUpdateCommand templateAdd;
@ -95,6 +101,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
//this is lightweight
autoDetectParser = new AutoDetectParser(config);
this.factory = factory;
ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
@ -180,9 +188,17 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
parsingHandler = new MatchingContentHandler(handler, matcher);
} //else leave it as is
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
parser.parse(inputStream, parsingHandler, metadata, context);
try{
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
parser.parse(inputStream, parsingHandler, metadata, context);
} catch (TikaException e) {
if(ignoreTikaException)
log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
.append(". metadata=").append(metadata.toString()).toString());
else
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
if (extractOnly == false) {
addDoc(handler);
} else {
@ -202,8 +218,6 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
}
} catch (SAXException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} catch (TikaException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally {
IOUtils.closeQuietly(inputStream);
}

View File

@ -28,6 +28,11 @@ public interface ExtractingParams {
*/
public static final String LOWERNAMES = "lowernames";
/**
* If true, ignore TikaException: give up on extracting the text but still index the metadata.
*/
public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
/**
* The param prefix for mapping Tika metadata to Solr fields.

View File

@ -1,4 +1,4 @@
package org.apache.solr.handler;
package org.apache.solr.handler.extraction;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -16,25 +16,23 @@ package org.apache.solr.handler;
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException;
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.util.List;
import java.util.ArrayList;
import java.io.File;
/**
*
@ -363,6 +361,40 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
}
/**
 * Checks the effect of the "ignoreTikaException" request parameter when loading
 * "password-is-solrcell.docx", a password protected Word file whose text cannot
 * be extracted.
 *
 * Without the parameter the load is expected to throw and nothing is indexed;
 * with the parameter set to "true" the load is expected to succeed and exactly
 * one document is indexed.
 *
 * @throws Exception if the test harness itself fails (e.g. on commit or query)
 */
@Test
public void testTikaExceptionHandling() throws Exception {
  ExtractingRequestHandler handler = (ExtractingRequestHandler)
      h.getCore().getRequestHandler("/update/extract");
  assertTrue("handler is null and it shouldn't be", handler != null);

  // Phase 1: flag not set, so the extraction failure must surface as an exception.
  try {
    loadLocal("password-is-solrcell.docx",
        "literal.id", "one");
    fail("TikaException is expected because of trying to extract text from password protected word file.");
  } catch (Exception expected) {
    // Intentionally ignored: an exception here is the behavior under test.
    // fail() raises an AssertionError, which is not an Exception and so is not swallowed.
  }
  assertU(commit());
  assertQ(req("*:*"), "//result[@numFound=0]");

  // Phase 2: same file with the flag set; the load must complete and index one document.
  try {
    loadLocal("password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
        "fmap.Creation-Date", "extractedDate",
        "fmap.AAPL:Keywords", "ignored_a",
        "fmap.xmpTPg:NPages", "ignored_a",
        "fmap.Author", "extractedAuthor",
        "fmap.content", "wdf_nocase",
        "literal.id", "one",
        "ignoreTikaException", "true", // set ignore flag
        "fmap.Last-Modified", "extractedDate");
  } catch (Exception e) {
    // Keep the original exception as the cause so the failure report shows
    // what actually went wrong instead of only the assertion message.
    AssertionError failure = new AssertionError("TikaException should be ignored.");
    failure.initCause(e);
    throw failure;
  }
  assertU(commit());
  assertQ(req("*:*"), "//result[@numFound=1]");
}
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {