SOLR-2480: add ignoreTikaException flag

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103120 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-05-14 15:01:12 +00:00
parent 4c85f799eb
commit 86faa47fa9
5 changed files with 79 additions and 27 deletions

View File

@ -22,7 +22,7 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ
Current Version: Tika 0.8 (released 11/07/2010) Current Version: Tika 0.8 (released 11/07/2010)
$Id:$ $Id$
================== Release 4.0-dev ================== ================== Release 4.0-dev ==================
@ -30,7 +30,8 @@ $Id:$
================== Release 3.2-dev ================== ================== Release 3.2-dev ==================
(No Changes) * SOLR-2480: Add ignoreTikaException flag so that users can ignore TikaException but index
meta data. (Shinichiro Abe, koji)
================== Release 3.1-dev ================== ================== Release 3.1-dev ==================

View File

@ -16,20 +16,27 @@
*/ */
package org.apache.solr.handler.extraction; package org.apache.solr.handler.extraction;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Locale;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams; import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser; import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer; import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.XMLSerializer; import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.TextSerializer; import org.apache.xml.serialize.TextSerializer;
import org.apache.xml.serialize.XMLSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler; import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Locale;
/** /**
* The class responsible for loading extracted content into Solr. * The class responsible for loading extracted content into Solr.
* *
**/ **/
public class ExtractingDocumentLoader extends ContentStreamLoader { public class ExtractingDocumentLoader extends ContentStreamLoader {
private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
/** /**
* Extract Only supported format * Extract Only supported format
*/ */
@ -74,6 +79,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
final IndexSchema schema; final IndexSchema schema;
final SolrParams params; final SolrParams params;
final UpdateRequestProcessor processor; final UpdateRequestProcessor processor;
final boolean ignoreTikaException;
protected AutoDetectParser autoDetectParser; protected AutoDetectParser autoDetectParser;
private final AddUpdateCommand templateAdd; private final AddUpdateCommand templateAdd;
@ -95,6 +101,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
//this is lightweight //this is lightweight
autoDetectParser = new AutoDetectParser(config); autoDetectParser = new AutoDetectParser(config);
this.factory = factory; this.factory = factory;
ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
} }
@ -180,9 +188,17 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
parsingHandler = new MatchingContentHandler(handler, matcher); parsingHandler = new MatchingContentHandler(handler, matcher);
} //else leave it as is } //else leave it as is
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document. try{
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context? //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
parser.parse(inputStream, parsingHandler, metadata, context); ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
parser.parse(inputStream, parsingHandler, metadata, context);
} catch (TikaException e) {
if(ignoreTikaException)
log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
.append(". metadata=").append(metadata.toString()).toString());
else
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
if (extractOnly == false) { if (extractOnly == false) {
addDoc(handler); addDoc(handler);
} else { } else {
@ -202,8 +218,6 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
} }
} catch (SAXException e) { } catch (SAXException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} catch (TikaException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally { } finally {
IOUtils.closeQuietly(inputStream); IOUtils.closeQuietly(inputStream);
} }

View File

@ -28,6 +28,11 @@ public interface ExtractingParams {
*/ */
public static final String LOWERNAMES = "lowernames"; public static final String LOWERNAMES = "lowernames";
/**
* If true, ignore TikaException (skip text extraction but still index metadata)
*/
public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
/** /**
* The param prefix for mapping Tika metadata to Solr fields. * The param prefix for mapping Tika metadata to Solr fields.

View File

@ -1,4 +1,4 @@
package org.apache.solr.handler; package org.apache.solr.handler.extraction;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -16,25 +16,23 @@ package org.apache.solr.handler;
* limitations under the License. * limitations under the License.
*/ */
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.common.SolrException;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase; import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException; import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
import org.apache.solr.handler.extraction.ExtractingParams; import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractingRequestHandler; import org.apache.solr.handler.extraction.ExtractingRequestHandler;
import org.apache.solr.handler.extraction.ExtractingDocumentLoader; import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import java.util.List;
import java.util.ArrayList;
import java.io.File;
/** /**
* *
@ -363,6 +361,40 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]"); assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
} }
// SOLR-2480: verifies the new ignoreTikaException flag. A password-protected
// .docx makes Tika throw TikaException during text extraction; by default that
// aborts the update, but with ignoreTikaException=true the document is still
// indexed (metadata/literal fields only, no extracted body text).
@Test
public void testTikaExceptionHandling() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler)
h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
// Case 1 — default behavior: extraction fails, so nothing reaches the index.
try{
loadLocal("password-is-solrcell.docx",
"literal.id", "one");
fail("TikaException is expected because of trying to extract text from password protected word file.");
}
catch(Exception expected){} // expected: SolrException wrapping the TikaException
assertU(commit());
assertQ(req("*:*"), "//result[@numFound=0]");
// Case 2 — same file with ignoreTikaException=true: the TikaException is
// swallowed by the loader and the document is indexed anyway.
try{
loadLocal("password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"fmap.AAPL:Keywords", "ignored_a",
"fmap.xmpTPg:NPages", "ignored_a",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "one",
"ignoreTikaException", "true", // the flag under test (ExtractingParams.IGNORE_TIKA_EXCEPTION)
"fmap.Last-Modified", "extractedDate");
}
catch(Exception e){
fail("TikaException should be ignored.");
}
assertU(commit());
// Exactly the one document from the successful (ignored-exception) load.
assertQ(req("*:*"), "//result[@numFound=1]");
}
SolrQueryResponse loadLocal(String filename, String... args) throws Exception { SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args); LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try { try {