mirror of https://github.com/apache/lucene.git
SOLR-2480: add ignoreTikaException flag
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103120 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4c85f799eb
commit
86faa47fa9
|
@ -22,7 +22,7 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ
|
||||||
|
|
||||||
Current Version: Tika 0.8 (released 11/07/2010)
|
Current Version: Tika 0.8 (released 11/07/2010)
|
||||||
|
|
||||||
$Id:$
|
$Id$
|
||||||
|
|
||||||
================== Release 4.0-dev ==================
|
================== Release 4.0-dev ==================
|
||||||
|
|
||||||
|
@ -30,7 +30,8 @@ $Id:$
|
||||||
|
|
||||||
================== Release 3.2-dev ==================
|
================== Release 3.2-dev ==================
|
||||||
|
|
||||||
(No Changes)
|
* SOLR-2480: Add ignoreTikaException flag so that users can ignore TikaException but index
|
||||||
|
metadata. (Shinichiro Abe, koji)
|
||||||
|
|
||||||
================== Release 3.1-dev ==================
|
================== Release 3.1-dev ==================
|
||||||
|
|
||||||
|
|
|
@ -16,20 +16,27 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.extraction;
|
package org.apache.solr.handler.extraction;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.params.SolrParams;
|
import org.apache.solr.common.params.SolrParams;
|
||||||
import org.apache.solr.common.params.UpdateParams;
|
import org.apache.solr.common.params.UpdateParams;
|
||||||
import org.apache.solr.common.util.ContentStream;
|
import org.apache.solr.common.util.ContentStream;
|
||||||
import org.apache.solr.common.util.NamedList;
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
import org.apache.solr.handler.ContentStreamLoader;
|
||||||
import org.apache.solr.request.SolrQueryRequest;
|
import org.apache.solr.request.SolrQueryRequest;
|
||||||
import org.apache.solr.response.SolrQueryResponse;
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
import org.apache.solr.schema.IndexSchema;
|
import org.apache.solr.schema.IndexSchema;
|
||||||
import org.apache.solr.update.AddUpdateCommand;
|
import org.apache.solr.update.AddUpdateCommand;
|
||||||
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
||||||
import org.apache.solr.handler.ContentStreamLoader;
|
|
||||||
import org.apache.tika.config.TikaConfig;
|
import org.apache.tika.config.TikaConfig;
|
||||||
|
import org.apache.tika.exception.TikaException;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
import org.apache.tika.parser.AutoDetectParser;
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
import org.apache.tika.parser.ParseContext;
|
import org.apache.tika.parser.ParseContext;
|
||||||
import org.apache.tika.parser.Parser;
|
import org.apache.tika.parser.Parser;
|
||||||
|
@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentHandler;
|
||||||
import org.apache.tika.sax.xpath.Matcher;
|
import org.apache.tika.sax.xpath.Matcher;
|
||||||
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||||
import org.apache.tika.sax.xpath.XPathParser;
|
import org.apache.tika.sax.xpath.XPathParser;
|
||||||
import org.apache.tika.exception.TikaException;
|
|
||||||
import org.apache.tika.mime.MediaType;
|
|
||||||
import org.apache.xml.serialize.OutputFormat;
|
|
||||||
import org.apache.xml.serialize.BaseMarkupSerializer;
|
import org.apache.xml.serialize.BaseMarkupSerializer;
|
||||||
import org.apache.xml.serialize.XMLSerializer;
|
import org.apache.xml.serialize.OutputFormat;
|
||||||
import org.apache.xml.serialize.TextSerializer;
|
import org.apache.xml.serialize.TextSerializer;
|
||||||
|
import org.apache.xml.serialize.XMLSerializer;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
import org.xml.sax.ContentHandler;
|
import org.xml.sax.ContentHandler;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.StringWriter;
|
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The class responsible for loading extracted content into Solr.
|
* The class responsible for loading extracted content into Solr.
|
||||||
*
|
*
|
||||||
**/
|
**/
|
||||||
public class ExtractingDocumentLoader extends ContentStreamLoader {
|
public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract Only supported format
|
* Extract Only supported format
|
||||||
*/
|
*/
|
||||||
|
@ -74,6 +79,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
final IndexSchema schema;
|
final IndexSchema schema;
|
||||||
final SolrParams params;
|
final SolrParams params;
|
||||||
final UpdateRequestProcessor processor;
|
final UpdateRequestProcessor processor;
|
||||||
|
final boolean ignoreTikaException;
|
||||||
protected AutoDetectParser autoDetectParser;
|
protected AutoDetectParser autoDetectParser;
|
||||||
|
|
||||||
private final AddUpdateCommand templateAdd;
|
private final AddUpdateCommand templateAdd;
|
||||||
|
@ -95,6 +101,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
//this is lightweight
|
//this is lightweight
|
||||||
autoDetectParser = new AutoDetectParser(config);
|
autoDetectParser = new AutoDetectParser(config);
|
||||||
this.factory = factory;
|
this.factory = factory;
|
||||||
|
|
||||||
|
ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -180,9 +188,17 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
parsingHandler = new MatchingContentHandler(handler, matcher);
|
parsingHandler = new MatchingContentHandler(handler, matcher);
|
||||||
} //else leave it as is
|
} //else leave it as is
|
||||||
|
|
||||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
try{
|
||||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||||
parser.parse(inputStream, parsingHandler, metadata, context);
|
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
||||||
|
parser.parse(inputStream, parsingHandler, metadata, context);
|
||||||
|
} catch (TikaException e) {
|
||||||
|
if(ignoreTikaException)
|
||||||
|
log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
|
||||||
|
.append(". metadata=").append(metadata.toString()).toString());
|
||||||
|
else
|
||||||
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||||
|
}
|
||||||
if (extractOnly == false) {
|
if (extractOnly == false) {
|
||||||
addDoc(handler);
|
addDoc(handler);
|
||||||
} else {
|
} else {
|
||||||
|
@ -202,8 +218,6 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
}
|
}
|
||||||
} catch (SAXException e) {
|
} catch (SAXException e) {
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||||
} catch (TikaException e) {
|
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeQuietly(inputStream);
|
IOUtils.closeQuietly(inputStream);
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,11 @@ public interface ExtractingParams {
|
||||||
*/
|
*/
|
||||||
public static final String LOWERNAMES = "lowernames";
|
public static final String LOWERNAMES = "lowernames";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If true, ignore TikaException (give up extracting text but still index the metadata)
|
||||||
|
*/
|
||||||
|
public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The param prefix for mapping Tika metadata to Solr fields.
|
* The param prefix for mapping Tika metadata to Solr fields.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
package org.apache.solr.handler;
|
package org.apache.solr.handler.extraction;
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -16,25 +16,23 @@ package org.apache.solr.handler;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.solr.SolrTestCaseJ4;
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
import org.apache.solr.request.LocalSolrQueryRequest;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.response.SolrQueryResponse;
|
|
||||||
import org.apache.solr.common.util.ContentStream;
|
import org.apache.solr.common.util.ContentStream;
|
||||||
import org.apache.solr.common.util.ContentStreamBase;
|
import org.apache.solr.common.util.ContentStreamBase;
|
||||||
import org.apache.solr.common.util.NamedList;
|
import org.apache.solr.common.util.NamedList;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
|
||||||
import org.apache.solr.handler.extraction.ExtractingParams;
|
import org.apache.solr.handler.extraction.ExtractingParams;
|
||||||
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
|
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
|
||||||
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
|
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||||
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.io.File;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
@ -363,6 +361,40 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
||||||
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
|
assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTikaExceptionHandling() throws Exception {
|
||||||
|
ExtractingRequestHandler handler = (ExtractingRequestHandler)
|
||||||
|
h.getCore().getRequestHandler("/update/extract");
|
||||||
|
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||||
|
|
||||||
|
try{
|
||||||
|
loadLocal("password-is-solrcell.docx",
|
||||||
|
"literal.id", "one");
|
||||||
|
fail("TikaException is expected because of trying to extract text from password protected word file.");
|
||||||
|
}
|
||||||
|
catch(Exception expected){}
|
||||||
|
assertU(commit());
|
||||||
|
assertQ(req("*:*"), "//result[@numFound=0]");
|
||||||
|
|
||||||
|
try{
|
||||||
|
loadLocal("password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
|
||||||
|
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
|
||||||
|
"fmap.Creation-Date", "extractedDate",
|
||||||
|
"fmap.AAPL:Keywords", "ignored_a",
|
||||||
|
"fmap.xmpTPg:NPages", "ignored_a",
|
||||||
|
"fmap.Author", "extractedAuthor",
|
||||||
|
"fmap.content", "wdf_nocase",
|
||||||
|
"literal.id", "one",
|
||||||
|
"ignoreTikaException", "true", // set ignore flag
|
||||||
|
"fmap.Last-Modified", "extractedDate");
|
||||||
|
}
|
||||||
|
catch(Exception e){
|
||||||
|
fail("TikaException should be ignored.");
|
||||||
|
}
|
||||||
|
assertU(commit());
|
||||||
|
assertQ(req("*:*"), "//result[@numFound=1]");
|
||||||
|
}
|
||||||
|
|
||||||
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
|
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
|
||||||
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
|
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
|
||||||
try {
|
try {
|
Binary file not shown.
Loading…
Reference in New Issue