SOLR-8166: Introduce possibility to configure ParseContext in ExtractingRequestHandler/ExtractingDocumentLoader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712629 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2015-11-04 20:13:40 +00:00
parent 2bbbc4d42d
commit 6f5b28be8a
9 changed files with 243 additions and 5 deletions

View File

@ -195,6 +195,10 @@ New Features
* SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI
* SOLR-8166: Introduce possibility to configure ParseContext in
ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky
via Uwe Schindler)
Bug Fixes
----------------------

View File

@ -91,13 +91,16 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
private final AddUpdateCommand templateAdd;
protected TikaConfig config;
protected ParseContextConfig parseContextConfig;
protected SolrContentHandlerFactory factory;
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
TikaConfig config, SolrContentHandlerFactory factory) {
TikaConfig config, ParseContextConfig parseContextConfig,
SolrContentHandlerFactory factory) {
this.params = req.getParams();
this.core = req.getCore();
this.config = config;
this.parseContextConfig = parseContextConfig;
this.processor = processor;
templateAdd = new AddUpdateCommand(req);
@ -199,7 +202,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
try{
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
ParseContext context = parseContextConfig.create();
context.set(Parser.class, parser);
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
// Password handling

View File

@ -48,10 +48,12 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
public static final String CONFIG_LOCATION = "tika.config";
public static final String DATE_FORMATS = "date.formats";
protected TikaConfig config;
protected ParseContextConfig parseContextConfig;
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
@ -79,6 +81,16 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
if (parseContextConfigLoc != null) {
try {
parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
} catch (Exception e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
if (configDateFormats != null && configDateFormats.size() > 0) {
dateFormats = new HashSet<>();
@ -97,6 +109,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
if (parseContextConfig == null) {
parseContextConfig = new ParseContextConfig();
}
factory = createFactory();
}
@ -111,7 +126,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
@Override
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
return new ExtractingDocumentLoader(req, processor, config, factory);
return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
}
// ////////////////////// SolrInfoMBeans methods //////////////////////

View File

@ -0,0 +1,114 @@
package org.apache.solr.handler.extraction;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import javax.xml.parsers.DocumentBuilderFactory;
import java.beans.BeanInfo;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.beans.PropertyEditor;
import java.beans.PropertyEditorManager;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.tika.parser.ParseContext;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Holds a set of pre-configured objects, keyed by class, that are copied into every
 * Tika {@link ParseContext} produced by {@link #create()}.
 *
 * <p>The configuration is read from an XML element containing {@code <entry>} elements:
 * <pre>
 * &lt;entries&gt;
 *   &lt;entry class="key.Class" impl="implementation.Class"&gt;
 *     &lt;property name="beanProperty" value="text"/&gt;
 *   &lt;/entry&gt;
 * &lt;/entries&gt;
 * </pre>
 * {@code class} is the type under which the object is registered in the {@link ParseContext},
 * {@code impl} is the class that gets instantiated, and every {@code property} is applied
 * through the matching bean setter declared on {@code class}.
 */
public class ParseContextConfig {
  /** Configured instances, keyed by the class they are registered under in the ParseContext. */
  private final Map<Class<?>, Object> entries = new HashMap<>();

  /** Creates an empty Config without any settings (used as placeholder). */
  public ParseContextConfig() {
  }

  /**
   * Creates a {@code ParseContextConfig} from the given XML DOM element.
   *
   * @param resourceLoader loader used to resolve and instantiate the configured classes
   * @param element element whose descendant {@code <entry>} elements are read
   * @throws IllegalArgumentException if an entry or property is malformed (missing attribute,
   *         unknown or read-only property, unsupported property type, or an implementation
   *         that is not an instance of the declared class)
   * @throws Exception if a class cannot be loaded or a setter invocation fails
   */
  public ParseContextConfig(SolrResourceLoader resourceLoader, Element element) throws Exception {
    extract(element, resourceLoader);
  }

  /**
   * Creates a {@code ParseContextConfig} from the given XML file, loaded from the given {@link SolrResourceLoader}.
   *
   * @param resourceLoader loader used to open the file and to resolve the configured classes
   * @param parseContextConfigLoc resource name of the XML file
   * @throws Exception if the file cannot be read or parsed, or its content is malformed
   */
  public ParseContextConfig(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
    this(resourceLoader, loadConfigFile(resourceLoader, parseContextConfigLoc).getDocumentElement());
  }

  /** Opens the named resource and parses it into a DOM document; the stream is always closed. */
  private static Document loadConfigFile(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
    try (InputStream in = resourceLoader.openResource(parseContextConfigLoc)) {
      // The resource name is passed as system id so parser errors can point at the file.
      return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in, parseContextConfigLoc);
    }
  }

  /** Reads every {@code <entry>} below {@code element}, builds the configured instance and stores it. */
  private void extract(Element element, SolrResourceLoader loader) throws Exception {
    final NodeList xmlEntries = element.getElementsByTagName("entry");
    for (int i = 0, c1 = xmlEntries.getLength(); i < c1; i++) {
      // getElementsByTagName only returns elements, so the cast is safe.
      final Element xmlEntry = (Element) xmlEntries.item(i);
      final String className = requiredAttribute(xmlEntry, "class");
      final String implementationName = requiredAttribute(xmlEntry, "impl");

      final NodeList xmlProperties = xmlEntry.getElementsByTagName("property");

      // Bean properties are looked up on the declared (key) class, not on the implementation.
      final Class<?> interfaceClass = loader.findClass(className, Object.class);
      final BeanInfo beanInfo = Introspector.getBeanInfo(interfaceClass, Introspector.IGNORE_ALL_BEANINFO);

      final HashMap<String, PropertyDescriptor> descriptorMap = new HashMap<>();
      for (final PropertyDescriptor pd : beanInfo.getPropertyDescriptors()) {
        descriptorMap.put(pd.getName(), pd);
      }

      final Object instance = loader.newInstance(implementationName, Object.class);
      if (!interfaceClass.isInstance(instance)) {
        throw new IllegalArgumentException("Implementation class does not extend " + interfaceClass.getName());
      }

      for (int j = 0, c2 = xmlProperties.getLength(); j < c2; j++) {
        final Element xmlProperty = (Element) xmlProperties.item(j);
        final String propertyName = requiredAttribute(xmlProperty, "name");
        final String propertyValue = requiredAttribute(xmlProperty, "value");

        final PropertyDescriptor propertyDescriptor = descriptorMap.get(propertyName);
        if (propertyDescriptor == null) {
          // Fail with the offending name instead of a bare NullPointerException.
          throw new IllegalArgumentException("Unknown bean property " + propertyName
              + " in class " + interfaceClass.getName());
        }
        final java.lang.reflect.Method writeMethod = propertyDescriptor.getWriteMethod();
        if (writeMethod == null) {
          // Read-only properties have a descriptor but no setter.
          throw new IllegalArgumentException("Cannot set bean property " + propertyName
              + " in class " + interfaceClass.getName() + " (no write method available)");
        }
        writeMethod.invoke(instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
      }

      entries.put(interfaceClass, instance);
    }
  }

  /**
   * Returns the value of a mandatory XML attribute.
   *
   * @throws IllegalArgumentException if the attribute is not present on the element
   */
  private static String requiredAttribute(Element element, String name) {
    final NamedNodeMap attributes = element.getAttributes();
    final Node attribute = attributes.getNamedItem(name);
    if (attribute == null) {
      throw new IllegalArgumentException("Missing required attribute '" + name
          + "' on <" + element.getTagName() + "> element");
    }
    return attribute.getNodeValue();
  }

  /**
   * Converts {@code text} to {@code targetType} using the {@link PropertyEditor} registered for that type.
   *
   * @throws IllegalArgumentException if no editor is available for {@code targetType}
   */
  private Object getValueFromString(Class<?> targetType, String text) {
    final PropertyEditor editor = PropertyEditorManager.findEditor(targetType);
    if (editor == null) {
      // findEditor returns null when no editor can be located for the type.
      throw new IllegalArgumentException("Cannot set properties of type " + targetType.getName());
    }
    editor.setAsText(text);
    return editor.getValue();
  }

  /**
   * Creates a new {@link ParseContext} populated with all configured entries.
   * The configured instances themselves are not copied: every returned context
   * references the same objects.
   */
  @SuppressWarnings({"rawtypes", "unchecked"})
  public ParseContext create() {
    final ParseContext result = new ParseContext();
    for (Map.Entry<Class<?>, Object> entry : entries.entrySet()){
      // Raw cast is safe: extract() only stores instances that passed isInstance for the key.
      result.set((Class) entry.getKey(), entry.getValue());
    }
    return result;
  }
}

View File

@ -0,0 +1,22 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<entries>
<!--
  Each entry registers one object in the Tika ParseContext built by ParseContextConfig:
    class : type under which the object is registered in the ParseContext
    impl  : class that is instantiated; the instance must be an instance of "class"
  Each nested property is applied through the bean setter for "name" as declared on "class";
  "value" is converted from text by the java.beans.PropertyEditor found for the property type.
-->
<entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
<!-- Sets the extractInlineImages bean property of PDFParserConfig to true. -->
<property name="extractInlineImages" value="true"/>
</entry>
</entries>

View File

@ -185,7 +185,9 @@
</lst>
</requestHandler>
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
<str name="parseContext.config">parseContext.xml</str>
</requestHandler>
<requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
<lst name="defaults">

View File

@ -656,6 +656,28 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
}
@Test
public void testPdfWithImages() throws Exception {
  // Exercises the configurable ParseContext, using extraction of images embedded in a PDF as the example.
  final String embeddedImageQuery = "wdf_nocase:\"embedded:image0.jpg\"";

  loadLocal("extraction/pdf-with-image.pdf",
      "fmap.created", "extractedDate",
      "fmap.producer", "extractedProducer",
      "fmap.creator", "extractedCreator",
      "fmap.Keywords", "extractedKeywords",
      "fmap.Creation-Date", "extractedDate",
      "uprefix", "ignored_",
      "fmap.Author", "extractedAuthor",
      "fmap.content", "wdf_nocase",
      "literal.id", "pdfWithImage",
      "resource.name", "pdf-with-image.pdf",
      "resource.password", "solrRules",
      "fmap.Last-Modified", "extractedDate");

  // Before the commit the query is expected to match nothing.
  assertQ(req(embeddedImageQuery), "//*[@numFound='0']");

  assertU(commit());

  // After the commit exactly one document must match the embedded image marker.
  assertQ(req(embeddedImageQuery), "//*[@numFound='1']");
}
@Test
public void testPasswordProtected() throws Exception {
// PDF, Passwords from resource.password
@ -705,7 +727,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
// DOCX, Passwords from file
loadLocal("extraction/password-is-Word2010.docx",
"fmap.created", "extractedDate",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",

View File

@ -0,0 +1,53 @@
package org.apache.solr.handler.extraction;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * Checks that {@link ParseContextConfig} turns an in-memory XML configuration into a
 * {@link ParseContext} carrying the configured object.
 */
public class ParseContextConfigTest extends SolrTestCaseJ4 {

  /**
   * Builds an {@code <entries>} element with a single {@code PDFParserConfig} entry that sets
   * {@code extractInlineImages} to {@code true}, and verifies the created context exposes it.
   */
  public void testAll() throws Exception {
    final Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

    // <property name="extractInlineImages" value="true"/>
    final Element inlineImagesProperty = dom.createElement("property");
    inlineImagesProperty.setAttribute("name", "extractInlineImages");
    inlineImagesProperty.setAttribute("value", "true");

    // <entry class="...PDFParserConfig" impl="...PDFParserConfig"> wrapping the property
    final Element pdfEntry = dom.createElement("entry");
    pdfEntry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
    pdfEntry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
    pdfEntry.appendChild(inlineImagesProperty);

    // <entries> root handed to the config under test
    final Element root = dom.createElement("entries");
    root.appendChild(pdfEntry);

    final ParseContextConfig config = new ParseContextConfig(new SolrResourceLoader("."), root);
    final ParseContext parseContext = config.create();

    final PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
    assertEquals(true, pdfParserConfig.getExtractInlineImages());
  }
}