mirror of https://github.com/apache/lucene.git
SOLR-8166: Introduce possibility to configure ParseContext in ExtractingRequestHandler/ExtractingDocumentLoader
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712629 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2bbbc4d42d
commit
6f5b28be8a
|
@ -195,6 +195,10 @@ New Features
|
|||
|
||||
* SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI
|
||||
|
||||
* SOLR-8166: Add the ability to configure the ParseContext used by
|
||||
ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky
|
||||
via Uwe Schindler)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -91,13 +91,16 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
private final AddUpdateCommand templateAdd;
|
||||
|
||||
protected TikaConfig config;
|
||||
protected ParseContextConfig parseContextConfig;
|
||||
protected SolrContentHandlerFactory factory;
|
||||
|
||||
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
|
||||
TikaConfig config, SolrContentHandlerFactory factory) {
|
||||
TikaConfig config, ParseContextConfig parseContextConfig,
|
||||
SolrContentHandlerFactory factory) {
|
||||
this.params = req.getParams();
|
||||
this.core = req.getCore();
|
||||
this.config = config;
|
||||
this.parseContextConfig = parseContextConfig;
|
||||
this.processor = processor;
|
||||
|
||||
templateAdd = new AddUpdateCommand(req);
|
||||
|
@ -199,7 +202,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
|
||||
try{
|
||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
||||
ParseContext context = parseContextConfig.create();
|
||||
|
||||
|
||||
context.set(Parser.class, parser);
|
||||
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
|
||||
|
||||
// Password handling
|
||||
|
|
|
@ -48,10 +48,12 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
|
||||
private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
|
||||
|
||||
public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
|
||||
public static final String CONFIG_LOCATION = "tika.config";
|
||||
public static final String DATE_FORMATS = "date.formats";
|
||||
|
||||
protected TikaConfig config;
|
||||
protected ParseContextConfig parseContextConfig;
|
||||
|
||||
|
||||
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
|
||||
|
@ -79,6 +81,16 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||
}
|
||||
}
|
||||
|
||||
String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
|
||||
if (parseContextConfigLoc != null) {
|
||||
try {
|
||||
parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
|
||||
} catch (Exception e) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||
}
|
||||
}
|
||||
|
||||
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
|
||||
if (configDateFormats != null && configDateFormats.size() > 0) {
|
||||
dateFormats = new HashSet<>();
|
||||
|
@ -97,6 +109,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||
}
|
||||
}
|
||||
if (parseContextConfig == null) {
|
||||
parseContextConfig = new ParseContextConfig();
|
||||
}
|
||||
factory = createFactory();
|
||||
}
|
||||
|
||||
|
@ -111,7 +126,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
|||
|
||||
@Override
|
||||
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
|
||||
return new ExtractingDocumentLoader(req, processor, config, factory);
|
||||
return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
|
||||
}
|
||||
|
||||
// ////////////////////// SolrInfoMBeans methods //////////////////////
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
package org.apache.solr.handler.extraction;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import java.beans.BeanInfo;
|
||||
import java.beans.Introspector;
|
||||
import java.beans.PropertyDescriptor;
|
||||
import java.beans.PropertyEditor;
|
||||
import java.beans.PropertyEditorManager;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.NamedNodeMap;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
public class ParseContextConfig {
|
||||
private final Map<Class<?>, Object> entries = new HashMap<>();
|
||||
|
||||
/** Creates an empty Config without any settings (used as placeholder). */
|
||||
public ParseContextConfig() {
|
||||
}
|
||||
|
||||
/** Creates a {@code ParseContextConfig} from the given XML DOM element. */
|
||||
public ParseContextConfig(SolrResourceLoader resourceLoader, Element element) throws Exception {
|
||||
extract(element, resourceLoader);
|
||||
}
|
||||
|
||||
/** Creates a {@code ParseContextConfig} from the given XML file, loaded from the given {@link SolrResourceLoader}. */
|
||||
public ParseContextConfig(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
|
||||
this(resourceLoader, loadConfigFile(resourceLoader, parseContextConfigLoc).getDocumentElement());
|
||||
}
|
||||
|
||||
private static Document loadConfigFile(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
|
||||
try (InputStream in = resourceLoader.openResource(parseContextConfigLoc)) {
|
||||
return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in, parseContextConfigLoc);
|
||||
}
|
||||
}
|
||||
|
||||
private void extract(Element element, SolrResourceLoader loader) throws Exception {
|
||||
final NodeList xmlEntries = element.getElementsByTagName("entry");
|
||||
for (int i = 0, c1 = xmlEntries.getLength(); i < c1; i++) {
|
||||
final NamedNodeMap xmlEntryAttributes = xmlEntries.item(i).getAttributes();
|
||||
final String className = xmlEntryAttributes.getNamedItem("class").getNodeValue();
|
||||
final String implementationName = xmlEntryAttributes.getNamedItem("impl").getNodeValue();
|
||||
|
||||
final NodeList xmlProperties = ((Element)xmlEntries.item(i)).getElementsByTagName("property");
|
||||
|
||||
final Class<?> interfaceClass = loader.findClass(className, Object.class);
|
||||
final BeanInfo beanInfo = Introspector.getBeanInfo(interfaceClass, Introspector.IGNORE_ALL_BEANINFO);
|
||||
|
||||
final HashMap<String, PropertyDescriptor> descriptorMap = new HashMap<>();
|
||||
for (final PropertyDescriptor pd : beanInfo.getPropertyDescriptors()) {
|
||||
descriptorMap.put(pd.getName(), pd);
|
||||
}
|
||||
|
||||
final Object instance = loader.newInstance(implementationName, Object.class);
|
||||
if (!interfaceClass.isInstance(instance)) {
|
||||
throw new IllegalArgumentException("Implementation class does not extend " + interfaceClass.getName());
|
||||
}
|
||||
|
||||
for (int j = 0, c2 = xmlProperties.getLength(); j < c2; j++) {
|
||||
final Node xmlProperty = xmlProperties.item(j);
|
||||
final NamedNodeMap xmlPropertyAttributes = xmlProperty.getAttributes();
|
||||
|
||||
final String propertyName = xmlPropertyAttributes.getNamedItem("name").getNodeValue();
|
||||
final String propertyValue = xmlPropertyAttributes.getNamedItem("value").getNodeValue();
|
||||
|
||||
final PropertyDescriptor propertyDescriptor = descriptorMap.get(propertyName);
|
||||
propertyDescriptor.getWriteMethod().invoke(instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
|
||||
}
|
||||
|
||||
entries.put(interfaceClass, instance);
|
||||
}
|
||||
}
|
||||
|
||||
private Object getValueFromString(Class<?> targetType, String text) {
|
||||
final PropertyEditor editor = PropertyEditorManager.findEditor(targetType);
|
||||
editor.setAsText(text);
|
||||
return editor.getValue();
|
||||
}
|
||||
|
||||
@SuppressWarnings({"rawtypes", "unchecked"})
|
||||
public ParseContext create() {
|
||||
final ParseContext result = new ParseContext();
|
||||
|
||||
for (Map.Entry<Class<?>, Object> entry : entries.entrySet()){
|
||||
result.set((Class) entry.getKey(), entry.getValue());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
Binary file not shown.
|
@ -0,0 +1,22 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<entries>
|
||||
<entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
|
||||
<property name="extractInlineImages" value="true"/>
|
||||
</entry>
|
||||
</entries>
|
|
@ -185,7 +185,9 @@
|
|||
</lst>
|
||||
</requestHandler>
|
||||
|
||||
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
|
||||
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
|
||||
<str name="parseContext.config">parseContext.xml</str>
|
||||
</requestHandler>
|
||||
|
||||
<requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
|
||||
<lst name="defaults">
|
||||
|
|
|
@ -656,6 +656,28 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPdfWithImages() throws Exception {
|
||||
//Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
|
||||
loadLocal("extraction/pdf-with-image.pdf",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"uprefix", "ignored_",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "wdf_nocase",
|
||||
"literal.id", "pdfWithImage",
|
||||
"resource.name", "pdf-with-image.pdf",
|
||||
"resource.password", "solrRules",
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
|
||||
assertU(commit());
|
||||
assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPasswordProtected() throws Exception {
|
||||
// PDF, Passwords from resource.password
|
||||
|
@ -705,7 +727,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
|
||||
// DOCX, Passwords from file
|
||||
loadLocal("extraction/password-is-Word2010.docx",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.solr.handler.extraction;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
|
||||
public class ParseContextConfigTest extends SolrTestCaseJ4 {
|
||||
|
||||
public void testAll() throws Exception {
|
||||
Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
|
||||
Element entries = document.createElement("entries");
|
||||
Element entry = document.createElement("entry");
|
||||
|
||||
|
||||
entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
|
||||
entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
|
||||
|
||||
Element property = document.createElement("property");
|
||||
|
||||
property.setAttribute("name", "extractInlineImages");
|
||||
property.setAttribute("value", "true");
|
||||
entry.appendChild(property);
|
||||
entries.appendChild(entry);
|
||||
|
||||
ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader("."), entries).create();
|
||||
|
||||
PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
|
||||
|
||||
assertEquals(true, pdfParserConfig.getExtractInlineImages());
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue