mirror of https://github.com/apache/lucene.git
SOLR-8166: Introduce possibility to configure ParseContext in ExtractingRequestHandler/ExtractingDocumentLoader
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712629 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2bbbc4d42d
commit
6f5b28be8a
|
@ -195,6 +195,10 @@ New Features
|
||||||
|
|
||||||
* SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI
|
* SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI
|
||||||
|
|
||||||
|
* SOLR-8166: Introduce possibility to configure ParseContext in
|
||||||
|
ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky
|
||||||
|
via Uwe Schindler)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -91,13 +91,16 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
private final AddUpdateCommand templateAdd;
|
private final AddUpdateCommand templateAdd;
|
||||||
|
|
||||||
protected TikaConfig config;
|
protected TikaConfig config;
|
||||||
|
protected ParseContextConfig parseContextConfig;
|
||||||
protected SolrContentHandlerFactory factory;
|
protected SolrContentHandlerFactory factory;
|
||||||
|
|
||||||
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
|
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
|
||||||
TikaConfig config, SolrContentHandlerFactory factory) {
|
TikaConfig config, ParseContextConfig parseContextConfig,
|
||||||
|
SolrContentHandlerFactory factory) {
|
||||||
this.params = req.getParams();
|
this.params = req.getParams();
|
||||||
this.core = req.getCore();
|
this.core = req.getCore();
|
||||||
this.config = config;
|
this.config = config;
|
||||||
|
this.parseContextConfig = parseContextConfig;
|
||||||
this.processor = processor;
|
this.processor = processor;
|
||||||
|
|
||||||
templateAdd = new AddUpdateCommand(req);
|
templateAdd = new AddUpdateCommand(req);
|
||||||
|
@ -199,7 +202,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
|
|
||||||
try{
|
try{
|
||||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
ParseContext context = parseContextConfig.create();
|
||||||
|
|
||||||
|
|
||||||
|
context.set(Parser.class, parser);
|
||||||
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
|
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
|
||||||
|
|
||||||
// Password handling
|
// Password handling
|
||||||
|
|
|
@ -48,10 +48,12 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
||||||
|
|
||||||
private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
|
private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
|
||||||
|
|
||||||
|
public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
|
||||||
public static final String CONFIG_LOCATION = "tika.config";
|
public static final String CONFIG_LOCATION = "tika.config";
|
||||||
public static final String DATE_FORMATS = "date.formats";
|
public static final String DATE_FORMATS = "date.formats";
|
||||||
|
|
||||||
protected TikaConfig config;
|
protected TikaConfig config;
|
||||||
|
protected ParseContextConfig parseContextConfig;
|
||||||
|
|
||||||
|
|
||||||
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
|
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
|
||||||
|
@ -79,6 +81,16 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
||||||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
|
||||||
|
if (parseContextConfigLoc != null) {
|
||||||
|
try {
|
||||||
|
parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
|
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
|
||||||
if (configDateFormats != null && configDateFormats.size() > 0) {
|
if (configDateFormats != null && configDateFormats.size() > 0) {
|
||||||
dateFormats = new HashSet<>();
|
dateFormats = new HashSet<>();
|
||||||
|
@ -97,6 +109,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
||||||
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
throw new SolrException(ErrorCode.SERVER_ERROR, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (parseContextConfig == null) {
|
||||||
|
parseContextConfig = new ParseContextConfig();
|
||||||
|
}
|
||||||
factory = createFactory();
|
factory = createFactory();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -111,7 +126,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
|
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
|
||||||
return new ExtractingDocumentLoader(req, processor, config, factory);
|
return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ////////////////////// SolrInfoMBeans methods //////////////////////
|
// ////////////////////// SolrInfoMBeans methods //////////////////////
|
||||||
|
|
|
@ -0,0 +1,114 @@
|
||||||
|
package org.apache.solr.handler.extraction;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
|
import java.beans.BeanInfo;
|
||||||
|
import java.beans.Introspector;
|
||||||
|
import java.beans.PropertyDescriptor;
|
||||||
|
import java.beans.PropertyEditor;
|
||||||
|
import java.beans.PropertyEditorManager;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.solr.core.SolrResourceLoader;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Element;
|
||||||
|
import org.w3c.dom.NamedNodeMap;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
|
||||||
|
public class ParseContextConfig {
|
||||||
|
private final Map<Class<?>, Object> entries = new HashMap<>();
|
||||||
|
|
||||||
|
/** Creates an empty Config without any settings (used as placeholder). */
|
||||||
|
public ParseContextConfig() {
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Creates a {@code ParseContextConfig} from the given XML DOM element. */
|
||||||
|
public ParseContextConfig(SolrResourceLoader resourceLoader, Element element) throws Exception {
|
||||||
|
extract(element, resourceLoader);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Creates a {@code ParseContextConfig} from the given XML file, loaded from the given {@link SolrResourceLoader}. */
|
||||||
|
public ParseContextConfig(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
|
||||||
|
this(resourceLoader, loadConfigFile(resourceLoader, parseContextConfigLoc).getDocumentElement());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Document loadConfigFile(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
|
||||||
|
try (InputStream in = resourceLoader.openResource(parseContextConfigLoc)) {
|
||||||
|
return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in, parseContextConfigLoc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void extract(Element element, SolrResourceLoader loader) throws Exception {
|
||||||
|
final NodeList xmlEntries = element.getElementsByTagName("entry");
|
||||||
|
for (int i = 0, c1 = xmlEntries.getLength(); i < c1; i++) {
|
||||||
|
final NamedNodeMap xmlEntryAttributes = xmlEntries.item(i).getAttributes();
|
||||||
|
final String className = xmlEntryAttributes.getNamedItem("class").getNodeValue();
|
||||||
|
final String implementationName = xmlEntryAttributes.getNamedItem("impl").getNodeValue();
|
||||||
|
|
||||||
|
final NodeList xmlProperties = ((Element)xmlEntries.item(i)).getElementsByTagName("property");
|
||||||
|
|
||||||
|
final Class<?> interfaceClass = loader.findClass(className, Object.class);
|
||||||
|
final BeanInfo beanInfo = Introspector.getBeanInfo(interfaceClass, Introspector.IGNORE_ALL_BEANINFO);
|
||||||
|
|
||||||
|
final HashMap<String, PropertyDescriptor> descriptorMap = new HashMap<>();
|
||||||
|
for (final PropertyDescriptor pd : beanInfo.getPropertyDescriptors()) {
|
||||||
|
descriptorMap.put(pd.getName(), pd);
|
||||||
|
}
|
||||||
|
|
||||||
|
final Object instance = loader.newInstance(implementationName, Object.class);
|
||||||
|
if (!interfaceClass.isInstance(instance)) {
|
||||||
|
throw new IllegalArgumentException("Implementation class does not extend " + interfaceClass.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 0, c2 = xmlProperties.getLength(); j < c2; j++) {
|
||||||
|
final Node xmlProperty = xmlProperties.item(j);
|
||||||
|
final NamedNodeMap xmlPropertyAttributes = xmlProperty.getAttributes();
|
||||||
|
|
||||||
|
final String propertyName = xmlPropertyAttributes.getNamedItem("name").getNodeValue();
|
||||||
|
final String propertyValue = xmlPropertyAttributes.getNamedItem("value").getNodeValue();
|
||||||
|
|
||||||
|
final PropertyDescriptor propertyDescriptor = descriptorMap.get(propertyName);
|
||||||
|
propertyDescriptor.getWriteMethod().invoke(instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
|
||||||
|
}
|
||||||
|
|
||||||
|
entries.put(interfaceClass, instance);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Object getValueFromString(Class<?> targetType, String text) {
|
||||||
|
final PropertyEditor editor = PropertyEditorManager.findEditor(targetType);
|
||||||
|
editor.setAsText(text);
|
||||||
|
return editor.getValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings({"rawtypes", "unchecked"})
|
||||||
|
public ParseContext create() {
|
||||||
|
final ParseContext result = new ParseContext();
|
||||||
|
|
||||||
|
for (Map.Entry<Class<?>, Object> entry : entries.entrySet()){
|
||||||
|
result.set((Class) entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
|
@ -0,0 +1,22 @@
|
||||||
|
<?xml version="1.0" ?>
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<entries>
|
||||||
|
<entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
|
||||||
|
<property name="extractInlineImages" value="true"/>
|
||||||
|
</entry>
|
||||||
|
</entries>
|
|
@ -185,7 +185,9 @@
|
||||||
</lst>
|
</lst>
|
||||||
</requestHandler>
|
</requestHandler>
|
||||||
|
|
||||||
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
|
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
|
||||||
|
<str name="parseContext.config">parseContext.xml</str>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
<requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
|
<requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
|
||||||
<lst name="defaults">
|
<lst name="defaults">
|
||||||
|
|
|
@ -656,6 +656,28 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
||||||
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
|
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPdfWithImages() throws Exception {
|
||||||
|
//Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
|
||||||
|
loadLocal("extraction/pdf-with-image.pdf",
|
||||||
|
"fmap.created", "extractedDate",
|
||||||
|
"fmap.producer", "extractedProducer",
|
||||||
|
"fmap.creator", "extractedCreator",
|
||||||
|
"fmap.Keywords", "extractedKeywords",
|
||||||
|
"fmap.Creation-Date", "extractedDate",
|
||||||
|
"uprefix", "ignored_",
|
||||||
|
"fmap.Author", "extractedAuthor",
|
||||||
|
"fmap.content", "wdf_nocase",
|
||||||
|
"literal.id", "pdfWithImage",
|
||||||
|
"resource.name", "pdf-with-image.pdf",
|
||||||
|
"resource.password", "solrRules",
|
||||||
|
"fmap.Last-Modified", "extractedDate");
|
||||||
|
|
||||||
|
assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
|
||||||
|
assertU(commit());
|
||||||
|
assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPasswordProtected() throws Exception {
|
public void testPasswordProtected() throws Exception {
|
||||||
// PDF, Passwords from resource.password
|
// PDF, Passwords from resource.password
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
package org.apache.solr.handler.extraction;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
|
import org.apache.solr.core.SolrResourceLoader;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Element;
|
||||||
|
|
||||||
|
public class ParseContextConfigTest extends SolrTestCaseJ4 {
|
||||||
|
|
||||||
|
public void testAll() throws Exception {
|
||||||
|
Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
|
||||||
|
Element entries = document.createElement("entries");
|
||||||
|
Element entry = document.createElement("entry");
|
||||||
|
|
||||||
|
|
||||||
|
entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
|
||||||
|
entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
|
||||||
|
|
||||||
|
Element property = document.createElement("property");
|
||||||
|
|
||||||
|
property.setAttribute("name", "extractInlineImages");
|
||||||
|
property.setAttribute("value", "true");
|
||||||
|
entry.appendChild(property);
|
||||||
|
entries.appendChild(entry);
|
||||||
|
|
||||||
|
ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader("."), entries).create();
|
||||||
|
|
||||||
|
PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
|
||||||
|
|
||||||
|
assertEquals(true, pdfParserConfig.getExtractInlineImages());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue