diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 62596cf2390..d58929c3e49 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -195,6 +195,10 @@ New Features * SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI +* SOLR-8166: Introduce possibility to configure ParseContext in + ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky + via Uwe Schindler) + Bug Fixes ---------------------- diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 29fad26e06a..cc276266a75 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -91,13 +91,16 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { private final AddUpdateCommand templateAdd; protected TikaConfig config; + protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor, - TikaConfig config, SolrContentHandlerFactory factory) { + TikaConfig config, ParseContextConfig parseContextConfig, + SolrContentHandlerFactory factory) { this.params = req.getParams(); this.core = req.getCore(); this.config = config; + this.parseContextConfig = parseContextConfig; this.processor = processor; templateAdd = new AddUpdateCommand(req); @@ -199,7 +202,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { try{ //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document. - ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context? + ParseContext context = parseContextConfig.create(); + + + context.set(Parser.class, parser); context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); // Password handling diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 6d9e31b12a9..4fc9e09654f 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -48,10 +48,12 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class); + public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; public static final String DATE_FORMATS = "date.formats"; protected TikaConfig config; + protected ParseContextConfig parseContextConfig; protected Collection dateFormats = DateUtil.DEFAULT_DATE_FORMATS; @@ -79,6 +81,16 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement throw new SolrException(ErrorCode.SERVER_ERROR, e); } } + + String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG); + if (parseContextConfigLoc != null) { + try { + parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc); + } catch (Exception e) { + throw new SolrException(ErrorCode.SERVER_ERROR, e); + } + } + NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS); if (configDateFormats != null && configDateFormats.size() > 0) { dateFormats = new HashSet<>(); @@ -97,6 +109,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement throw new SolrException(ErrorCode.SERVER_ERROR, e); } } + if (parseContextConfig == null) { + parseContextConfig = new ParseContextConfig(); + } factory = createFactory(); } @@ -111,7 +126,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - return new ExtractingDocumentLoader(req, processor, config, factory); + return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java new file mode 100644 index 00000000000..d92ebd0a621 --- /dev/null +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java @@ -0,0 +1,114 @@ +package org.apache.solr.handler.extraction; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import javax.xml.parsers.DocumentBuilderFactory; +import java.beans.BeanInfo; +import java.beans.Introspector; +import java.beans.PropertyDescriptor; +import java.beans.PropertyEditor; +import java.beans.PropertyEditorManager; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.solr.core.SolrResourceLoader; +import org.apache.tika.parser.ParseContext; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +public class ParseContextConfig { + private final Map, Object> entries = new HashMap<>(); + + /** Creates an empty Config without any settings (used as placeholder). */ + public ParseContextConfig() { + } + + /** Creates a {@code ParseContextConfig} from the given XML DOM element. */ + public ParseContextConfig(SolrResourceLoader resourceLoader, Element element) throws Exception { + extract(element, resourceLoader); + } + + /** Creates a {@code ParseContextConfig} from the given XML file, loaded from the given {@link SolrResourceLoader}. */ + public ParseContextConfig(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception { + this(resourceLoader, loadConfigFile(resourceLoader, parseContextConfigLoc).getDocumentElement()); + } + + private static Document loadConfigFile(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception { + try (InputStream in = resourceLoader.openResource(parseContextConfigLoc)) { + return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in, parseContextConfigLoc); + } + } + + private void extract(Element element, SolrResourceLoader loader) throws Exception { + final NodeList xmlEntries = element.getElementsByTagName("entry"); + for (int i = 0, c1 = xmlEntries.getLength(); i < c1; i++) { + final NamedNodeMap xmlEntryAttributes = xmlEntries.item(i).getAttributes(); + final String className = xmlEntryAttributes.getNamedItem("class").getNodeValue(); + final String implementationName = xmlEntryAttributes.getNamedItem("impl").getNodeValue(); + + final NodeList xmlProperties = ((Element)xmlEntries.item(i)).getElementsByTagName("property"); + + final Class interfaceClass = loader.findClass(className, Object.class); + final BeanInfo beanInfo = Introspector.getBeanInfo(interfaceClass, Introspector.IGNORE_ALL_BEANINFO); + + final HashMap descriptorMap = new HashMap<>(); + for (final PropertyDescriptor pd : beanInfo.getPropertyDescriptors()) { + descriptorMap.put(pd.getName(), pd); + } + + final Object instance = loader.newInstance(implementationName, Object.class); + if (!interfaceClass.isInstance(instance)) { + throw new IllegalArgumentException("Implementation class does not extend " + interfaceClass.getName()); + } + + for (int j = 0, c2 = xmlProperties.getLength(); j < c2; j++) { + final Node xmlProperty = xmlProperties.item(j); + final NamedNodeMap xmlPropertyAttributes = xmlProperty.getAttributes(); + + final String propertyName = xmlPropertyAttributes.getNamedItem("name").getNodeValue(); + final String propertyValue = xmlPropertyAttributes.getNamedItem("value").getNodeValue(); + + final PropertyDescriptor propertyDescriptor = descriptorMap.get(propertyName); + propertyDescriptor.getWriteMethod().invoke(instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue)); + } + + entries.put(interfaceClass, instance); + } + } + + private Object getValueFromString(Class targetType, String text) { + final PropertyEditor editor = PropertyEditorManager.findEditor(targetType); + editor.setAsText(text); + return editor.getValue(); + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + public ParseContext create() { + final ParseContext result = new ParseContext(); + + for (Map.Entry, Object> entry : entries.entrySet()){ + result.set((Class) entry.getKey(), entry.getValue()); + } + + return result; + } +} diff --git a/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf b/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf new file mode 100644 index 00000000000..b168951c345 Binary files /dev/null and b/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf differ diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml new file mode 100644 index 00000000000..574c808c462 --- /dev/null +++ b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml @@ -0,0 +1,22 @@ + + + + + + + diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml index 43f1c1d6745..bb833563546 100644 --- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml +++ b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml @@ -185,7 +185,9 @@ - + + parseContext.xml + diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 4104d4bb44c..318ffd0cdba 100644 --- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -656,6 +656,28 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']"); } + @Test + public void testPdfWithImages() throws Exception { + //Tests possibility to configure ParseContext (by example to extract embedded images from pdf) + loadLocal("extraction/pdf-with-image.pdf", + "fmap.created", "extractedDate", + "fmap.producer", "extractedProducer", + "fmap.creator", "extractedCreator", + "fmap.Keywords", "extractedKeywords", + "fmap.Creation-Date", "extractedDate", + "uprefix", "ignored_", + "fmap.Author", "extractedAuthor", + "fmap.content", "wdf_nocase", + "literal.id", "pdfWithImage", + "resource.name", "pdf-with-image.pdf", + "resource.password", "solrRules", + "fmap.Last-Modified", "extractedDate"); + + assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']"); + assertU(commit()); + assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']"); + } + @Test public void testPasswordProtected() throws Exception { // PDF, Passwords from resource.password @@ -705,7 +727,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { // DOCX, Passwords from file loadLocal("extraction/password-is-Word2010.docx", - "fmap.created", "extractedDate", + "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", diff --git a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java new file mode 100644 index 00000000000..f9f7bcb507f --- /dev/null +++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java @@ -0,0 +1,53 @@ +package org.apache.solr.handler.extraction; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import javax.xml.parsers.DocumentBuilderFactory; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.pdf.PDFParserConfig; +import org.w3c.dom.Document; +import org.w3c.dom.Element; + +public class ParseContextConfigTest extends SolrTestCaseJ4 { + + public void testAll() throws Exception { + Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); + Element entries = document.createElement("entries"); + Element entry = document.createElement("entry"); + + + entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig"); + entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig"); + + Element property = document.createElement("property"); + + property.setAttribute("name", "extractInlineImages"); + property.setAttribute("value", "true"); + entry.appendChild(property); + entries.appendChild(entry); + + ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader("."), entries).create(); + + PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class); + + assertEquals(true, pdfParserConfig.getExtractInlineImages()); + } + +}