SOLR-8166: Introduce possibility to configure ParseContext in ExtractingRequestHandler/ExtractingDocumentLoader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712629 13f79535-47bb-0310-9956-ffa450edef68
2015-11-04 20:13:40 +00:00 · 2015-11-04 20:13:40 +00:00 · 6f5b28be8a
parent 2bbbc4d42d
commit 6f5b28be8a
9 changed files with 243 additions and 5 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -195,6 +195,10 @@ New Features

 * SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI

+* SOLR-8166: Introduce possibility to configure ParseContext in
+  ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky
+  via Uwe Schindler)
+
 Bug Fixes
 ----------------------

--- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
+++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
@ -91,13 +91,16 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
  private final AddUpdateCommand templateAdd;

  protected TikaConfig config;
+  protected ParseContextConfig parseContextConfig;
  protected SolrContentHandlerFactory factory;

  public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
-                           TikaConfig config, SolrContentHandlerFactory factory) {
+                           TikaConfig config, ParseContextConfig parseContextConfig,
+                                  SolrContentHandlerFactory factory) {
    this.params = req.getParams();
    this.core = req.getCore();
    this.config = config;
+    this.parseContextConfig = parseContextConfig;
    this.processor = processor;

    templateAdd = new AddUpdateCommand(req);
@ -199,7 +202,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {

        try{
          //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          ParseContext context = parseContextConfig.create();
+
+
+          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
--- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
+++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
@ -48,10 +48,12 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement

  private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);

+  public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
  public static final String CONFIG_LOCATION = "tika.config";
  public static final String DATE_FORMATS = "date.formats";

  protected TikaConfig config;
+  protected ParseContextConfig parseContextConfig;


  protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
@ -79,6 +81,16 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
          throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
      }
+
+      String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
+      if (parseContextConfigLoc != null) {
+        try {
+          parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
+        } catch (Exception e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        }
+      }
+
      NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
      if (configDateFormats != null && configDateFormats.size() > 0) {
        dateFormats = new HashSet<>();
@ -97,6 +109,9 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      }
    }
+    if (parseContextConfig == null) {
+      parseContextConfig = new ParseContextConfig();
+    }
    factory = createFactory();
  }

@ -111,7 +126,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase implement

  @Override
  protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
-    return new ExtractingDocumentLoader(req, processor, config, factory);
+    return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
  }

  // ////////////////////// SolrInfoMBeans methods //////////////////////
--- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
+++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
@ -0,0 +1,114 @@
+package org.apache.solr.handler.extraction;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import java.beans.BeanInfo;
+import java.beans.Introspector;
+import java.beans.PropertyDescriptor;
+import java.beans.PropertyEditor;
+import java.beans.PropertyEditorManager;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.tika.parser.ParseContext;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+public class ParseContextConfig {
+  private final Map<Class<?>, Object> entries = new HashMap<>();
+
+  /** Creates an empty Config without any settings (used as placeholder). */
+  public ParseContextConfig() {
+  }
+
+  /** Creates a {@code ParseContextConfig} from the given XML DOM element. */
+  public ParseContextConfig(SolrResourceLoader resourceLoader, Element element) throws Exception {
+    extract(element, resourceLoader);
+  }
+
+  /** Creates a {@code ParseContextConfig} from the given XML file, loaded from the given {@link SolrResourceLoader}. */
+  public ParseContextConfig(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
+    this(resourceLoader, loadConfigFile(resourceLoader, parseContextConfigLoc).getDocumentElement());
+  }
+  
+  private static Document loadConfigFile(SolrResourceLoader resourceLoader, String parseContextConfigLoc) throws Exception {
+    try (InputStream in = resourceLoader.openResource(parseContextConfigLoc)) {
+      return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(in, parseContextConfigLoc);
+    }
+  }
+
+  private void extract(Element element, SolrResourceLoader loader) throws Exception {
+    final NodeList xmlEntries = element.getElementsByTagName("entry");
+    for (int i = 0, c1 = xmlEntries.getLength(); i < c1; i++) {
+      final NamedNodeMap xmlEntryAttributes = xmlEntries.item(i).getAttributes();
+      final String className = xmlEntryAttributes.getNamedItem("class").getNodeValue();
+      final String implementationName = xmlEntryAttributes.getNamedItem("impl").getNodeValue();
+
+      final NodeList xmlProperties = ((Element)xmlEntries.item(i)).getElementsByTagName("property");
+
+      final Class<?> interfaceClass = loader.findClass(className, Object.class);
+      final BeanInfo beanInfo = Introspector.getBeanInfo(interfaceClass, Introspector.IGNORE_ALL_BEANINFO);
+      
+      final HashMap<String, PropertyDescriptor> descriptorMap = new HashMap<>();
+      for (final PropertyDescriptor pd : beanInfo.getPropertyDescriptors()) {
+        descriptorMap.put(pd.getName(), pd);
+      }
+
+      final Object instance = loader.newInstance(implementationName, Object.class);
+      if (!interfaceClass.isInstance(instance)) {
+        throw new IllegalArgumentException("Implementation class does not extend " + interfaceClass.getName());
+      }
+
+      for (int j = 0, c2 = xmlProperties.getLength(); j < c2; j++) {
+        final Node xmlProperty = xmlProperties.item(j);
+        final NamedNodeMap xmlPropertyAttributes = xmlProperty.getAttributes();
+
+        final String propertyName = xmlPropertyAttributes.getNamedItem("name").getNodeValue();
+        final String propertyValue = xmlPropertyAttributes.getNamedItem("value").getNodeValue();
+
+        final PropertyDescriptor propertyDescriptor = descriptorMap.get(propertyName);
+        propertyDescriptor.getWriteMethod().invoke(instance, getValueFromString(propertyDescriptor.getPropertyType(), propertyValue));
+      }
+
+      entries.put(interfaceClass, instance);
+    }
+  }
+
+  private Object getValueFromString(Class<?> targetType, String text) {
+    final PropertyEditor editor = PropertyEditorManager.findEditor(targetType);
+    editor.setAsText(text);
+    return editor.getValue();
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  public ParseContext create() {
+    final ParseContext result = new ParseContext();
+
+    for (Map.Entry<Class<?>, Object> entry : entries.entrySet()){
+      result.set((Class) entry.getKey(), entry.getValue());
+    }
+
+    return result;
+  }
+}
--- a/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
+++ b/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
+++ b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
@ -0,0 +1,22 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<entries>
+  <entry class="org.apache.tika.parser.pdf.PDFParserConfig" impl="org.apache.tika.parser.pdf.PDFParserConfig">
+    <property name="extractInlineImages" value="true"/>
+  </entry>
+</entries>
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
+++ b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
@ -185,7 +185,9 @@
    </lst>
  </requestHandler>

-  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
+  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+    <str name="parseContext.config">parseContext.xml</str>
+  </requestHandler>

  <requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
    <lst name="defaults">
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
+++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
@ -656,6 +656,28 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
    assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
  }

+  @Test
+  public void testPdfWithImages() throws Exception {
+    //Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
+    loadLocal("extraction/pdf-with-image.pdf",
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfWithImage",
+        "resource.name", "pdf-with-image.pdf",
+        "resource.password", "solrRules",
+        "fmap.Last-Modified", "extractedDate");
+
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
+  }
+
  @Test
  public void testPasswordProtected() throws Exception {
    // PDF, Passwords from resource.password
@ -705,7 +727,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {

    // DOCX, Passwords from file
    loadLocal("extraction/password-is-Word2010.docx", 
-        "fmap.created", "extractedDate", 
+        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator", 
        "fmap.Keywords", "extractedKeywords",
--- a/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
+++ b/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
@ -0,0 +1,53 @@
+package org.apache.solr.handler.extraction;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+public class ParseContextConfigTest extends SolrTestCaseJ4 {
+
+  public void  testAll() throws Exception {
+    Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
+    Element entries = document.createElement("entries");
+    Element entry = document.createElement("entry");
+
+
+    entry.setAttribute("class", "org.apache.tika.parser.pdf.PDFParserConfig");
+    entry.setAttribute("impl", "org.apache.tika.parser.pdf.PDFParserConfig");
+
+    Element property = document.createElement("property");
+
+    property.setAttribute("name", "extractInlineImages");
+    property.setAttribute("value", "true");
+    entry.appendChild(property);
+    entries.appendChild(entry);
+
+    ParseContext parseContext = new ParseContextConfig(new SolrResourceLoader("."), entries).create();
+
+    PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+
+    assertEquals(true, pdfParserConfig.getExtractInlineImages());
+  }
+
+}