From dff17dc8a74440f189ccf95c7e19922a0828df46 Mon Sep 17 00:00:00 2001
From: Shalin Shekhar Mangar
Date: Wed, 12 Nov 2008 10:29:49 +0000
Subject: [PATCH] SOLR-833 -- A DataSource to read data from a field as a
reader. This can be used, for example, to read XMLs residing as CLOBs or
BLOBs in databases.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@713343 13f79535-47bb-0310-9956-ffa450edef68
---
contrib/dataimporthandler/CHANGES.txt | 4 +
.../solr/handler/dataimport/DataImporter.java | 53 +++++---
.../solr/handler/dataimport/DocBuilder.java | 4 +-
.../dataimport/FieldReaderDataSource.java | 122 ++++++++++++++++++
.../handler/dataimport/TemplateString.java | 2 +
.../dataimport/XPathEntityProcessor.java | 10 +-
.../handler/dataimport/TestFieldReader.java | 66 ++++++++++
7 files changed, 232 insertions(+), 29 deletions(-)
create mode 100644 contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java
create mode 100644 contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java
diff --git a/contrib/dataimporthandler/CHANGES.txt b/contrib/dataimporthandler/CHANGES.txt
index b25b12c26bf..1687260c739 100644
--- a/contrib/dataimporthandler/CHANGES.txt
+++ b/contrib/dataimporthandler/CHANGES.txt
@@ -28,6 +28,10 @@ New Features
3. SOLR-842: Better error handling in DataImportHandler with options to abort, skip and continue imports.
(Noble Paul, shalin)
+4. SOLR-833: A DataSource to read data from a field as a reader. This can be used, for example, to read XMLs
+ residing as CLOBs or BLOBs in databases.
+ (Noble Paul via shalin)
+
Optimizations
----------------------
diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java
index 0d4cfd0be29..56f9d623d74 100644
--- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java
+++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java
@@ -64,7 +64,7 @@ public class DataImporter {
private Properties store = new Properties();
- private Map dataSourceProps;
+ private Map dataSourceProps = new HashMap();
private IndexSchema schema;
@@ -159,6 +159,15 @@ public class DataImporter {
}
+ /** Used by tests: passes configStr to loadDataConfig, then calls initEntity on the first entity of the first document with an empty field map.
+ */
+ void loadAndInit(String configStr){
+ loadDataConfig(configStr);
+ Map fields = new HashMap();
+ DataConfig.Entity e = getConfig().documents.get(0).entities.get(0);
+ initEntity(e, fields, false);
+ }
+
void loadDataConfig(String configFile) {
try {
@@ -193,27 +202,29 @@ public class DataImporter {
if (e.fields != null) {
for (DataConfig.Field f : e.fields) {
f.nameOrColName = f.getName();
- SchemaField schemaField = schema.getFieldOrNull(f.getName());
- if (schemaField != null) {
- f.multiValued = schemaField.multiValued();
- f.allAttributes.put(MULTI_VALUED, Boolean.toString(schemaField
- .multiValued()));
- f.allAttributes.put(TYPE, schemaField.getType().getTypeName());
- f.allAttributes.put("indexed", Boolean
- .toString(schemaField.indexed()));
- f.allAttributes.put("stored", Boolean.toString(schemaField.stored()));
- f.allAttributes.put("defaultValue", schemaField.getDefaultValue());
- } else {
+ if (schema != null) {
+ SchemaField schemaField = schema.getFieldOrNull(f.getName());
+ if (schemaField != null) {
+ f.multiValued = schemaField.multiValued();
+ f.allAttributes.put(MULTI_VALUED, Boolean.toString(schemaField
+ .multiValued()));
+ f.allAttributes.put(TYPE, schemaField.getType().getTypeName());
+ f.allAttributes.put("indexed", Boolean
+ .toString(schemaField.indexed()));
+ f.allAttributes.put("stored", Boolean.toString(schemaField.stored()));
+ f.allAttributes.put("defaultValue", schemaField.getDefaultValue());
+ } else {
- try {
- f.allAttributes.put(TYPE, schema.getDynamicFieldType(f.getName())
- .getTypeName());
- f.allAttributes.put(MULTI_VALUED, "true");
- f.multiValued = true;
- } catch (RuntimeException e2) {
- LOG.info("Field in data-config.xml - " + f.getName()
- + " not found in schema.xml");
- f.toWrite = false;
+ try {
+ f.allAttributes.put(TYPE, schema.getDynamicFieldType(f.getName())
+ .getTypeName());
+ f.allAttributes.put(MULTI_VALUED, "true");
+ f.multiValued = true;
+ } catch (RuntimeException e2) {
+ LOG.info("Field in data-config.xml - " + f.getName()
+ + " not found in schema.xml");
+ f.toWrite = false;
+ }
}
}
fields.put(f.getName(), f);
diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java
index afea626f0b7..9d4bb78ab17 100644
--- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java
+++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java
@@ -370,7 +370,7 @@ public class DocBuilder {
continue;
}
DataConfig.Field field = entity.colNameVsField.get(key);
- if (field == null) {
+ if (field == null && dataImporter.getSchema() != null) {
// This can be a dynamic field or a field which does not have an entry in data-config ( an implicit field)
SchemaField sf = dataImporter.getSchema().getFieldOrNull(key);
if (sf == null) {
@@ -381,7 +381,7 @@ public class DocBuilder {
}
//else do nothing. if we add it it may fail
} else {
- if (field.toWrite) {
+ if (field != null && field.toWrite) {
addFieldToDoc(entry.getValue(), key, field.boost, field.multiValued, doc);
}
}
diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java
new file mode 100644
index 00000000000..563e8f81555
--- /dev/null
+++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.dataimport;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.lang.reflect.Modifier;
+import java.sql.Blob;
+import java.sql.Clob;
+import java.util.Properties;
+
+/**
+ * This can be useful for users who have a DB field containing XML and wish to use a nested XPathEntityProcessor
+ *
+ * The datasource may be configured as follows
+ *
+ * &lt;dataSource name="f1" type="FieldReaderDataSource" /&gt;
+ *
+ * The entity which uses this datasource must keep the url value as the variable name url="field-name"
+ *
+ * The fieldname must be resolvable from VariableResolver
+ *
+ * This may be used with any EntityProcessor which uses a DataSource&lt;Reader&gt;, e.g. XPathEntityProcessor
+ *
+ * Supports String, BLOB, CLOB data types and there is an extra field (in the entity) 'encoding' for BLOB types
+ *
+ * @version $Id$
+ * @since 1.4
+ */
+public class FieldReaderDataSource extends DataSource {
+ private static final Logger LOG = LoggerFactory.getLogger(FieldReaderDataSource.class);
+ protected VariableResolver vr; // resolver taken from the Context in init(); used by getData() to look up dataField
+ protected String dataField; // value of the entity attribute "dataField": the variable name resolved in getData()
+ private String encoding; // value of the entity attribute "encoding"; only consulted when the resolved value is a Blob
+ /** Captures the VariableResolver and the entity attributes "dataField" and "encoding" from the context; initProps is not read. */
+ public void init(Context context, Properties initProps) {
+ vr = context.getVariableResolver();
+ dataField = context.getEntityAttribute("dataField");
+ encoding = context.getEntityAttribute("encoding");
+ /* nothing further to set up; initProps is unused */
+ }
+ /** Resolves dataField and returns a Reader over the value (String, Clob, Blob, or toString() of anything else); returns null when the value is null or the LOB cannot be read. The query argument is not used. */
+ public Reader getData(String query) {
+ Object o = vr.resolve(dataField);
+ if (o == null) return null; // the variable did not resolve to a value
+ if (o instanceof String) {
+ return new StringReader((String) o);
+ } else if (o instanceof Clob) {
+ Clob clob = (Clob) o;
+ try {
+ // getCharacterStream is looked up reflectively on the driver's concrete Clob class; most JDBC
+ // drivers declare it public, so check the modifier before deciding whether access must be forced
+ Method m = clob.getClass().getDeclaredMethod("getCharacterStream"); // NOTE(review): getDeclaredMethod does not see inherited methods; a driver class that only inherits it ends up in the catch below
+ if (Modifier.isPublic(m.getModifiers())) {
+ return (Reader) m.invoke(clob);
+ } else {
+ // not public: make the method accessible, then invoke it
+ m.setAccessible(true);
+ return (Reader) m.invoke(clob);
+ }
+ } catch (Exception e) { // NOTE(review): the exception itself is not logged or rethrown; only a fixed INFO message is emitted
+ LOG.info("Unable to get data from CLOB");
+ return null;
+
+ }
+
+ } else if (o instanceof Blob) {
+ Blob blob = (Blob) o;
+ try {
+ // getBinaryStream is looked up reflectively on the driver's concrete Blob class; most JDBC
+ // drivers declare it public, so check the modifier before deciding whether access must be forced
+ Method m = blob.getClass().getDeclaredMethod("getBinaryStream"); // NOTE(review): same inherited-method caveat as for the Clob lookup
+ if (Modifier.isPublic(m.getModifiers())) {
+ return getReader(m, blob);
+ } else {
+ // not public: make the method accessible, then invoke it
+ m.setAccessible(true);
+ return getReader(m, blob);
+ }
+ } catch (Exception e) { // NOTE(review): the exception itself is not logged or rethrown; only a fixed INFO message is emitted
+ LOG.info("Unable to get data from BLOB");
+ return null;
+
+ }
+ } else {
+ return new StringReader(o.toString()); // any other type is read through its toString() form
+ }
+
+ }
+ /** Invokes m on blob to obtain an InputStream and wraps it in an InputStreamReader, using the configured encoding when one is set. */
+ private Reader getReader(Method m, Blob blob)
+ throws IllegalAccessException, InvocationTargetException, UnsupportedEncodingException {
+ InputStream is = (InputStream) m.invoke(blob);
+ if (encoding == null) {
+ return (new InputStreamReader(is)); // no encoding attribute: InputStreamReader falls back to the platform default charset
+ } else {
+ return (new InputStreamReader(is, encoding));
+ }
+ }
+ /** Nothing to release: this data source holds no resources of its own. */
+ public void close() {
+
+ }
+}
diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java
index b4944ba6c8f..c4bbc948462 100644
--- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java
+++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java
@@ -67,6 +67,8 @@ public class TemplateString {
* @return the string with all variables replaced
*/
public String replaceTokens(String string, VariableResolver resolver) {
+ if (string == null)
+ return null;
TemplateString ts = cache.get(string);
if (ts == null) {
ts = new TemplateString(string);
diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java
index 8aee7b6fa6c..d0aa71c7850 100644
--- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java
+++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java
@@ -37,8 +37,8 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
/**
- * An implementation of EntityProcessor which uses a streaming xpath parser to extract values out of XML documents.
- * It is typically used in conjunction with HttpDataSource or FileDataSource.
+ * An implementation of EntityProcessor which uses a streaming xpath parser to extract values out of XML documents.
+ * It is typically used in conjunction with HttpDataSource or FileDataSource. Refer to http://wiki.apache.org/solr/DataImportHandler for more
* details.
*
@@ -131,9 +131,8 @@ public class XPathEntityProcessor extends EntityProcessorBase {
"Exception while reading xpaths for fields", e);
}
}
-
- List l = TemplateString.getVariables(context
- .getEntityAttribute(URL));
+ String url = context.getEntityAttribute(URL);
+ List l = url == null ? Collections.EMPTY_LIST : TemplateString.getVariables(url);
for (String s : l) {
if (s.startsWith(entityName + ".")) {
if (placeHolderVariables == null)
@@ -166,7 +165,6 @@ public class XPathEntityProcessor extends EntityProcessorBase {
if (pk == null || result.get(pk) != null)
return result;
}
-
}
@SuppressWarnings("unchecked")
diff --git a/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java b/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java
new file mode 100644
index 00000000000..c780b7d9e70
--- /dev/null
+++ b/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.dataimport;
+
+import junit.framework.Assert;
+import static org.apache.solr.handler.dataimport.AbstractDataImportHandlerTest.createMap;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Test for FieldReaderDataSource
+ *
+ * @version $Id$
+ * @see org.apache.solr.handler.dataimport.FieldReaderDataSource
+ * @since 1.4
+ */
+public class TestFieldReader {
+
+ @Test
+ public void simple() {
+ DataImporter di = new DataImporter();
+ di.loadAndInit(config);
+ TestDocBuilder.SolrWriterImpl sw = new TestDocBuilder.SolrWriterImpl();
+ DataImporter.RequestParams rp = new DataImporter.RequestParams(createMap("command", "full-import"));
+ List