From dff17dc8a74440f189ccf95c7e19922a0828df46 Mon Sep 17 00:00:00 2001 From: Shalin Shekhar Mangar Date: Wed, 12 Nov 2008 10:29:49 +0000 Subject: [PATCH] SOLR-833 -- A DataSource to read data from a field as a reader. This can be used, for example, to read XMLs residing as CLOBs or BLOBs in databases. git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@713343 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/dataimporthandler/CHANGES.txt | 4 + .../solr/handler/dataimport/DataImporter.java | 53 +++++--- .../solr/handler/dataimport/DocBuilder.java | 4 +- .../dataimport/FieldReaderDataSource.java | 122 ++++++++++++++++++ .../handler/dataimport/TemplateString.java | 2 + .../dataimport/XPathEntityProcessor.java | 10 +- .../handler/dataimport/TestFieldReader.java | 66 ++++++++++ 7 files changed, 232 insertions(+), 29 deletions(-) create mode 100644 contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java create mode 100644 contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java diff --git a/contrib/dataimporthandler/CHANGES.txt b/contrib/dataimporthandler/CHANGES.txt index b25b12c26bf..1687260c739 100644 --- a/contrib/dataimporthandler/CHANGES.txt +++ b/contrib/dataimporthandler/CHANGES.txt @@ -28,6 +28,10 @@ New Features 3. SOLR-842: Better error handling in DataImportHandler with options to abort, skip and continue imports. (Noble Paul, shalin) +4. SOLR-833: A DataSource to read data from a field as a reader. This can be used, for example, to read XMLs + residing as CLOBs or BLOBs in databases. + (Noble Paul via shalin) + Optimizations ---------------------- diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java index 0d4cfd0be29..56f9d623d74 100644 --- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java +++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImporter.java @@ -64,7 +64,7 @@ public class DataImporter { private Properties store = new Properties(); - private Map dataSourceProps; + private Map dataSourceProps = new HashMap(); private IndexSchema schema; @@ -159,6 +159,15 @@ public class DataImporter { } + /**Used by tests + */ + void loadAndInit(String configStr){ + loadDataConfig(configStr); + Map fields = new HashMap(); + DataConfig.Entity e = getConfig().documents.get(0).entities.get(0); + initEntity(e, fields, false); + } + void loadDataConfig(String configFile) { try { @@ -193,27 +202,29 @@ public class DataImporter { if (e.fields != null) { for (DataConfig.Field f : e.fields) { f.nameOrColName = f.getName(); - SchemaField schemaField = schema.getFieldOrNull(f.getName()); - if (schemaField != null) { - f.multiValued = schemaField.multiValued(); - f.allAttributes.put(MULTI_VALUED, Boolean.toString(schemaField - .multiValued())); - f.allAttributes.put(TYPE, schemaField.getType().getTypeName()); - f.allAttributes.put("indexed", Boolean - .toString(schemaField.indexed())); - f.allAttributes.put("stored", Boolean.toString(schemaField.stored())); - f.allAttributes.put("defaultValue", schemaField.getDefaultValue()); - } else { + if (schema != null) { + SchemaField schemaField = schema.getFieldOrNull(f.getName()); + if (schemaField != null) { + f.multiValued = schemaField.multiValued(); + f.allAttributes.put(MULTI_VALUED, Boolean.toString(schemaField + .multiValued())); + f.allAttributes.put(TYPE, schemaField.getType().getTypeName()); + f.allAttributes.put("indexed", Boolean + .toString(schemaField.indexed())); + f.allAttributes.put("stored", Boolean.toString(schemaField.stored())); + f.allAttributes.put("defaultValue", schemaField.getDefaultValue()); + } else { - try { - f.allAttributes.put(TYPE, schema.getDynamicFieldType(f.getName()) - .getTypeName()); - f.allAttributes.put(MULTI_VALUED, "true"); - f.multiValued = true; - } catch (RuntimeException e2) { - LOG.info("Field in data-config.xml - " + f.getName() - + " not found in schema.xml"); - f.toWrite = false; + try { + f.allAttributes.put(TYPE, schema.getDynamicFieldType(f.getName()) + .getTypeName()); + f.allAttributes.put(MULTI_VALUED, "true"); + f.multiValued = true; + } catch (RuntimeException e2) { + LOG.info("Field in data-config.xml - " + f.getName() + + " not found in schema.xml"); + f.toWrite = false; + } } } fields.put(f.getName(), f); diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java index afea626f0b7..9d4bb78ab17 100644 --- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java +++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java @@ -370,7 +370,7 @@ public class DocBuilder { continue; } DataConfig.Field field = entity.colNameVsField.get(key); - if (field == null) { + if (field == null && dataImporter.getSchema() != null) { // This can be a dynamic field or a field which does not have an entry in data-config ( an implicit field) SchemaField sf = dataImporter.getSchema().getFieldOrNull(key); if (sf == null) { @@ -381,7 +381,7 @@ public class DocBuilder { } //else do nothing. if we add it it may fail } else { - if (field.toWrite) { + if (field != null && field.toWrite) { addFieldToDoc(entry.getValue(), key, field.boost, field.multiValued, doc); } } diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java new file mode 100644 index 00000000000..563e8f81555 --- /dev/null +++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/FieldReaderDataSource.java @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.dataimport; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; +import java.sql.Blob; +import java.sql.Clob; +import java.util.Properties; + +/** + * This can be useful for users who has a DB field containing xml and wish to use a nested XPathEntityProcessor + *

+ * The datasouce may be configured as follows + *

+ * + *

+ * The enity which uses this datasource must keep the url value as the varaible name url="field-name" + *

+ * The fieldname must be resolvable from VariableResolver + *

+ * This may be used with any EntityProcessor which uses a DataSource eg:XPathEntityProcessor + *

+ * Supports String, BLOB, CLOB data types and there is an extra field (in the entity) 'encoding' for BLOB types + * + * @version $Id$ + * @since 1.4 + */ +public class FieldReaderDataSource extends DataSource { + private static final Logger LOG = LoggerFactory.getLogger(FieldReaderDataSource.class); + protected VariableResolver vr; + protected String dataField; + private String encoding; + + public void init(Context context, Properties initProps) { + vr = context.getVariableResolver(); + dataField = context.getEntityAttribute("dataField"); + encoding = context.getEntityAttribute("encoding"); + /*no op*/ + } + + public Reader getData(String query) { + Object o = vr.resolve(dataField); + if (o == null) return null; + if (o instanceof String) { + return new StringReader((String) o); + } else if (o instanceof Clob) { + Clob clob = (Clob) o; + try { + //Most of the JDBC drivers have getCharacterStream defined as public + // so let us just check it + Method m = clob.getClass().getDeclaredMethod("getCharacterStream"); + if (Modifier.isPublic(m.getModifiers())) { + return (Reader) m.invoke(clob); + } else { + // force invoke + m.setAccessible(true); + return (Reader) m.invoke(clob); + } + } catch (Exception e) { + LOG.info("Unable to get data from CLOB"); + return null; + + } + + } else if (o instanceof Blob) { + Blob blob = (Blob) o; + try { + //Most of the JDBC drivers have getBinaryStream defined as public + // so let us just check it + Method m = blob.getClass().getDeclaredMethod("getBinaryStream"); + if (Modifier.isPublic(m.getModifiers())) { + return getReader(m, blob); + } else { + // force invoke + m.setAccessible(true); + return getReader(m, blob); + } + } catch (Exception e) { + LOG.info("Unable to get data from BLOB"); + return null; + + } + } else { + return new StringReader(o.toString()); + } + + } + + private Reader getReader(Method m, Blob blob) + throws IllegalAccessException, InvocationTargetException, UnsupportedEncodingException { + InputStream is = (InputStream) m.invoke(blob); + if (encoding == null) { + return (new InputStreamReader(is)); + } else { + return (new InputStreamReader(is, encoding)); + } + } + + public void close() { + + } +} diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java index b4944ba6c8f..c4bbc948462 100644 --- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java +++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/TemplateString.java @@ -67,6 +67,8 @@ public class TemplateString { * @return the string with all variables replaced */ public String replaceTokens(String string, VariableResolver resolver) { + if (string == null) + return null; TemplateString ts = cache.get(string); if (ts == null) { ts = new TemplateString(string); diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java index 8aee7b6fa6c..d0aa71c7850 100644 --- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java +++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java @@ -37,8 +37,8 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; /** - *

An implementation of EntityProcessor which uses a streaming xpath parser to extract values out of XML documents. - * It is typically used in conjunction with HttpDataSource or FileDataSource.

Refer to An implementation of EntityProcessor which uses a streaming xpath parser to extract values out of XML documents. + * It is typically used in conjunction with HttpDataSource or FileDataSource.

Refer to http://wiki.apache.org/solr/DataImportHandler for more * details.

*

@@ -131,9 +131,8 @@ public class XPathEntityProcessor extends EntityProcessorBase { "Exception while reading xpaths for fields", e); } } - - List l = TemplateString.getVariables(context - .getEntityAttribute(URL)); + String url = context.getEntityAttribute(URL); + List l = url == null ? Collections.EMPTY_LIST : TemplateString.getVariables(url); for (String s : l) { if (s.startsWith(entityName + ".")) { if (placeHolderVariables == null) @@ -166,7 +165,6 @@ public class XPathEntityProcessor extends EntityProcessorBase { if (pk == null || result.get(pk) != null) return result; } - } @SuppressWarnings("unchecked") diff --git a/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java b/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java new file mode 100644 index 00000000000..c780b7d9e70 --- /dev/null +++ b/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestFieldReader.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.dataimport; + +import junit.framework.Assert; +import static org.apache.solr.handler.dataimport.AbstractDataImportHandlerTest.createMap; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Test for FieldReaderDataSource + * + * @version $Id$ + * @see org.apache.solr.handler.dataimport.FieldReaderDataSource + * @since 1.4 + */ +public class TestFieldReader { + + @Test + public void simple() { + DataImporter di = new DataImporter(); + di.loadAndInit(config); + TestDocBuilder.SolrWriterImpl sw = new TestDocBuilder.SolrWriterImpl(); + DataImporter.RequestParams rp = new DataImporter.RequestParams(createMap("command", "full-import")); + List> l = new ArrayList>(); + l.add(createMap("xml", xml)); + MockDataSource.setIterator("select * from a", l.iterator()); + di.runCmd(rp, sw, new HashMap()); + Assert.assertEquals(sw.docs.get(0).getFieldValue("y"), "Hello"); + MockDataSource.clearCache(); + } + + String config = "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + ""; + + String xml = "\n" + + " Hello\n" + + ""; +}