diff --git a/contrib/dataimporthandler/CHANGES.txt b/contrib/dataimporthandler/CHANGES.txt index 9e4cea22a02..9939c8f887e 100644 --- a/contrib/dataimporthandler/CHANGES.txt +++ b/contrib/dataimporthandler/CHANGES.txt @@ -32,6 +32,9 @@ New Features residing as CLOBs or BLOBs in databases. (Noble Paul via shalin) +5. SOLR-887: A Transformer to strip HTML tags. + (Ahmed Hammad via shalin) + Optimizations ---------------------- diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java new file mode 100644 index 00000000000..582b93c586b --- /dev/null +++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.dataimport; + +import org.apache.solr.analysis.HTMLStripReader; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * A Transformer implementation which strip off HTML tags using org.apache.solr.analysis.HTMLStripReader This is useful + * in case you don't need this HTML anyway. + * + * @version $Id$ + * @see org.apache.solr.analysis.HTMLStripReader + * @since solr 1.4 + */ +public class HTMLStripTransformer extends Transformer { + + @Override + @SuppressWarnings("unchecked") + public Object transformRow(Map row, Context context) { + List> fields = context.getAllEntityFields(); + for (Map field : fields) { + String col = field.get(DataImporter.COLUMN); + String splitHTML = field.get(STRIP_HTML); + if (!TRUE.equals(splitHTML)) + continue; + Object tmpVal = row.get(col); + if (tmpVal == null) + continue; + + if (tmpVal instanceof List) { + List inputs = (List) tmpVal; + List results = new ArrayList(); + for (String input : inputs) { + Object o = stripHTML(input, col); + if (o != null) + results.add(o); + } + row.put(col, results); + } else { + String value = tmpVal.toString(); + Object o = stripHTML(value, col); + if (o != null) + row.put(col, o); + } + } + return row; + } + + private Object stripHTML(String value, String column) { + StringBuilder out = new StringBuilder(); + StringReader strReader = new StringReader(value); + try { + HTMLStripReader html = new HTMLStripReader(strReader); + char[] cbuf = new char[1024 * 10]; + while (true) { + int count = html.read(cbuf); + if (count == -1) + break; // end of stream mark is -1 + if (count > 0) + out.append(cbuf, 0, count); + } + html.close(); + } catch (IOException e) { + throw new DataImportHandlerException(DataImportHandlerException.SEVERE, + "Failed stripping HTML for column: " + column, e); + } + return out.toString(); + } + + public static final String STRIP_HTML = "stripHTML"; + + public static final String TRUE = "true"; +}