SOLR-887 -- A Transformer to strip HTML tags.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@723410 13f79535-47bb-0310-9956-ffa450edef68
2008-12-04 19:50:43 +00:00 · 2008-12-04 19:50:43 +00:00 · 90b387a92d
parent 44fae80e05
commit 90b387a92d
2 changed files with 96 additions and 0 deletions
--- a/contrib/dataimporthandler/CHANGES.txt
+++ b/contrib/dataimporthandler/CHANGES.txt
@ -32,6 +32,9 @@ New Features
              residing as CLOBs or BLOBs in databases.
              (Noble Paul via shalin)

+5. SOLR-887:  A Transformer to strip HTML tags.
+              (Ahmed Hammad via shalin)
+
 Optimizations
 ----------------------

--- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
+++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.dataimport;
+
+import org.apache.solr.analysis.HTMLStripReader;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A Transformer implementation which strip off HTML tags using org.apache.solr.analysis.HTMLStripReader This is useful
+ * in case you don't need this HTML anyway.
+ *
+ * @version $Id$
+ * @see org.apache.solr.analysis.HTMLStripReader
+ * @since solr 1.4
+ */
+public class HTMLStripTransformer extends Transformer {
+
+  @Override
+  @SuppressWarnings("unchecked")
+  public Object transformRow(Map<String, Object> row, Context context) {
+    List<Map<String, String>> fields = context.getAllEntityFields();
+    for (Map<String, String> field : fields) {
+      String col = field.get(DataImporter.COLUMN);
+      String splitHTML = field.get(STRIP_HTML);
+      if (!TRUE.equals(splitHTML))
+        continue;
+      Object tmpVal = row.get(col);
+      if (tmpVal == null)
+        continue;
+
+      if (tmpVal instanceof List) {
+        List<String> inputs = (List<String>) tmpVal;
+        List results = new ArrayList();
+        for (String input : inputs) {
+          Object o = stripHTML(input, col);
+          if (o != null)
+            results.add(o);
+        }
+        row.put(col, results);
+      } else {
+        String value = tmpVal.toString();
+        Object o = stripHTML(value, col);
+        if (o != null)
+          row.put(col, o);
+      }
+    }
+    return row;
+  }
+
+  private Object stripHTML(String value, String column) {
+    StringBuilder out = new StringBuilder();
+    StringReader strReader = new StringReader(value);
+    try {
+      HTMLStripReader html = new HTMLStripReader(strReader);
+      char[] cbuf = new char[1024 * 10];
+      while (true) {
+        int count = html.read(cbuf);
+        if (count == -1)
+          break; // end of stream mark is -1
+        if (count > 0)
+          out.append(cbuf, 0, count);
+      }
+      html.close();
+    } catch (IOException e) {
+      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
+              "Failed stripping HTML for column: " + column, e);
+    }
+    return out.toString();
+  }
+
+  public static final String STRIP_HTML = "stripHTML";
+
+  public static final String TRUE = "true";
+}