mirror of https://github.com/apache/lucene.git
SOLR-887 -- A Transformer to strip HTML tags.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@723410 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
44fae80e05
commit
90b387a92d
|
@ -32,6 +32,9 @@ New Features
|
|||
residing as CLOBs or BLOBs in databases.
|
||||
(Noble Paul via shalin)
|
||||
|
||||
5. SOLR-887: A Transformer to strip HTML tags.
|
||||
(Ahmed Hammad via shalin)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -0,0 +1,93 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.handler.dataimport;
|
||||
|
||||
import org.apache.solr.analysis.HTMLStripReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A Transformer implementation which strip off HTML tags using org.apache.solr.analysis.HTMLStripReader This is useful
|
||||
* in case you don't need this HTML anyway.
|
||||
*
|
||||
* @version $Id$
|
||||
* @see org.apache.solr.analysis.HTMLStripReader
|
||||
* @since solr 1.4
|
||||
*/
|
||||
public class HTMLStripTransformer extends Transformer {
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public Object transformRow(Map<String, Object> row, Context context) {
|
||||
List<Map<String, String>> fields = context.getAllEntityFields();
|
||||
for (Map<String, String> field : fields) {
|
||||
String col = field.get(DataImporter.COLUMN);
|
||||
String splitHTML = field.get(STRIP_HTML);
|
||||
if (!TRUE.equals(splitHTML))
|
||||
continue;
|
||||
Object tmpVal = row.get(col);
|
||||
if (tmpVal == null)
|
||||
continue;
|
||||
|
||||
if (tmpVal instanceof List) {
|
||||
List<String> inputs = (List<String>) tmpVal;
|
||||
List results = new ArrayList();
|
||||
for (String input : inputs) {
|
||||
Object o = stripHTML(input, col);
|
||||
if (o != null)
|
||||
results.add(o);
|
||||
}
|
||||
row.put(col, results);
|
||||
} else {
|
||||
String value = tmpVal.toString();
|
||||
Object o = stripHTML(value, col);
|
||||
if (o != null)
|
||||
row.put(col, o);
|
||||
}
|
||||
}
|
||||
return row;
|
||||
}
|
||||
|
||||
private Object stripHTML(String value, String column) {
|
||||
StringBuilder out = new StringBuilder();
|
||||
StringReader strReader = new StringReader(value);
|
||||
try {
|
||||
HTMLStripReader html = new HTMLStripReader(strReader);
|
||||
char[] cbuf = new char[1024 * 10];
|
||||
while (true) {
|
||||
int count = html.read(cbuf);
|
||||
if (count == -1)
|
||||
break; // end of stream mark is -1
|
||||
if (count > 0)
|
||||
out.append(cbuf, 0, count);
|
||||
}
|
||||
html.close();
|
||||
} catch (IOException e) {
|
||||
throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
|
||||
"Failed stripping HTML for column: " + column, e);
|
||||
}
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
public static final String STRIP_HTML = "stripHTML";
|
||||
|
||||
public static final String TRUE = "true";
|
||||
}
|
Loading…
Reference in New Issue