Add HTML strip processor (#41888)
This processor uses the Lucene HTMLStripCharFilter class to remove HTML entities from a field. It complements the existing char filter by making it possible to store the stripped version as well. Note that the character filter replaces tags with a newline, so the produced text will look slightly different from the incoming HTML with regard to newlines.
This commit is contained in:
parent
37771502ae
commit
8e33a5292a
|
@ -864,6 +864,7 @@ include::processors/foreach.asciidoc[]
|
|||
include::processors/geoip.asciidoc[]
|
||||
include::processors/grok.asciidoc[]
|
||||
include::processors/gsub.asciidoc[]
|
||||
include::processors/html_strip.asciidoc[]
|
||||
include::processors/join.asciidoc[]
|
||||
include::processors/json.asciidoc[]
|
||||
include::processors/kv.asciidoc[]
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
[[htmlstrip-processor]]
|
||||
=== HTML Strip Processor
|
||||
Removes HTML tags from the field.
|
||||
|
||||
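NOTE: Not every HTML tag is replaced with a `\n` character. For example, `<p>this <title>is</title> a <b>test</b>` becomes `\nthis \nis\n a test`: the `<p>` and `<title>` tags are replaced with `\n`, while the `<b>` tags are removed without a replacement.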
|
||||
|
||||
[[htmlstrip-options]]
|
||||
.HTML Strip Options
|
||||
[options="header"]
|
||||
|======
|
||||
| Name | Required | Default | Description
|
||||
| `field` | yes | - | The string-valued field to remove HTML tags from
|
||||
| `target_field` | no | `field` | The field to assign the value to, by default `field` is updated in-place
|
||||
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
|
||||
include::common-options.asciidoc[]
|
||||
|======
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"html_strip": {
|
||||
"field": "foo"
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
// NOTCONSOLE
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.ingest.common;
|
||||
|
||||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Map;
|
||||
|
||||
public final class HtmlStripProcessor extends AbstractStringProcessor<String> {
|
||||
|
||||
public static final String TYPE = "html_strip";
|
||||
|
||||
HtmlStripProcessor(String tag, String field, boolean ignoreMissing, String targetField) {
|
||||
super(tag, field, ignoreMissing, targetField);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String process(String value) {
|
||||
// shortcut, no need to create a string builder and go through each char
|
||||
if (value.contains("<") == false || value.contains(">") == false) {
|
||||
return value;
|
||||
}
|
||||
|
||||
HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(value));
|
||||
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch;
|
||||
try {
|
||||
while ((ch = filter.read()) != -1) {
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new ElasticsearchException(e);
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getType() {
|
||||
return TYPE;
|
||||
}
|
||||
|
||||
public static final class Factory extends AbstractStringProcessor.Factory {
|
||||
|
||||
public Factory() {
|
||||
super(TYPE);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HtmlStripProcessor newProcessor(String tag, Map<String, Object> config, String field,
|
||||
boolean ignoreMissing, String targetField) {
|
||||
return new HtmlStripProcessor(tag, field, ignoreMissing, targetField);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -87,6 +87,7 @@ public class IngestCommonPlugin extends Plugin implements ActionPlugin, IngestPl
|
|||
processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService));
|
||||
processors.put(DissectProcessor.TYPE, new DissectProcessor.Factory());
|
||||
processors.put(DropProcessor.TYPE, new DropProcessor.Factory());
|
||||
processors.put(HtmlStripProcessor.TYPE, new HtmlStripProcessor.Factory());
|
||||
return Collections.unmodifiableMap(processors);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.ingest.common;
|
||||
|
||||
public class HtmlStripProcessorFactoryTests extends AbstractStringProcessorFactoryTestCase {
|
||||
@Override
|
||||
protected AbstractStringProcessor.Factory newFactory() {
|
||||
return new HtmlStripProcessor.Factory();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.ingest.common;
|
||||
|
||||
public class HtmlStripProcessorTests extends AbstractStringProcessorTestCase<String> {
|
||||
|
||||
@Override
|
||||
protected AbstractStringProcessor<String> newProcessor(String field, boolean ignoreMissing, String targetField) {
|
||||
return new HtmlStripProcessor(randomAlphaOfLength(10), field, ignoreMissing, targetField);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String modifyInput(String input) {
|
||||
return "<p><b>test</b>" + input + "<p><b>test</b>";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String expectedResult(String input) {
|
||||
return "\ntest" + input + "\ntest";
|
||||
}
|
||||
}
|
|
@ -23,6 +23,7 @@
|
|||
- contains: { nodes.$master.ingest.processors: { type: foreach } }
|
||||
- contains: { nodes.$master.ingest.processors: { type: grok } }
|
||||
- contains: { nodes.$master.ingest.processors: { type: gsub } }
|
||||
- contains: { nodes.$master.ingest.processors: { type: html_strip } }
|
||||
- contains: { nodes.$master.ingest.processors: { type: join } }
|
||||
- contains: { nodes.$master.ingest.processors: { type: json } }
|
||||
- contains: { nodes.$master.ingest.processors: { type: kv } }
|
||||
|
|
|
@ -76,6 +76,11 @@ teardown:
|
|||
"pattern" : "-",
|
||||
"replacement" : "."
|
||||
}
|
||||
},
|
||||
{
|
||||
"html_strip" : {
|
||||
"field" : "field_to_html_strip"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -96,7 +101,8 @@ teardown:
|
|||
"field_to_split": "127-0-0-1",
|
||||
"field_to_join": ["127","0","0","1"],
|
||||
"field_to_convert": ["127","0","0","1"],
|
||||
"field_to_gsub": "127-0-0-1"
|
||||
"field_to_gsub": "127-0-0-1",
|
||||
"field_to_html_strip": "<p>this <title>is</title> a <b>test</b>"
|
||||
}
|
||||
|
||||
- do:
|
||||
|
@ -114,6 +120,7 @@ teardown:
|
|||
- match: { _source.field_to_join: "127-0-0-1" }
|
||||
- match: { _source.field_to_convert: [127,0,0,1] }
|
||||
- match: { _source.field_to_gsub: "127.0.0.1" }
|
||||
- match: { _source.field_to_html_strip: "\nthis \nis\n a test" }
|
||||
|
||||
---
|
||||
"Test metadata":
|
||||
|
|
Loading…
Reference in New Issue