Add HTML strip processor (#41888)
This processor uses the Lucene HTMLStripCharFilter class to strip HTML from a field. It complements the existing char filter by making it possible to store the stripped version as well. Note that the character filter replaces tags with a newline, so the produced text will look slightly different from the incoming HTML with regard to newlines.
This commit is contained in:
parent
37771502ae
commit
8e33a5292a
|
@ -864,6 +864,7 @@ include::processors/foreach.asciidoc[]
|
||||||
include::processors/geoip.asciidoc[]
|
include::processors/geoip.asciidoc[]
|
||||||
include::processors/grok.asciidoc[]
|
include::processors/grok.asciidoc[]
|
||||||
include::processors/gsub.asciidoc[]
|
include::processors/gsub.asciidoc[]
|
||||||
|
include::processors/html_strip.asciidoc[]
|
||||||
include::processors/join.asciidoc[]
|
include::processors/join.asciidoc[]
|
||||||
include::processors/json.asciidoc[]
|
include::processors/json.asciidoc[]
|
||||||
include::processors/kv.asciidoc[]
|
include::processors/kv.asciidoc[]
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
[[htmlstrip-processor]]
|
||||||
|
=== HTML Strip Processor
|
||||||
|
Removes HTML tags from the field.
|
||||||
|
|
||||||
|
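NOTE: HTML tags are not all replaced the same way: for example, `<p>` is replaced with a `\n` character, whereas inline tags such as `<b>` are removed without leaving a newline.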
|
||||||
|
|
||||||
|
[[htmlstrip-options]]
|
||||||
|
.HTML Strip Options
|
||||||
|
[options="header"]
|
||||||
|
|======
|
||||||
|
| Name | Required | Default | Description
|
||||||
|
| `field` | yes | - | The string-valued field to remove HTML tags from
|
||||||
|
| `target_field` | no | `field` | The field to assign the stripped value to; by default, `field` is updated in-place
|
||||||
|
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
|
||||||
|
include::common-options.asciidoc[]
|
||||||
|
|======
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
{
|
||||||
|
"html_strip": {
|
||||||
|
"field": "foo"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
// NOTCONSOLE
|
|
@ -0,0 +1,76 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.ingest.common;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
|
||||||
|
import org.elasticsearch.ElasticsearchException;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public final class HtmlStripProcessor extends AbstractStringProcessor<String> {
|
||||||
|
|
||||||
|
public static final String TYPE = "html_strip";
|
||||||
|
|
||||||
|
HtmlStripProcessor(String tag, String field, boolean ignoreMissing, String targetField) {
|
||||||
|
super(tag, field, ignoreMissing, targetField);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected String process(String value) {
|
||||||
|
// shortcut, no need to create a string builder and go through each char
|
||||||
|
if (value.contains("<") == false || value.contains(">") == false) {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(value));
|
||||||
|
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
int ch;
|
||||||
|
try {
|
||||||
|
while ((ch = filter.read()) != -1) {
|
||||||
|
builder.append((char)ch);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ElasticsearchException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getType() {
|
||||||
|
return TYPE;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final class Factory extends AbstractStringProcessor.Factory {
|
||||||
|
|
||||||
|
public Factory() {
|
||||||
|
super(TYPE);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected HtmlStripProcessor newProcessor(String tag, Map<String, Object> config, String field,
|
||||||
|
boolean ignoreMissing, String targetField) {
|
||||||
|
return new HtmlStripProcessor(tag, field, ignoreMissing, targetField);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -87,6 +87,7 @@ public class IngestCommonPlugin extends Plugin implements ActionPlugin, IngestPl
|
||||||
processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService));
|
processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService));
|
||||||
processors.put(DissectProcessor.TYPE, new DissectProcessor.Factory());
|
processors.put(DissectProcessor.TYPE, new DissectProcessor.Factory());
|
||||||
processors.put(DropProcessor.TYPE, new DropProcessor.Factory());
|
processors.put(DropProcessor.TYPE, new DropProcessor.Factory());
|
||||||
|
processors.put(HtmlStripProcessor.TYPE, new HtmlStripProcessor.Factory());
|
||||||
return Collections.unmodifiableMap(processors);
|
return Collections.unmodifiableMap(processors);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.ingest.common;
|
||||||
|
|
||||||
|
public class HtmlStripProcessorFactoryTests extends AbstractStringProcessorFactoryTestCase {
|
||||||
|
@Override
|
||||||
|
protected AbstractStringProcessor.Factory newFactory() {
|
||||||
|
return new HtmlStripProcessor.Factory();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.ingest.common;
|
||||||
|
|
||||||
|
public class HtmlStripProcessorTests extends AbstractStringProcessorTestCase<String> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected AbstractStringProcessor<String> newProcessor(String field, boolean ignoreMissing, String targetField) {
|
||||||
|
return new HtmlStripProcessor(randomAlphaOfLength(10), field, ignoreMissing, targetField);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected String modifyInput(String input) {
|
||||||
|
return "<p><b>test</b>" + input + "<p><b>test</b>";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected String expectedResult(String input) {
|
||||||
|
return "\ntest" + input + "\ntest";
|
||||||
|
}
|
||||||
|
}
|
|
@ -23,6 +23,7 @@
|
||||||
- contains: { nodes.$master.ingest.processors: { type: foreach } }
|
- contains: { nodes.$master.ingest.processors: { type: foreach } }
|
||||||
- contains: { nodes.$master.ingest.processors: { type: grok } }
|
- contains: { nodes.$master.ingest.processors: { type: grok } }
|
||||||
- contains: { nodes.$master.ingest.processors: { type: gsub } }
|
- contains: { nodes.$master.ingest.processors: { type: gsub } }
|
||||||
|
- contains: { nodes.$master.ingest.processors: { type: html_strip } }
|
||||||
- contains: { nodes.$master.ingest.processors: { type: join } }
|
- contains: { nodes.$master.ingest.processors: { type: join } }
|
||||||
- contains: { nodes.$master.ingest.processors: { type: json } }
|
- contains: { nodes.$master.ingest.processors: { type: json } }
|
||||||
- contains: { nodes.$master.ingest.processors: { type: kv } }
|
- contains: { nodes.$master.ingest.processors: { type: kv } }
|
||||||
|
|
|
@ -76,6 +76,11 @@ teardown:
|
||||||
"pattern" : "-",
|
"pattern" : "-",
|
||||||
"replacement" : "."
|
"replacement" : "."
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"html_strip" : {
|
||||||
|
"field" : "field_to_html_strip"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -96,7 +101,8 @@ teardown:
|
||||||
"field_to_split": "127-0-0-1",
|
"field_to_split": "127-0-0-1",
|
||||||
"field_to_join": ["127","0","0","1"],
|
"field_to_join": ["127","0","0","1"],
|
||||||
"field_to_convert": ["127","0","0","1"],
|
"field_to_convert": ["127","0","0","1"],
|
||||||
"field_to_gsub": "127-0-0-1"
|
"field_to_gsub": "127-0-0-1",
|
||||||
|
"field_to_html_strip": "<p>this <title>is</title> a <b>test</b>"
|
||||||
}
|
}
|
||||||
|
|
||||||
- do:
|
- do:
|
||||||
|
@ -114,6 +120,7 @@ teardown:
|
||||||
- match: { _source.field_to_join: "127-0-0-1" }
|
- match: { _source.field_to_join: "127-0-0-1" }
|
||||||
- match: { _source.field_to_convert: [127,0,0,1] }
|
- match: { _source.field_to_convert: [127,0,0,1] }
|
||||||
- match: { _source.field_to_gsub: "127.0.0.1" }
|
- match: { _source.field_to_gsub: "127.0.0.1" }
|
||||||
|
- match: { _source.field_to_html_strip: "\nthis \nis\n a test" }
|
||||||
|
|
||||||
---
|
---
|
||||||
"Test metadata":
|
"Test metadata":
|
||||||
|
|
Loading…
Reference in New Issue