Add HTML strip processor (#41888)

This processor uses the Lucene HTMLStripCharFilter class to remove HTML
markup from a field. It complements the existing char filter, so that it
is possible to store the stripped version as well.

Note that the character filter replaces tags with a newline, so the
produced text will look slightly different from the incoming HTML
with regard to newlines.
This commit is contained in:
Alexander Reelsen 2019-05-09 12:59:45 +02:00
parent 37771502ae
commit 8e33a5292a
8 changed files with 178 additions and 1 deletions

View File

@ -864,6 +864,7 @@ include::processors/foreach.asciidoc[]
include::processors/geoip.asciidoc[] include::processors/geoip.asciidoc[]
include::processors/grok.asciidoc[] include::processors/grok.asciidoc[]
include::processors/gsub.asciidoc[] include::processors/gsub.asciidoc[]
include::processors/html_strip.asciidoc[]
include::processors/join.asciidoc[] include::processors/join.asciidoc[]
include::processors/json.asciidoc[] include::processors/json.asciidoc[]
include::processors/kv.asciidoc[] include::processors/kv.asciidoc[]

View File

@ -0,0 +1,26 @@
[[htmlstrip-processor]]
=== HTML Strip Processor
Removes HTML tags from the field.
NOTE: Tags such as `<p>` are replaced with a `\n` character, while inline tags such as `<b>` are removed without a replacement.
[[htmlstrip-options]]
.HTML Strip Options
[options="header"]
|======
| Name | Required | Default | Description
| `field` | yes | - | The string-valued field to remove HTML tags from
| `target_field` | no | `field` | The field to assign the value to, by default `field` is updated in-place
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
include::common-options.asciidoc[]
|======
[source,js]
--------------------------------------------------
{
"html_strip": {
"field": "foo"
}
}
--------------------------------------------------
// NOTCONSOLE

View File

@ -0,0 +1,76 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.ingest.common;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.elasticsearch.ElasticsearchException;
import java.io.IOException;
import java.io.StringReader;
import java.util.Map;
/**
 * Ingest processor that strips HTML from a string value by reading it through
 * Lucene's {@link HTMLStripCharFilter}.
 */
public final class HtmlStripProcessor extends AbstractStringProcessor<String> {

    /** Processor type name used to register and reference this processor. */
    public static final String TYPE = "html_strip";

    HtmlStripProcessor(String tag, String field, boolean ignoreMissing, String targetField) {
        super(tag, field, ignoreMissing, targetField);
    }

    /**
     * Returns {@code value} with its HTML stripped.
     *
     * @param value the string to strip
     * @return {@code value} itself when it lacks a {@code <} or a {@code >} character,
     *         otherwise everything the char filter emits for it
     * @throws ElasticsearchException if reading from or closing the filter fails
     */
    @Override
    protected String process(String value) {
        // shortcut, no need to create a string builder and go through each char
        if (value.contains("<") == false || value.contains(">") == false) {
            return value;
        }

        // The input length is only a capacity hint; the builder still grows on demand.
        StringBuilder builder = new StringBuilder(value.length());
        // try-with-resources closes the filter (and the reader it wraps) on every path;
        // an IOException raised by close() is wrapped by the same catch block as read().
        try (HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(value))) {
            int ch;
            while ((ch = filter.read()) != -1) {
                builder.append((char) ch);
            }
        } catch (IOException e) {
            throw new ElasticsearchException(e);
        }
        return builder.toString();
    }

    @Override
    public String getType() {
        return TYPE;
    }

    /** Factory that creates {@link HtmlStripProcessor} instances for the {@link #TYPE} processor type. */
    public static final class Factory extends AbstractStringProcessor.Factory {

        public Factory() {
            super(TYPE);
        }

        @Override
        protected HtmlStripProcessor newProcessor(String tag, Map<String, Object> config, String field,
                                                  boolean ignoreMissing, String targetField) {
            return new HtmlStripProcessor(tag, field, ignoreMissing, targetField);
        }
    }
}

View File

@ -87,6 +87,7 @@ public class IngestCommonPlugin extends Plugin implements ActionPlugin, IngestPl
processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService)); processors.put(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService));
processors.put(DissectProcessor.TYPE, new DissectProcessor.Factory()); processors.put(DissectProcessor.TYPE, new DissectProcessor.Factory());
processors.put(DropProcessor.TYPE, new DropProcessor.Factory()); processors.put(DropProcessor.TYPE, new DropProcessor.Factory());
processors.put(HtmlStripProcessor.TYPE, new HtmlStripProcessor.Factory());
return Collections.unmodifiableMap(processors); return Collections.unmodifiableMap(processors);
} }

View File

@ -0,0 +1,27 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.ingest.common;
/**
 * Plugs {@link HtmlStripProcessor.Factory} into {@link AbstractStringProcessorFactoryTestCase}.
 */
public class HtmlStripProcessorFactoryTests extends AbstractStringProcessorFactoryTestCase {

    /** Supplies a fresh HTML strip processor factory as the factory under test. */
    @Override
    protected AbstractStringProcessor.Factory newFactory() {
        final AbstractStringProcessor.Factory htmlStripFactory = new HtmlStripProcessor.Factory();
        return htmlStripFactory;
    }
}

View File

@ -0,0 +1,38 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.ingest.common;
/**
 * Plugs {@link HtmlStripProcessor} into {@link AbstractStringProcessorTestCase}, wrapping each
 * input in an HTML fragment and expecting that fragment back in its stripped form.
 */
public class HtmlStripProcessorTests extends AbstractStringProcessorTestCase<String> {

    /** HTML fragment placed on both sides of the input. */
    private static final String HTML_AFFIX = "<p><b>test</b>";

    /** What {@link #HTML_AFFIX} is expected to look like after processing. */
    private static final String STRIPPED_AFFIX = "\ntest";

    /** Builds the processor under test with a random ten-letter tag. */
    @Override
    protected AbstractStringProcessor<String> newProcessor(String field, boolean ignoreMissing, String targetField) {
        final String processorTag = randomAlphaOfLength(10);
        return new HtmlStripProcessor(processorTag, field, ignoreMissing, targetField);
    }

    /** Surrounds the input with the HTML fragment. */
    @Override
    protected String modifyInput(String input) {
        final String wrapped = HTML_AFFIX + input + HTML_AFFIX;
        return wrapped;
    }

    /** The input surrounded by the stripped form of the fragment. */
    @Override
    protected String expectedResult(String input) {
        final String stripped = STRIPPED_AFFIX + input + STRIPPED_AFFIX;
        return stripped;
    }
}

View File

@ -23,6 +23,7 @@
- contains: { nodes.$master.ingest.processors: { type: foreach } } - contains: { nodes.$master.ingest.processors: { type: foreach } }
- contains: { nodes.$master.ingest.processors: { type: grok } } - contains: { nodes.$master.ingest.processors: { type: grok } }
- contains: { nodes.$master.ingest.processors: { type: gsub } } - contains: { nodes.$master.ingest.processors: { type: gsub } }
- contains: { nodes.$master.ingest.processors: { type: html_strip } }
- contains: { nodes.$master.ingest.processors: { type: join } } - contains: { nodes.$master.ingest.processors: { type: join } }
- contains: { nodes.$master.ingest.processors: { type: json } } - contains: { nodes.$master.ingest.processors: { type: json } }
- contains: { nodes.$master.ingest.processors: { type: kv } } - contains: { nodes.$master.ingest.processors: { type: kv } }

View File

@ -76,6 +76,11 @@ teardown:
"pattern" : "-", "pattern" : "-",
"replacement" : "." "replacement" : "."
} }
},
{
"html_strip" : {
"field" : "field_to_html_strip"
}
} }
] ]
} }
@ -96,7 +101,8 @@ teardown:
"field_to_split": "127-0-0-1", "field_to_split": "127-0-0-1",
"field_to_join": ["127","0","0","1"], "field_to_join": ["127","0","0","1"],
"field_to_convert": ["127","0","0","1"], "field_to_convert": ["127","0","0","1"],
"field_to_gsub": "127-0-0-1" "field_to_gsub": "127-0-0-1",
"field_to_html_strip": "<p>this <title>is</title> a <b>test</b>"
} }
- do: - do:
@ -114,6 +120,7 @@ teardown:
- match: { _source.field_to_join: "127-0-0-1" } - match: { _source.field_to_join: "127-0-0-1" }
- match: { _source.field_to_convert: [127,0,0,1] } - match: { _source.field_to_convert: [127,0,0,1] }
- match: { _source.field_to_gsub: "127.0.0.1" } - match: { _source.field_to_gsub: "127.0.0.1" }
- match: { _source.field_to_html_strip: "\nthis \nis\n a test" }
--- ---
"Test metadata": "Test metadata":