From f598297f55fa60df9fdefa4f34574d485b600b20 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 11 Sep 2018 09:16:39 +0100 Subject: [PATCH] Add predicate_token_filter (#33431) This allows users to filter out tokens from a TokenStream using painless scripts, instead of having to write specialised Java code and packaging it up into a plugin. The commit also refactors the AnalysisPredicateScript.Token class so that it wraps and makes read-only an AttributeSource. --- docs/reference/analysis/tokenfilters.asciidoc | 2 + .../predicate-tokenfilter.asciidoc | 79 ++++++++++++++++ .../common/AnalysisPredicateScript.java | 56 ++++++++---- .../analysis/common/CommonAnalysisPlugin.java | 2 + .../PredicateTokenFilterScriptFactory.java | 73 +++++++++++++++ .../ScriptedConditionTokenFilterFactory.java | 47 ++++------ .../PredicateTokenScriptFilterTests.java | 89 +++++++++++++++++++ .../analysis-common/60_analysis_scripting.yml | 37 +++++++- 8 files changed, 341 insertions(+), 44 deletions(-) create mode 100644 docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index f531bc5d0e9..41bb9d38afb 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -37,6 +37,8 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[] include::tokenfilters/condition-tokenfilter.asciidoc[] +include::tokenfilters/predicate-tokenfilter.asciidoc[] + include::tokenfilters/stemmer-tokenfilter.asciidoc[] include::tokenfilters/stemmer-override-tokenfilter.asciidoc[] diff --git a/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc new file mode 100644 index 00000000000..bebf7bd80f2 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc @@ -0,0 +1,79 @@ +[[analysis-predicatefilter-tokenfilter]] +=== Predicate Token Filter Script + +The predicate_token_filter token filter takes a predicate script, and removes tokens that do +not match the predicate. + +[float] +=== Options +[horizontal] +script:: a predicate script that determines whether or not the current token will +be emitted. Note that only inline scripts are supported. + +[float] +=== Settings example + +You can set it up like: + +[source,js] +-------------------------------------------------- +PUT /condition_example +{ + "settings" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : [ "my_script_filter" ] + } + }, + "filter" : { + "my_script_filter" : { + "type" : "predicate_token_filter", + "script" : { + "source" : "token.getTerm().length() > 5" <1> + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +<1> This will emit tokens that are more than 5 characters long + +And test it like: + +[source,js] +-------------------------------------------------- +POST /condition_example/_analyze +{ + "analyzer" : "my_analyzer", + "text" : "What Flapdoodle" +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +And it'd respond: + +[source,js] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "Flapdoodle", <1> + "start_offset": 5, + "end_offset": 15, + "type": "", + "position": 1 <2> + } + ] +} +-------------------------------------------------- +// TESTRESPONSE + +<1> The token 'What' has been removed from the tokenstream because it does not +match the predicate. +<2> The position and offset values are unaffected by the removal of earlier tokens \ No newline at end of file diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java index 7de588a958c..3bda6f393bf 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java @@ -19,6 +19,13 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.AttributeSource; import org.elasticsearch.script.ScriptContext; /** @@ -30,21 +37,40 @@ public abstract class AnalysisPredicateScript { * Encapsulation of the state of the current token */ public static class Token { - public CharSequence term; - public int pos; - public int posInc; - public int posLen; - public int startOffset; - public int endOffset; - public String type; - public boolean isKeyword; + + private final CharTermAttribute termAtt; + private final PositionIncrementAttribute posIncAtt; + private final PositionLengthAttribute posLenAtt; + private final OffsetAttribute offsetAtt; + private final TypeAttribute typeAtt; + private final KeywordAttribute keywordAtt; + + // posInc is always 1 at the beginning of a tokenstream and the convention + // from the _analyze endpoint is that tokenstream positions are 0-based + private int pos = -1; + + /** + * Create a token exposing values from an AttributeSource + */ + public Token(AttributeSource source) { + this.termAtt = source.addAttribute(CharTermAttribute.class); + this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class); + this.posLenAtt = source.addAttribute(PositionLengthAttribute.class); + this.offsetAtt = source.addAttribute(OffsetAttribute.class); + this.typeAtt = source.addAttribute(TypeAttribute.class); + this.keywordAtt = source.addAttribute(KeywordAttribute.class); + } + + public void updatePosition() { + this.pos = this.pos + posIncAtt.getPositionIncrement(); + } public CharSequence getTerm() { - return term; + return termAtt; } public int getPositionIncrement() { - return posInc; + return posIncAtt.getPositionIncrement(); } public int getPosition() { @@ -52,23 +78,23 @@ public abstract class AnalysisPredicateScript { } public int getPositionLength() { - return posLen; + return posLenAtt.getPositionLength(); } public int getStartOffset() { - return startOffset; + return offsetAtt.startOffset(); } public int getEndOffset() { - return endOffset; + return offsetAtt.endOffset(); } public String getType() { - return type; + return typeAtt.type(); } public boolean isKeyword() { - return isKeyword; + return keywordAtt.isKeyword(); } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 75ebade0b12..175935258ad 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -264,6 +264,8 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); filters.put("porter_stem", PorterStemTokenFilterFactory::new); + filters.put("predicate_token_filter", + requiresAnalysisSettings((i, e, n, s) -> new PredicateTokenFilterScriptFactory(i, n, s, scriptService.get()))); filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new); filters.put("reverse", ReverseTokenFilterFactory::new); filters.put("russian_stem", RussianStemTokenFilterFactory::new); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java new file mode 100644 index 00000000000..84f4bb48706 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java @@ -0,0 +1,73 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.FilteringTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.script.Script; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.script.ScriptType; + +import java.io.IOException; + +/** + * A factory for creating FilteringTokenFilters that determine whether or not to + * accept their underlying token by consulting a script + */ +public class PredicateTokenFilterScriptFactory extends AbstractTokenFilterFactory { + + private final AnalysisPredicateScript.Factory factory; + + public PredicateTokenFilterScriptFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) { + super(indexSettings, name, settings); + Settings scriptSettings = settings.getAsSettings("script"); + Script script = Script.parse(scriptSettings); + if (script.getType() != ScriptType.INLINE) { + throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]"); + } + this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ScriptFilteringTokenFilter(tokenStream, factory.newInstance()); + } + + private static class ScriptFilteringTokenFilter extends FilteringTokenFilter { + + final AnalysisPredicateScript script; + final AnalysisPredicateScript.Token token; + + ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) { + super(in); + this.script = script; + this.token = new AnalysisPredicateScript.Token(this); + } + + @Override + protected boolean accept() throws IOException { + token.updatePosition(); + return script.execute(token); + } + } +} diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java index cf7fd5b047a..56f60bb874a 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java @@ -21,12 +21,6 @@ package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; @@ -36,6 +30,7 @@ import org.elasticsearch.script.Script; import org.elasticsearch.script.ScriptService; import org.elasticsearch.script.ScriptType; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -76,30 +71,26 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact } return in; }; - AnalysisPredicateScript script = factory.newInstance(); - final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token(); - return new ConditionalTokenFilter(tokenStream, filter) { + return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance()); + } - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); - OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter { - @Override - protected boolean shouldFilter() { - token.term = termAtt; - token.posInc = posIncAtt.getPositionIncrement(); - token.pos += token.posInc; - token.posLen = posLenAtt.getPositionLength(); - token.startOffset = offsetAtt.startOffset(); - token.endOffset = offsetAtt.endOffset(); - token.type = typeAtt.type(); - token.isKeyword = keywordAtt.isKeyword(); - return script.execute(token); - } - }; + private final AnalysisPredicateScript script; + private final AnalysisPredicateScript.Token token; + + ScriptedConditionTokenFilter(TokenStream input, Function inputFactory, + AnalysisPredicateScript script) { + super(input, inputFactory); + this.script = script; + this.token = new AnalysisPredicateScript.Token(this); + } + + @Override + protected boolean shouldFilter() throws IOException { + token.updatePosition(); + return script.execute(token); + } } @Override diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java new file mode 100644 index 00000000000..18afbdcecb3 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java @@ -0,0 +1,89 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.TestEnvironment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.script.Script; +import org.elasticsearch.script.ScriptContext; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.test.ESTokenStreamTestCase; +import org.elasticsearch.test.IndexSettingsModule; + +import java.io.IOException; +import java.util.Collections; + +public class PredicateTokenScriptFilterTests extends ESTokenStreamTestCase { + + public void testSimpleFilter() throws IOException { + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + Settings indexSettings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("index.analysis.filter.f.type", "predicate_token_filter") + .put("index.analysis.filter.f.script.source", "token.getTerm().length() > 5") + .put("index.analysis.analyzer.myAnalyzer.type", "custom") + .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.myAnalyzer.filter", "f") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + + AnalysisPredicateScript.Factory factory = () -> new AnalysisPredicateScript() { + @Override + public boolean execute(Token token) { + return token.getTerm().length() > 5; + } + }; + + @SuppressWarnings("unchecked") + ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){ + @Override + public FactoryType compile(Script script, ScriptContext context) { + assertEquals(context, AnalysisPredicateScript.CONTEXT); + assertEquals(new Script("token.getTerm().length() > 5"), script); + return (FactoryType) factory; + } + }; + + CommonAnalysisPlugin plugin = new CommonAnalysisPlugin(); + plugin.createComponents(null, null, null, null, scriptService, null, null, null, null); + AnalysisModule module + = new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin)); + + IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings); + + try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) { + assertNotNull(analyzer); + assertAnalyzesTo(analyzer, "Vorsprung Durch Technik", new String[]{ + "Vorsprung", "Technik" + }); + } + + } + +} diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml index 4305e5db0af..2015fe31fcc 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml @@ -28,9 +28,44 @@ - type: condition filter: [ "lowercase" ] script: - source: "token.position > 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)" + source: "token.position >= 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)" - length: { tokens: 3 } - match: { tokens.0.token: "Vorsprung" } - match: { tokens.1.token: "durch" } - match: { tokens.2.token: "technik" } + +--- +"script_filter": + - do: + indices.analyze: + body: + text: "Vorsprung Durch Technik" + tokenizer: "whitespace" + filter: + - type: predicate_token_filter + script: + source: "token.term.length() > 5" + + - length: { tokens: 2 } + - match: { tokens.0.token: "Vorsprung" } + - match: { tokens.1.token: "Technik" } + +--- +"script_filter_position": + - do: + indices.analyze: + body: + text: "a b c d e f g h" + tokenizer: "whitespace" + filter: + - type: predicate_token_filter + script: + source: "token.position >= 4" + + - length: { tokens: 4 } + - match: { tokens.0.token: "e" } + - match: { tokens.1.token: "f" } + - match: { tokens.2.token: "g" } + - match: { tokens.3.token: "h" } +