Add predicate_token_filter (#33431)

This allows users to filter out tokens from a TokenStream using painless scripts, instead of having to write specialised Java code and package it up into a plugin. The commit also refactors the AnalysisPredicateScript.Token class so that it wraps an AttributeSource and exposes its values read-only.
2018-09-11 09:16:39 +01:00 · 2018-09-11 09:16:39 +01:00 · f598297f55
parent a55fa4fd6b
commit f598297f55
8 changed files with 341 additions and 44 deletions
--- a/docs/reference/analysis/tokenfilters.asciidoc
+++ b/docs/reference/analysis/tokenfilters.asciidoc
@ -37,6 +37,8 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[]

 include::tokenfilters/condition-tokenfilter.asciidoc[]

+include::tokenfilters/predicate-tokenfilter.asciidoc[]
+
 include::tokenfilters/stemmer-tokenfilter.asciidoc[]

 include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
--- a/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc
@ -0,0 +1,79 @@
+[[analysis-predicatefilter-tokenfilter]]
+=== Predicate Token Filter Script
+
+The `predicate_token_filter` token filter takes a predicate script and removes tokens that do
+not match the predicate.
+
+[float]
+=== Options
+[horizontal]
+script:: a predicate script that determines whether or not the current token will
+be emitted.  Note that only inline scripts are supported.
+
+[float]
+=== Settings example
+
+You can set it up like:
+
+[source,js]
+--------------------------------------------------
+PUT /condition_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_analyzer" : {
+                    "tokenizer" : "standard",
+                    "filter" : [ "my_script_filter" ]
+                }
+            },
+            "filter" : {
+                "my_script_filter" : {
+                    "type" : "predicate_token_filter",
+                    "script" : {
+                        "source" : "token.getTerm().length() > 5"  <1>
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> This will emit only tokens that are more than 5 characters long.
+
+And test it like:
+
+[source,js]
+--------------------------------------------------
+POST /condition_example/_analyze
+{
+  "analyzer" : "my_analyzer",
+  "text" : "What Flapdoodle"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "Flapdoodle",        <1>
+      "start_offset": 5,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1                 <2>
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+<1> The token 'What' has been removed from the tokenstream because it does not
+match the predicate.
+<2> The position and offset values are unaffected by the removal of earlier tokens.
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java
@ -19,6 +19,13 @@

 package org.elasticsearch.analysis.common;

+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.elasticsearch.script.ScriptContext;

 /**
@ -30,21 +37,40 @@ public abstract class AnalysisPredicateScript {
     * Encapsulation of the state of the current token
     */
    public static class Token {
-        public CharSequence term;
-        public int pos;
-        public int posInc;
-        public int posLen;
-        public int startOffset;
-        public int endOffset;
-        public String type;
-        public boolean isKeyword;
+
+        private final CharTermAttribute termAtt;
+        private final PositionIncrementAttribute posIncAtt;
+        private final PositionLengthAttribute posLenAtt;
+        private final OffsetAttribute offsetAtt;
+        private final TypeAttribute typeAtt;
+        private final KeywordAttribute keywordAtt;
+
+        // posInc is always 1 at the beginning of a tokenstream and the convention
+        // from the _analyze endpoint is that tokenstream positions are 0-based
+        private int pos = -1;
+
+        /**
+         * Create a token exposing values from an AttributeSource
+         */
+        public Token(AttributeSource source) {
+            this.termAtt = source.addAttribute(CharTermAttribute.class);
+            this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
+            this.posLenAtt = source.addAttribute(PositionLengthAttribute.class);
+            this.offsetAtt = source.addAttribute(OffsetAttribute.class);
+            this.typeAtt = source.addAttribute(TypeAttribute.class);
+            this.keywordAtt = source.addAttribute(KeywordAttribute.class);
+        }
+
+        public void updatePosition() {
+            this.pos = this.pos + posIncAtt.getPositionIncrement();
+        }

        public CharSequence getTerm() {
-            return term;
+            return termAtt;
        }

        public int getPositionIncrement() {
-            return posInc;
+            return posIncAtt.getPositionIncrement();
        }

        public int getPosition() {
@ -52,23 +78,23 @@ public abstract class AnalysisPredicateScript {
        }

        public int getPositionLength() {
-            return posLen;
+            return posLenAtt.getPositionLength();
        }

        public int getStartOffset() {
-            return startOffset;
+            return offsetAtt.startOffset();
        }

        public int getEndOffset() {
-            return endOffset;
+            return offsetAtt.endOffset();
        }

        public String getType() {
-            return type;
+            return typeAtt.type();
        }

        public boolean isKeyword() {
-            return isKeyword;
+            return keywordAtt.isKeyword();
        }
    }

--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@ -264,6 +264,8 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
        filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
        filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
        filters.put("porter_stem", PorterStemTokenFilterFactory::new);
+        filters.put("predicate_token_filter",
+            requiresAnalysisSettings((i, e, n, s) -> new PredicateTokenFilterScriptFactory(i, n, s, scriptService.get())));
        filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new);
        filters.put("reverse", ReverseTokenFilterFactory::new);
        filters.put("russian_stem", RussianStemTokenFilterFactory::new);
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PredicateTokenFilterScriptFactory.java
@ -0,0 +1,73 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.script.Script;
+import org.elasticsearch.script.ScriptService;
+import org.elasticsearch.script.ScriptType;
+
+import java.io.IOException;
+
+/**
+ * A factory for creating FilteringTokenFilters that determine whether or not to
+ * accept their underlying token by consulting a script
+ */
+public class PredicateTokenFilterScriptFactory extends AbstractTokenFilterFactory {
+
+    private final AnalysisPredicateScript.Factory factory;
+
+    /**
+     * Compiles the predicate script configured under the {@code script} setting.
+     *
+     * @param indexSettings settings of the index this filter is created for
+     * @param name          name of the token filter, used in error messages
+     * @param settings      settings of the filter; the {@code script} section is parsed as the predicate
+     * @param scriptService service used to compile the script against {@link AnalysisPredicateScript#CONTEXT}
+     * @throws IllegalArgumentException if the configured script is not an inline script
+     */
+    public PredicateTokenFilterScriptFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) {
+        super(indexSettings, name, settings);
+        Settings scriptSettings = settings.getAsSettings("script");
+        Script script = Script.parse(scriptSettings);
+        if (script.getType() != ScriptType.INLINE) {
+            throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]");
+        }
+        this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT);
+    }
+
+    /**
+     * Wraps the given stream in a filter that only emits tokens accepted by a
+     * fresh instance of the compiled predicate script.
+     */
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new ScriptFilteringTokenFilter(tokenStream, factory.newInstance());
+    }
+
+    /**
+     * A {@link FilteringTokenFilter} that delegates the accept/reject decision
+     * for each token to an {@link AnalysisPredicateScript}.
+     */
+    private static class ScriptFilteringTokenFilter extends FilteringTokenFilter {
+
+        final AnalysisPredicateScript script;
+        // Not final: rebuilt in reset() so that the running position restarts for every stream
+        AnalysisPredicateScript.Token token;
+
+        ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) {
+            super(in);
+            this.script = script;
+            this.token = new AnalysisPredicateScript.Token(this);
+        }
+
+        @Override
+        protected boolean accept() throws IOException {
+            // Advance the running position before the script looks at the token
+            token.updatePosition();
+            return script.execute(token);
+        }
+
+        @Override
+        public void reset() throws IOException {
+            super.reset();
+            // Filter instances are reused across inputs; without a fresh Token the position
+            // seen by the script would keep growing from one stream to the next.
+            this.token = new AnalysisPredicateScript.Token(this);
+        }
+    }
+}
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java
@ -21,12 +21,6 @@ package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
@ -36,6 +30,7 @@ import org.elasticsearch.script.Script;
 import org.elasticsearch.script.ScriptService;
 import org.elasticsearch.script.ScriptType;

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@ -76,30 +71,26 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
            }
            return in;
        };
-        AnalysisPredicateScript script = factory.newInstance();
-        final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();
-        return new ConditionalTokenFilter(tokenStream, filter) {
+        return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
+    }

-            CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-            PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
-            OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-            TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-            KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+    private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
+
+        private final AnalysisPredicateScript script;
+        private final AnalysisPredicateScript.Token token;
+
+        ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
+                                               AnalysisPredicateScript script) {
+            super(input, inputFactory);
+            this.script = script;
+            this.token = new AnalysisPredicateScript.Token(this);
+        }

        @Override
-            protected boolean shouldFilter() {
-                token.term = termAtt;
-                token.posInc = posIncAtt.getPositionIncrement();
-                token.pos += token.posInc;
-                token.posLen = posLenAtt.getPositionLength();
-                token.startOffset = offsetAtt.startOffset();
-                token.endOffset = offsetAtt.endOffset();
-                token.type = typeAtt.type();
-                token.isKeyword = keywordAtt.isKeyword();
+        protected boolean shouldFilter() throws IOException {
+            token.updatePosition();
            return script.execute(token);
        }
-        };
    }

    @Override
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PredicateTokenScriptFilterTests.java
@ -0,0 +1,89 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.script.Script;
+import org.elasticsearch.script.ScriptContext;
+import org.elasticsearch.script.ScriptService;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+import java.io.IOException;
+import java.util.Collections;
+
+/**
+ * Checks that a {@code predicate_token_filter} configured through index settings
+ * drops the tokens rejected by its predicate script.
+ */
+public class PredicateTokenScriptFilterTests extends ESTokenStreamTestCase {
+
+    public void testSimpleFilter() throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("index.analysis.filter.f.type", "predicate_token_filter")
+            .put("index.analysis.filter.f.script.source", "token.getTerm().length() > 5")
+            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
+            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.myAnalyzer.filter", "f")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+        // Java stand-in for the configured script source; no script engine is involved
+        AnalysisPredicateScript.Factory factory = () -> new AnalysisPredicateScript() {
+            @Override
+            public boolean execute(Token token) {
+                return token.getTerm().length() > 5;
+            }
+        };
+
+        // Script service stub: verifies what the filter factory asks to compile and
+        // hands back the Java predicate above
+        @SuppressWarnings("unchecked")
+        ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){
+            @Override
+            public <FactoryType> FactoryType compile(Script script, ScriptContext<FactoryType> context) {
+                // expected value first, so that a failure message reads the right way round
+                assertEquals(AnalysisPredicateScript.CONTEXT, context);
+                assertEquals(new Script("token.getTerm().length() > 5"), script);
+                return (FactoryType) factory;
+            }
+        };
+
+        CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
+        plugin.createComponents(null, null, null, null, scriptService, null, null, null, null);
+        AnalysisModule module
+            = new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));
+
+        IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings);
+
+        try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) {
+            assertNotNull(analyzer);
+            // "Durch" has exactly 5 characters and is therefore rejected by the predicate
+            assertAnalyzesTo(analyzer, "Vorsprung Durch Technik", new String[]{
+                "Vorsprung", "Technik"
+            });
+        }
+
+    }
+
+}
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml
@ -28,9 +28,44 @@
              - type: condition
                filter: [ "lowercase" ]
                script:
-                  source: "token.position > 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)"
+                  source: "token.position >= 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)"

    - length: { tokens: 3 }
    - match: { tokens.0.token: "Vorsprung" }
    - match: { tokens.1.token: "durch" }
    - match: { tokens.2.token: "technik" }
+
+---
+"script_filter":
+    - do:
+        indices.analyze:
+          body:
+            text: "Vorsprung Durch Technik"
+            tokenizer: "whitespace"
+            filter:
+              - type: predicate_token_filter
+                script:
+                  source: "token.term.length() > 5"
+
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: "Vorsprung" }
+    - match: { tokens.1.token: "Technik" }
+
+---
+"script_filter_position":
+    - do:
+        indices.analyze:
+          body:
+            text: "a b c d e f g h"
+            tokenizer: "whitespace"
+            filter:
+              - type: predicate_token_filter
+                script:
+                  source: "token.position >= 4"
+
+    - length: { tokens: 4 }
+    - match: { tokens.0.token: "e" }
+    - match: { tokens.1.token: "f" }
+    - match: { tokens.2.token: "g" }
+    - match: { tokens.3.token: "h" }
+