Add predicate_token_filter (#33431)
This allows users to filter out tokens from a TokenStream using Painless scripts, instead of having to write specialised Java code and package it up into a plugin. The commit also refactors the AnalysisPredicateScript.Token class so that it wraps an AttributeSource and exposes it read-only.
Parent: a55fa4fd6b
Commit: f598297f55
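For orientation before the file-by-file diff: the heart of the change is a FilteringTokenFilter whose accept() method consults the compiled predicate through a read-only Token view of the filter's own AttributeSource. The sketch below is condensed from the new PredicateTokenFilterScriptFactory further down; only the comments are added.

[source,java]
--------------------------------------------------
// Condensed from PredicateTokenFilterScriptFactory in this commit.
private static class ScriptFilteringTokenFilter extends FilteringTokenFilter {

    final AnalysisPredicateScript script;
    final AnalysisPredicateScript.Token token;

    ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) {
        super(in);
        this.script = script;
        // The filter itself is the AttributeSource; Token exposes only getters,
        // so scripts can inspect but never mutate the stream.
        this.token = new AnalysisPredicateScript.Token(this);
    }

    @Override
    protected boolean accept() throws IOException {
        token.updatePosition();       // keep the 0-based token position up to date
        return script.execute(token); // false -> token is dropped from the stream
    }
}
--------------------------------------------------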
docs/reference/analysis/tokenfilters.asciidoc:
@@ -37,6 +37,8 @@ include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
 
 include::tokenfilters/condition-tokenfilter.asciidoc[]
 
+include::tokenfilters/predicate-tokenfilter.asciidoc[]
+
 include::tokenfilters/stemmer-tokenfilter.asciidoc[]
 
 include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
docs/reference/analysis/tokenfilters/predicate-tokenfilter.asciidoc (new file, +79 lines):

[[analysis-predicatefilter-tokenfilter]]
=== Predicate Token Filter Script

The `predicate_token_filter` token filter takes a predicate script, and removes
tokens that do not match the predicate.

[float]
=== Options
[horizontal]
script:: a predicate script that determines whether or not the current token will
be emitted. Note that only inline scripts are supported.

[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /condition_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_script_filter" ]
        }
      },
      "filter" : {
        "my_script_filter" : {
          "type" : "predicate_token_filter",
          "script" : {
            "source" : "token.getTerm().length() > 5" <1>
          }
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

<1> This will emit tokens that are more than 5 characters long

And test it like:

[source,js]
--------------------------------------------------
POST /condition_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "What Flapdoodle"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "Flapdoodle", <1>
      "start_offset": 5,
      "end_offset": 15,
      "type": "<ALPHANUM>",
      "position": 1 <2>
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE

<1> The token 'What' has been removed from the tokenstream because it does not
match the predicate.
<2> The position and offset values are unaffected by the removal of earlier tokens.
AnalysisPredicateScript.java:
@@ -19,6 +19,13 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.elasticsearch.script.ScriptContext;
 
 /**
@@ -30,21 +37,40 @@ public abstract class AnalysisPredicateScript {
      * Encapsulation of the state of the current token
      */
     public static class Token {
-        public CharSequence term;
-        public int pos;
-        public int posInc;
-        public int posLen;
-        public int startOffset;
-        public int endOffset;
-        public String type;
-        public boolean isKeyword;
+
+        private final CharTermAttribute termAtt;
+        private final PositionIncrementAttribute posIncAtt;
+        private final PositionLengthAttribute posLenAtt;
+        private final OffsetAttribute offsetAtt;
+        private final TypeAttribute typeAtt;
+        private final KeywordAttribute keywordAtt;
+
+        // posInc is always 1 at the beginning of a tokenstream and the convention
+        // from the _analyze endpoint is that tokenstream positions are 0-based
+        private int pos = -1;
+
+        /**
+         * Create a token exposing values from an AttributeSource
+         */
+        public Token(AttributeSource source) {
+            this.termAtt = source.addAttribute(CharTermAttribute.class);
+            this.posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
+            this.posLenAtt = source.addAttribute(PositionLengthAttribute.class);
+            this.offsetAtt = source.addAttribute(OffsetAttribute.class);
+            this.typeAtt = source.addAttribute(TypeAttribute.class);
+            this.keywordAtt = source.addAttribute(KeywordAttribute.class);
+        }
+
+        public void updatePosition() {
+            this.pos = this.pos + posIncAtt.getPositionIncrement();
+        }
 
         public CharSequence getTerm() {
-            return term;
+            return termAtt;
         }
 
         public int getPositionIncrement() {
-            return posInc;
+            return posIncAtt.getPositionIncrement();
         }
 
         public int getPosition() {
@@ -52,23 +78,23 @@ public abstract class AnalysisPredicateScript {
         }
 
         public int getPositionLength() {
-            return posLen;
+            return posLenAtt.getPositionLength();
         }
 
         public int getStartOffset() {
-            return startOffset;
+            return offsetAtt.startOffset();
         }
 
         public int getEndOffset() {
-            return endOffset;
+            return offsetAtt.endOffset();
        }
 
         public String getType() {
-            return type;
+            return typeAtt.type();
         }
 
         public boolean isKeyword() {
-            return isKeyword;
+            return keywordAtt.isKeyword();
         }
     }
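A note on the `pos = -1` starting value above: the first token's position increment of 1 brings the position to 0, which matches the 0-based positions reported by _analyze. A minimal standalone sketch of the same accumulation (assumes Lucene's analysis classes on the classpath; PositionDemo is illustrative and not part of the commit):

[source,java]
--------------------------------------------------
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.io.StringReader;

public class PositionDemo {
    public static void main(String[] args) throws IOException {
        try (Tokenizer tok = new WhitespaceTokenizer()) {
            tok.setReader(new StringReader("a b c"));
            CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posInc = tok.addAttribute(PositionIncrementAttribute.class);
            tok.reset();
            int pos = -1; // same starting value as AnalysisPredicateScript.Token
            while (tok.incrementToken()) {
                pos += posInc.getPositionIncrement(); // first increment of 1 yields 0
                System.out.println(term + " -> position " + pos); // prints 0, 1, 2
            }
            tok.end();
        }
    }
}
--------------------------------------------------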
CommonAnalysisPlugin.java:
@@ -264,6 +264,8 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin {
         filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
         filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
         filters.put("porter_stem", PorterStemTokenFilterFactory::new);
+        filters.put("predicate_token_filter",
+            requiresAnalysisSettings((i, e, n, s) -> new PredicateTokenFilterScriptFactory(i, n, s, scriptService.get())));
         filters.put("remove_duplicates", RemoveDuplicatesTokenFilterFactory::new);
         filters.put("reverse", ReverseTokenFilterFactory::new);
         filters.put("russian_stem", RussianStemTokenFilterFactory::new);
PredicateTokenFilterScriptFactory.java (new file, +73 lines):

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptType;

import java.io.IOException;

/**
 * A factory for creating FilteringTokenFilters that determine whether or not to
 * accept their underlying token by consulting a script
 */
public class PredicateTokenFilterScriptFactory extends AbstractTokenFilterFactory {

    private final AnalysisPredicateScript.Factory factory;

    public PredicateTokenFilterScriptFactory(IndexSettings indexSettings, String name, Settings settings, ScriptService scriptService) {
        super(indexSettings, name, settings);
        Settings scriptSettings = settings.getAsSettings("script");
        Script script = Script.parse(scriptSettings);
        if (script.getType() != ScriptType.INLINE) {
            throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]");
        }
        this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ScriptFilteringTokenFilter(tokenStream, factory.newInstance());
    }

    private static class ScriptFilteringTokenFilter extends FilteringTokenFilter {

        final AnalysisPredicateScript script;
        final AnalysisPredicateScript.Token token;

        ScriptFilteringTokenFilter(TokenStream in, AnalysisPredicateScript script) {
            super(in);
            this.script = script;
            this.token = new AnalysisPredicateScript.Token(this);
        }

        @Override
        protected boolean accept() throws IOException {
            token.updatePosition();
            return script.execute(token);
        }
    }
}
ScriptedConditionTokenFilterFactory.java:
@@ -21,12 +21,6 @@ package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
@@ -36,6 +30,7 @@ import org.elasticsearch.script.Script;
 import org.elasticsearch.script.ScriptService;
 import org.elasticsearch.script.ScriptType;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -76,30 +71,26 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory {
             }
             return in;
         };
-        AnalysisPredicateScript script = factory.newInstance();
-        final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();
-        return new ConditionalTokenFilter(tokenStream, filter) {
-
-            CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-            PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
-            OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-            TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-            KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
-
-            @Override
-            protected boolean shouldFilter() {
-                token.term = termAtt;
-                token.posInc = posIncAtt.getPositionIncrement();
-                token.pos += token.posInc;
-                token.posLen = posLenAtt.getPositionLength();
-                token.startOffset = offsetAtt.startOffset();
-                token.endOffset = offsetAtt.endOffset();
-                token.type = typeAtt.type();
-                token.isKeyword = keywordAtt.isKeyword();
-                return script.execute(token);
-            }
-        };
+        return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
+    }
+
+    private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
+
+        private final AnalysisPredicateScript script;
+        private final AnalysisPredicateScript.Token token;
+
+        ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
+                                     AnalysisPredicateScript script) {
+            super(input, inputFactory);
+            this.script = script;
+            this.token = new AnalysisPredicateScript.Token(this);
+        }
+
+        @Override
+        protected boolean shouldFilter() throws IOException {
+            token.updatePosition();
+            return script.execute(token);
+        }
     }
 
     @Override
PredicateTokenScriptFilterTests.java (new file, +89 lines):

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptContext;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import java.io.IOException;
import java.util.Collections;

public class PredicateTokenScriptFilterTests extends ESTokenStreamTestCase {

    public void testSimpleFilter() throws IOException {
        Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
        Settings indexSettings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.f.type", "predicate_token_filter")
            .put("index.analysis.filter.f.script.source", "token.getTerm().length() > 5")
            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
            .putList("index.analysis.analyzer.myAnalyzer.filter", "f")
            .build();
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

        AnalysisPredicateScript.Factory factory = () -> new AnalysisPredicateScript() {
            @Override
            public boolean execute(Token token) {
                return token.getTerm().length() > 5;
            }
        };

        @SuppressWarnings("unchecked")
        ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){
            @Override
            public <FactoryType> FactoryType compile(Script script, ScriptContext<FactoryType> context) {
                assertEquals(context, AnalysisPredicateScript.CONTEXT);
                assertEquals(new Script("token.getTerm().length() > 5"), script);
                return (FactoryType) factory;
            }
        };

        CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
        plugin.createComponents(null, null, null, null, scriptService, null, null, null, null);
        AnalysisModule module
            = new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));

        IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings);

        try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) {
            assertNotNull(analyzer);
            assertAnalyzesTo(analyzer, "Vorsprung Durch Technik", new String[]{
                "Vorsprung", "Technik"
            });
        }

    }

}
analysis-common REST tests (yml):
@@ -28,9 +28,44 @@
           - type: condition
             filter: [ "lowercase" ]
             script:
-              source: "token.position > 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)"
+              source: "token.position >= 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)"
 
   - length: { tokens: 3 }
   - match: { tokens.0.token: "Vorsprung" }
   - match: { tokens.1.token: "durch" }
   - match: { tokens.2.token: "technik" }
+
+---
+"script_filter":
+  - do:
+      indices.analyze:
+        body:
+          text: "Vorsprung Durch Technik"
+          tokenizer: "whitespace"
+          filter:
+            - type: predicate_token_filter
+              script:
+                source: "token.term.length() > 5"
+
+  - length: { tokens: 2 }
+  - match: { tokens.0.token: "Vorsprung" }
+  - match: { tokens.1.token: "Technik" }
+
+---
+"script_filter_position":
+  - do:
+      indices.analyze:
+        body:
+          text: "a b c d e f g h"
+          tokenizer: "whitespace"
+          filter:
+            - type: predicate_token_filter
+              script:
+                source: "token.position >= 4"
+
+  - length: { tokens: 4 }
+  - match: { tokens.0.token: "e" }
+  - match: { tokens.1.token: "f" }
+  - match: { tokens.2.token: "g" }
+  - match: { tokens.3.token: "h" }