[Docs] Add example to reimplement stempel analyzer (#42676)
Adding an example of how to re-implement the Polish stempel analyzer in case a user wants to modify or extend it. So that the analyzer can use Polish stopwords, this change also registers a polish_stop token filter for the stempel plugin. Closes #13150
parent df0f0b3d40
commit 9a9ee9abed
Changed paths: docs/plugins, plugins/analysis-stempel/src/main/java/org/elasticsearch
@@ -12,7 +12,107 @@ include::install_remove.asciidoc[]
 [[analysis-stempel-tokenizer]]
 [float]
-==== `stempel` tokenizer and token filter
+==== `stempel` tokenizer and token filters
 
-The plugin provides the `polish` analyzer and `polish_stem` token filter,
+The plugin provides the `polish` analyzer and the `polish_stem` and `polish_stop` token filters,
 which are not configurable.
 
+==== Reimplementing and extending the analyzers
+
+The `polish` analyzer could be reimplemented as a `custom` analyzer that can
+then be extended and configured differently as follows:
+
+[source,js]
+----------------------------------------------------
+PUT /stempel_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "rebuilt_stempel": {
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "polish_stop",
+            "polish_stem"
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: stempel_example, first: polish, second: rebuilt_stempel}\nendyaml\n/]
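To illustrate the "configured differently" part, the rebuilt analyzer could swap its stopword filter for a configured `polish_stop` filter (described in the next section). A minimal sketch, not part of this commit's diff; the index name, the `custom_polish_stop` filter name, and the extra stopword are chosen only for illustration:

[source,js]
----------------------------------------------------
PUT /stempel_example_custom
{
  "settings": {
    "analysis": {
      "filter": {
        "custom_polish_stop": {
          "type": "polish_stop",
          "stopwords": ["_polish_", "jeść"]
        }
      },
      "analyzer": {
        "rebuilt_stempel": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "custom_polish_stop",
            "polish_stem"
          ]
        }
      }
    }
  }
}
----------------------------------------------------

Apart from the extra stopword, this behaves like the `rebuilt_stempel` analyzer above.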
+
+[[analysis-polish-stop]]
+==== `polish_stop` token filter
+
+The `polish_stop` token filter filters out Polish stopwords (`_polish_`), and
+any other custom stopwords specified by the user. This filter only supports
+the predefined `_polish_` stopwords list. If you want to use a different
+predefined list, then use the
+{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
+
+[source,js]
+--------------------------------------------------
+PUT /polish_stop_example
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "analyzer_with_stop": {
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "polish_stop"
+            ]
+          }
+        },
+        "filter": {
+          "polish_stop": {
+            "type": "polish_stop",
+            "stopwords": [
+              "_polish_",
+              "jeść"
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET polish_stop_example/_analyze
+{
+  "analyzer": "analyzer_with_stop",
+  "text": "Gdzie kucharek sześć, tam nie ma co jeść."
+}
+--------------------------------------------------
+// CONSOLE
+
+The above request returns:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "kucharek",
+      "start_offset" : 6,
+      "end_offset" : 14,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "sześć",
+      "start_offset" : 15,
+      "end_offset" : 20,
+      "type" : "<ALPHANUM>",
+      "position" : 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
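Judging from the `PolishStopTokenFilterFactory` source added below, a custom `polish_stop` filter also appears to accept `ignore_case` (default `false`) and `remove_trailing` (default `true`) settings in addition to `stopwords`. A minimal sketch, not part of this commit's diff; the index and filter names are chosen only for illustration:

[source,js]
--------------------------------------------------
PUT /polish_stop_options_example
{
  "settings": {
    "analysis": {
      "filter": {
        "polish_stop_ci": {
          "type": "polish_stop",
          "stopwords": ["_polish_", "jeść"],
          "ignore_case": true,
          "remove_trailing": false
        }
      }
    }
  }
}
--------------------------------------------------

With `remove_trailing` set to `false`, the factory wraps the token stream in a `SuggestStopFilter` instead of a plain `StopFilter`, which is mainly useful for suggester-style analysis where a trailing stopword may still be being typed.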
plugins/analysis-stempel/src/main/java/org/elasticsearch/index/analysis/pl/PolishStopTokenFilterFactory.java (new file, 73 lines)
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis.pl;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pl.PolishAnalyzer;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;

import java.util.Map;
import java.util.Set;

import static java.util.Collections.singletonMap;

public class PolishStopTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final Map<String, Set<?>> NAMED_STOP_WORDS = singletonMap("_polish_", PolishAnalyzer.getDefaultStopSet());

    private final CharArraySet stopWords;

    private final boolean ignoreCase;

    private final boolean removeTrailing;

    public PolishStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
        this.stopWords = Analysis.parseWords(env, settings, "stopwords",
                PolishAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (removeTrailing) {
            return new StopFilter(tokenStream, stopWords);
        } else {
            return new SuggestStopFilter(tokenStream, stopWords);
        }
    }

    public Set<?> stopWords() {
        return stopWords;
    }

    public boolean ignoreCase() {
        return ignoreCase;
    }

}
@@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.pl.PolishAnalyzerProvider;
 import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.pl.PolishStopTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
@@ -35,7 +36,8 @@ import static java.util.Collections.singletonMap;
 public class AnalysisStempelPlugin extends Plugin implements AnalysisPlugin {
     @Override
     public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
-        return singletonMap("polish_stem", PolishStemTokenFilterFactory::new);
+        return Map.of("polish_stem", PolishStemTokenFilterFactory::new,
+            "polish_stop", PolishStopTokenFilterFactory::new);
     }
 
     @Override