[Docs] Add example to reimplement stempel analyzer (#42676)

Adding an example of how to re-implement the Polish stempel analyzer
in case a user wants to modify or extend it. So that the rebuilt analyzer
can use Polish stopwords, this change also registers a `polish_stop` token
filter in the stempel plugin.

Closes #13150
Christoph Büscher 2019-06-03 13:22:10 +02:00
parent df0f0b3d40
commit 9a9ee9abed
3 changed files with 178 additions and 3 deletions


@@ -12,7 +12,107 @@ include::install_remove.asciidoc[]
[[analysis-stempel-tokenizer]]
[float]
-==== `stempel` tokenizer and token filter
+==== `stempel` tokenizer and token filters

-The plugin provides the `polish` analyzer and `polish_stem` token filter,
+The plugin provides the `polish` analyzer and the `polish_stem` and `polish_stop` token filters,
which are not configurable.
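
For a quick sanity check (not part of this commit), the built-in `polish` analyzer
can be exercised directly through the `_analyze` API; the exact tokens it produces
depend on the stemmer tables shipped with the plugin:

[source,js]
--------------------------------------------------
GET /_analyze
{
  "analyzer": "polish",
  "text": "Studenci uczą się języka polskiego."
}
--------------------------------------------------
// CONSOLE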

==== Reimplementing and extending the analyzers

The `polish` analyzer could be reimplemented as a `custom` analyzer that can
then be extended and configured differently as follows:

[source,js]
----------------------------------------------------
PUT /stempel_example
{
"settings": {
"analysis": {
"analyzer": {
"rebuilt_stempel": {
"tokenizer": "standard",
"filter": [
"lowercase",
"polish_stop",
"polish_stem"
]
}
}
}
}
}
----------------------------------------------------
// CONSOLE
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: stempel_example, first: polish, second: rebuilt_stempel}\nendyaml\n/]
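
For illustration (again, not part of this change), the rebuilt analyzer could then
be tried against the newly created index; it should tokenize, lowercase, strip
stopwords and stem in the same way as the built-in `polish` analyzer:

[source,js]
--------------------------------------------------
GET /stempel_example/_analyze
{
  "analyzer": "rebuilt_stempel",
  "text": "Gdzie kucharek sześć, tam nie ma co jeść."
}
--------------------------------------------------
// CONSOLE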

[[analysis-polish-stop]]
==== `polish_stop` token filter

The `polish_stop` token filter filters out Polish stopwords (`_polish_`), and
any other custom stopwords specified by the user. This filter only supports
the predefined `_polish_` stopwords list. If you want to use a different
predefined list, then use the
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.

[source,js]
--------------------------------------------------
PUT /polish_stop_example
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"analyzer_with_stop": {
"tokenizer": "standard",
"filter": [
"lowercase",
"polish_stop"
]
}
},
"filter": {
"polish_stop": {
"type": "polish_stop",
"stopwords": [
"_polish_",
"jeść"
]
}
}
}
}
}
}

GET polish_stop_example/_analyze
{
"analyzer": "analyzer_with_stop",
"text": "Gdzie kucharek sześć, tam nie ma co jeść."
}
--------------------------------------------------
// CONSOLE

The above request returns:

[source,js]
--------------------------------------------------
{
"tokens" : [
{
"token" : "kucharek",
"start_offset" : 6,
"end_offset" : 14,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "sześć",
"start_offset" : 15,
"end_offset" : 20,
"type" : "<ALPHANUM>",
"position" : 2
}
]
}
--------------------------------------------------
// TESTRESPONSE
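
As an aside (not part of this commit), here is a rough sketch of the alternative
mentioned above, where the generic `stop` token filter is configured with a
different predefined stopwords list; the index and filter names are only placeholders:

[source,js]
--------------------------------------------------
PUT /generic_stop_example
{
  "settings": {
    "analysis": {
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE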


@@ -0,0 +1,73 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.pl;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pl.PolishAnalyzer;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;

import java.util.Map;
import java.util.Set;

import static java.util.Collections.singletonMap;

/**
 * Factory for the {@code polish_stop} token filter, which removes Polish stopwords.
 * It defaults to Lucene's predefined {@code _polish_} list and also accepts
 * user-supplied words via the {@code stopwords} setting.
 */
public class PolishStopTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final Map<String, Set<?>> NAMED_STOP_WORDS = singletonMap("_polish_", PolishAnalyzer.getDefaultStopSet());

    private final CharArraySet stopWords;
    private final boolean ignoreCase;
    private final boolean removeTrailing;

    public PolishStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
        this.stopWords = Analysis.parseWords(env, settings, "stopwords",
            PolishAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (removeTrailing) {
            return new StopFilter(tokenStream, stopWords);
        } else {
            // SuggestStopFilter keeps a trailing stopword that is not yet followed by a
            // token separator, which is the behavior suggesters typically want
            return new SuggestStopFilter(tokenStream, stopWords);
        }
    }

    public Set<?> stopWords() {
        return stopWords;
    }

    public boolean ignoreCase() {
        return ignoreCase;
    }
}


@@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.pl.PolishAnalyzerProvider;
import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
import org.elasticsearch.index.analysis.pl.PolishStopTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
@@ -35,7 +36,8 @@ import static java.util.Collections.singletonMap;
public class AnalysisStempelPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
-        return singletonMap("polish_stem", PolishStemTokenFilterFactory::new);
+        return Map.of("polish_stem", PolishStemTokenFilterFactory::new,
+            "polish_stop", PolishStopTokenFilterFactory::new);
    }
    @Override