[Docs] Add example to reimplement stempel analyzer (#42676)
Adding an example of how to re-implement the Polish stempel analyzer in case a user wants to modify or extend it. So that the analyzer can use Polish stopwords, this change also registers a polish_stop token filter for the stempel plugin. Closes #13150
parent df0f0b3d40
commit 9a9ee9abed
Changed paths: docs/plugins, plugins/analysis-stempel/src/main/java/org/elasticsearch
@@ -12,7 +12,107 @@ include::install_remove.asciidoc[]
 [[analysis-stempel-tokenizer]]
 [float]
-==== `stempel` tokenizer and token filter
+==== `stempel` tokenizer and token filters
 
-The plugin provides the `polish` analyzer and `polish_stem` token filter,
+The plugin provides the `polish` analyzer and the `polish_stem` and `polish_stop` token filters,
 which are not configurable.
 
+==== Reimplementing and extending the analyzers
+
+The `polish` analyzer could be reimplemented as a `custom` analyzer that can
+then be extended and configured differently as follows:
+
+[source,js]
+----------------------------------------------------
+PUT /stempel_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "rebuilt_stempel": {
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "polish_stop",
+            "polish_stem"
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: stempel_example, first: polish, second: rebuilt_stempel}\nendyaml\n/]
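To illustrate the "configured differently" part, the rebuilt analyzer could swap its stopword filter for a configured `polish_stop` filter (described in the next section). A minimal sketch, not part of this commit's diff; the index name, the `custom_polish_stop` filter name, and the extra stopword are chosen only for illustration:

[source,js]
----------------------------------------------------
PUT /stempel_example_custom
{
  "settings": {
    "analysis": {
      "filter": {
        "custom_polish_stop": {
          "type": "polish_stop",
          "stopwords": ["_polish_", "jeść"]
        }
      },
      "analyzer": {
        "rebuilt_stempel": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "custom_polish_stop",
            "polish_stem"
          ]
        }
      }
    }
  }
}
----------------------------------------------------

Apart from the extra stopword, this behaves like the `rebuilt_stempel` analyzer above.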
+
+[[analysis-polish-stop]]
+==== `polish_stop` token filter
+
+The `polish_stop` token filter filters out Polish stopwords (`_polish_`), and
+any other custom stopwords specified by the user. This filter only supports
+the predefined `_polish_` stopwords list. If you want to use a different
+predefined list, then use the
+{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
+
+[source,js]
+--------------------------------------------------
+PUT /polish_stop_example
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "analyzer_with_stop": {
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase",
+              "polish_stop"
+            ]
+          }
+        },
+        "filter": {
+          "polish_stop": {
+            "type": "polish_stop",
+            "stopwords": [
+              "_polish_",
+              "jeść"
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET polish_stop_example/_analyze
+{
+  "analyzer": "analyzer_with_stop",
+  "text": "Gdzie kucharek sześć, tam nie ma co jeść."
+}
+--------------------------------------------------
+// CONSOLE
+
+The above request returns:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "kucharek",
+      "start_offset" : 6,
+      "end_offset" : 14,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "sześć",
+      "start_offset" : 15,
+      "end_offset" : 20,
+      "type" : "<ALPHANUM>",
+      "position" : 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
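Judging from the `PolishStopTokenFilterFactory` source added below, a custom `polish_stop` filter also appears to accept `ignore_case` (default `false`) and `remove_trailing` (default `true`) settings in addition to `stopwords`. A minimal sketch, not part of this commit's diff; the index and filter names are chosen only for illustration:

[source,js]
--------------------------------------------------
PUT /polish_stop_options_example
{
  "settings": {
    "analysis": {
      "filter": {
        "polish_stop_ci": {
          "type": "polish_stop",
          "stopwords": ["_polish_", "jeść"],
          "ignore_case": true,
          "remove_trailing": false
        }
      }
    }
  }
}
--------------------------------------------------

With `remove_trailing` set to `false`, the factory wraps the token stream in a `SuggestStopFilter` instead of a plain `StopFilter`, which is mainly useful for suggester-style analysis where a trailing stopword may still be being typed.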
plugins/analysis-stempel/src/main/java/org/elasticsearch/index/analysis/pl/PolishStopTokenFilterFactory.java (new file, 73 lines)
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis.pl;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pl.PolishAnalyzer;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;

import java.util.Map;
import java.util.Set;

import static java.util.Collections.singletonMap;

public class PolishStopTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final Map<String, Set<?>> NAMED_STOP_WORDS = singletonMap("_polish_", PolishAnalyzer.getDefaultStopSet());

    private final CharArraySet stopWords;

    private final boolean ignoreCase;

    private final boolean removeTrailing;

    public PolishStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
        this.stopWords = Analysis.parseWords(env, settings, "stopwords",
                PolishAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (removeTrailing) {
            return new StopFilter(tokenStream, stopWords);
        } else {
            return new SuggestStopFilter(tokenStream, stopWords);
        }
    }

    public Set<?> stopWords() {
        return stopWords;
    }

    public boolean ignoreCase() {
        return ignoreCase;
    }

}
@@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.pl.PolishAnalyzerProvider;
 import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.pl.PolishStopTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
@@ -35,7 +36,8 @@ import static java.util.Collections.singletonMap;
 public class AnalysisStempelPlugin extends Plugin implements AnalysisPlugin {
     @Override
     public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
-        return singletonMap("polish_stem", PolishStemTokenFilterFactory::new);
+        return Map.of("polish_stem", PolishStemTokenFilterFactory::new,
+            "polish_stop", PolishStopTokenFilterFactory::new);
     }
 
     @Override