mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-24 17:09:48 +00:00
[Docs] Add example to reimplement stempel analyzer (#42676)
Adds an example of how to re-implement the Polish stempel analyzer in case a user wants to modify or extend it. In order for the analyzer to be able to use Polish stopwords, this also registers a polish_stop filter for the stempel plugin. Closes #13150
This commit is contained in:
parent
df0f0b3d40
commit
9a9ee9abed
@ -12,7 +12,107 @@ include::install_remove.asciidoc[]
|
||||
|
||||
[[analysis-stempel-tokenizer]]
|
||||
[float]
|
||||
==== `stempel` tokenizer and token filter
|
||||
==== `stempel` tokenizer and token filters
|
||||
|
||||
The plugin provides the `polish` analyzer and `polish_stem` token filter,
|
||||
The plugin provides the `polish` analyzer and the `polish_stem` and `polish_stop` token filters,
|
||||
which are not configurable.
|
||||
|
||||
==== Reimplementing and extending the analyzers
|
||||
|
||||
The `polish` analyzer could be reimplemented as a `custom` analyzer that can
|
||||
then be extended and configured differently as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
PUT /stempel_example
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"rebuilt_stempel": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"lowercase",
|
||||
"polish_stop",
|
||||
"polish_stem"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
// CONSOLE
|
||||
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: stempel_example, first: polish, second: rebuilt_stempel}\nendyaml\n/]
|
||||
|
||||
[[analysis-polish-stop]]
|
||||
==== `polish_stop` token filter
|
||||
|
||||
The `polish_stop` token filter filters out Polish stopwords (`_polish_`), and
|
||||
any other custom stopwords specified by the user. This filter only supports
|
||||
the predefined `_polish_` stopwords list. If you want to use a different
|
||||
predefined list, then use the
|
||||
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
PUT /polish_stop_example
|
||||
{
|
||||
"settings": {
|
||||
"index": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"analyzer_with_stop": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"lowercase",
|
||||
"polish_stop"
|
||||
]
|
||||
}
|
||||
},
|
||||
"filter": {
|
||||
"polish_stop": {
|
||||
"type": "polish_stop",
|
||||
"stopwords": [
|
||||
"_polish_",
|
||||
"jeść"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET polish_stop_example/_analyze
|
||||
{
|
||||
"analyzer": "analyzer_with_stop",
|
||||
"text": "Gdzie kucharek sześć, tam nie ma co jeść."
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
|
||||
The above request returns:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"tokens" : [
|
||||
{
|
||||
"token" : "kucharek",
|
||||
"start_offset" : 6,
|
||||
"end_offset" : 14,
|
||||
"type" : "<ALPHANUM>",
|
||||
"position" : 1
|
||||
},
|
||||
{
|
||||
"token" : "sześć",
|
||||
"start_offset" : 15,
|
||||
"end_offset" : 20,
|
||||
"type" : "<ALPHANUM>",
|
||||
"position" : 2
|
||||
}
|
||||
]
|
||||
}
|
||||
--------------------------------------------------
|
||||
// TESTRESPONSE
|
||||
|
@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis.pl;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.pl.PolishAnalyzer;
|
||||
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.Analysis;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import static java.util.Collections.singletonMap;
|
||||
|
||||
public class PolishStopTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
private static final Map<String, Set<?>> NAMED_STOP_WORDS = singletonMap("_polish_", PolishAnalyzer.getDefaultStopSet());
|
||||
|
||||
private final CharArraySet stopWords;
|
||||
|
||||
private final boolean ignoreCase;
|
||||
|
||||
private final boolean removeTrailing;
|
||||
|
||||
public PolishStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
this.ignoreCase = settings.getAsBoolean("ignore_case", false);
|
||||
this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
|
||||
this.stopWords = Analysis.parseWords(env, settings, "stopwords",
|
||||
PolishAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
if (removeTrailing) {
|
||||
return new StopFilter(tokenStream, stopWords);
|
||||
} else {
|
||||
return new SuggestStopFilter(tokenStream, stopWords);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<?> stopWords() {
|
||||
return stopWords;
|
||||
}
|
||||
|
||||
public boolean ignoreCase() {
|
||||
return ignoreCase;
|
||||
}
|
||||
|
||||
}
|
@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.pl.PolishAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.pl.PolishStopTokenFilterFactory;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
@ -35,7 +36,8 @@ import static java.util.Collections.singletonMap;
|
||||
public class AnalysisStempelPlugin extends Plugin implements AnalysisPlugin {
|
||||
@Override
|
||||
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
|
||||
return singletonMap("polish_stem", PolishStemTokenFilterFactory::new);
|
||||
return Map.of("polish_stem", PolishStemTokenFilterFactory::new,
|
||||
"polish_stop", PolishStopTokenFilterFactory::new);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
Loading…
x
Reference in New Issue
Block a user