Improve documentation for smart_cn analyzer (#42822)
This commit is contained in:
parent
9def454ea9
commit
81a3b6e2fe
|
@ -17,7 +17,415 @@ include::install_remove.asciidoc[]
|
|||
[float]
|
||||
==== `smartcn` tokenizer and token filter
|
||||
|
||||
The plugin provides the `smartcn` analyzer and `smartcn_tokenizer` tokenizer,
|
||||
which are not configurable.
|
||||
The plugin provides the `smartcn` analyzer, `smartcn_tokenizer` tokenizer, and
|
||||
`smartcn_stop` token filter which are not configurable.
|
||||
|
||||
NOTE: The `smartcn_word` token filter and `smartcn_sentence` tokenizer have been deprecated.
|
||||
|
||||
==== Reimplementing and extending the analyzers
|
||||
|
||||
The `smartcn` analyzer could be reimplemented as a `custom` analyzer that can
|
||||
then be extended and configured as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
PUT smartcn_example
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"rebuilt_smartcn": {
|
||||
"tokenizer": "smartcn_tokenizer",
|
||||
"filter": [
|
||||
"porter_stem",
|
||||
"smartcn_stop"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
// CONSOLE
|
||||
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: smartcn_example, first: smartcn, second: rebuilt_smartcn}\nendyaml\n/]
|
||||
|
||||
[[analysis-smartcn_stop]]
|
||||
==== `smartcn_stop` token filter
|
||||
|
||||
The `smartcn_stop` token filter filters out stopwords defined by the `smartcn`
|
||||
analyzer (`_smartcn_`), and any other custom stopwords specified by the user.
|
||||
This filter only supports the predefined `_smartcn_` stopwords list.
|
||||
If you want to use a different predefined list, then use the
|
||||
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
PUT smartcn_example
|
||||
{
|
||||
"settings": {
|
||||
"index": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"smartcn_with_stop": {
|
||||
"tokenizer": "smartcn_tokenizer",
|
||||
"filter": [
|
||||
"porter_stem",
|
||||
"my_smartcn_stop"
|
||||
]
|
||||
}
|
||||
},
|
||||
"filter": {
|
||||
"my_smartcn_stop": {
|
||||
"type": "smartcn_stop",
|
||||
"stopwords": [
|
||||
"_smartcn_",
|
||||
"stack",
|
||||
"的"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GET smartcn_example/_analyze
|
||||
{
|
||||
"analyzer": "smartcn_with_stop",
|
||||
"text": "哈喽,我们是 Elastic 我们是 Elastic Stack(Elasticsearch、Kibana、Beats 和 Logstash)的开发公司。从股票行情到 Twitter 消息流,从 Apache 日志到 WordPress 博文,我们可以帮助人们体验搜索的强大力量,帮助他们以截然不同的方式探索和分析数据"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
|
||||
The above request returns:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "哈",
|
||||
"start_offset": 0,
|
||||
"end_offset": 1,
|
||||
"type": "word",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "喽",
|
||||
"start_offset": 1,
|
||||
"end_offset": 2,
|
||||
"type": "word",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "我们",
|
||||
"start_offset": 3,
|
||||
"end_offset": 5,
|
||||
"type": "word",
|
||||
"position": 3
|
||||
},
|
||||
{
|
||||
"token": "是",
|
||||
"start_offset": 5,
|
||||
"end_offset": 6,
|
||||
"type": "word",
|
||||
"position": 4
|
||||
},
|
||||
{
|
||||
"token": "elast",
|
||||
"start_offset": 7,
|
||||
"end_offset": 14,
|
||||
"type": "word",
|
||||
"position": 5
|
||||
},
|
||||
{
|
||||
"token": "我们",
|
||||
"start_offset": 17,
|
||||
"end_offset": 19,
|
||||
"type": "word",
|
||||
"position": 6
|
||||
},
|
||||
{
|
||||
"token": "是",
|
||||
"start_offset": 19,
|
||||
"end_offset": 20,
|
||||
"type": "word",
|
||||
"position": 7
|
||||
},
|
||||
{
|
||||
"token": "elast",
|
||||
"start_offset": 21,
|
||||
"end_offset": 28,
|
||||
"type": "word",
|
||||
"position": 8
|
||||
},
|
||||
{
|
||||
"token": "elasticsearch",
|
||||
"start_offset": 35,
|
||||
"end_offset": 48,
|
||||
"type": "word",
|
||||
"position": 11
|
||||
},
|
||||
{
|
||||
"token": "kibana",
|
||||
"start_offset": 49,
|
||||
"end_offset": 55,
|
||||
"type": "word",
|
||||
"position": 13
|
||||
},
|
||||
{
|
||||
"token": "beat",
|
||||
"start_offset": 56,
|
||||
"end_offset": 61,
|
||||
"type": "word",
|
||||
"position": 15
|
||||
},
|
||||
{
|
||||
"token": "和",
|
||||
"start_offset": 62,
|
||||
"end_offset": 63,
|
||||
"type": "word",
|
||||
"position": 16
|
||||
},
|
||||
{
|
||||
"token": "logstash",
|
||||
"start_offset": 64,
|
||||
"end_offset": 72,
|
||||
"type": "word",
|
||||
"position": 17
|
||||
},
|
||||
{
|
||||
"token": "开发",
|
||||
"start_offset": 74,
|
||||
"end_offset": 76,
|
||||
"type": "word",
|
||||
"position": 20
|
||||
},
|
||||
{
|
||||
"token": "公司",
|
||||
"start_offset": 76,
|
||||
"end_offset": 78,
|
||||
"type": "word",
|
||||
"position": 21
|
||||
},
|
||||
{
|
||||
"token": "从",
|
||||
"start_offset": 79,
|
||||
"end_offset": 80,
|
||||
"type": "word",
|
||||
"position": 23
|
||||
},
|
||||
{
|
||||
"token": "股票",
|
||||
"start_offset": 80,
|
||||
"end_offset": 82,
|
||||
"type": "word",
|
||||
"position": 24
|
||||
},
|
||||
{
|
||||
"token": "行情",
|
||||
"start_offset": 82,
|
||||
"end_offset": 84,
|
||||
"type": "word",
|
||||
"position": 25
|
||||
},
|
||||
{
|
||||
"token": "到",
|
||||
"start_offset": 84,
|
||||
"end_offset": 85,
|
||||
"type": "word",
|
||||
"position": 26
|
||||
},
|
||||
{
|
||||
"token": "twitter",
|
||||
"start_offset": 86,
|
||||
"end_offset": 93,
|
||||
"type": "word",
|
||||
"position": 27
|
||||
},
|
||||
{
|
||||
"token": "消息",
|
||||
"start_offset": 94,
|
||||
"end_offset": 96,
|
||||
"type": "word",
|
||||
"position": 28
|
||||
},
|
||||
{
|
||||
"token": "流",
|
||||
"start_offset": 96,
|
||||
"end_offset": 97,
|
||||
"type": "word",
|
||||
"position": 29
|
||||
},
|
||||
{
|
||||
"token": "从",
|
||||
"start_offset": 98,
|
||||
"end_offset": 99,
|
||||
"type": "word",
|
||||
"position": 31
|
||||
},
|
||||
{
|
||||
"token": "apach",
|
||||
"start_offset": 100,
|
||||
"end_offset": 106,
|
||||
"type": "word",
|
||||
"position": 32
|
||||
},
|
||||
{
|
||||
"token": "日志",
|
||||
"start_offset": 107,
|
||||
"end_offset": 109,
|
||||
"type": "word",
|
||||
"position": 33
|
||||
},
|
||||
{
|
||||
"token": "到",
|
||||
"start_offset": 109,
|
||||
"end_offset": 110,
|
||||
"type": "word",
|
||||
"position": 34
|
||||
},
|
||||
{
|
||||
"token": "wordpress",
|
||||
"start_offset": 111,
|
||||
"end_offset": 120,
|
||||
"type": "word",
|
||||
"position": 35
|
||||
},
|
||||
{
|
||||
"token": "博",
|
||||
"start_offset": 121,
|
||||
"end_offset": 122,
|
||||
"type": "word",
|
||||
"position": 36
|
||||
},
|
||||
{
|
||||
"token": "文",
|
||||
"start_offset": 122,
|
||||
"end_offset": 123,
|
||||
"type": "word",
|
||||
"position": 37
|
||||
},
|
||||
{
|
||||
"token": "我们",
|
||||
"start_offset": 124,
|
||||
"end_offset": 126,
|
||||
"type": "word",
|
||||
"position": 39
|
||||
},
|
||||
{
|
||||
"token": "可以",
|
||||
"start_offset": 126,
|
||||
"end_offset": 128,
|
||||
"type": "word",
|
||||
"position": 40
|
||||
},
|
||||
{
|
||||
"token": "帮助",
|
||||
"start_offset": 128,
|
||||
"end_offset": 130,
|
||||
"type": "word",
|
||||
"position": 41
|
||||
},
|
||||
{
|
||||
"token": "人们",
|
||||
"start_offset": 130,
|
||||
"end_offset": 132,
|
||||
"type": "word",
|
||||
"position": 42
|
||||
},
|
||||
{
|
||||
"token": "体验",
|
||||
"start_offset": 132,
|
||||
"end_offset": 134,
|
||||
"type": "word",
|
||||
"position": 43
|
||||
},
|
||||
{
|
||||
"token": "搜索",
|
||||
"start_offset": 134,
|
||||
"end_offset": 136,
|
||||
"type": "word",
|
||||
"position": 44
|
||||
},
|
||||
{
|
||||
"token": "强大",
|
||||
"start_offset": 137,
|
||||
"end_offset": 139,
|
||||
"type": "word",
|
||||
"position": 46
|
||||
},
|
||||
{
|
||||
"token": "力量",
|
||||
"start_offset": 139,
|
||||
"end_offset": 141,
|
||||
"type": "word",
|
||||
"position": 47
|
||||
},
|
||||
{
|
||||
"token": "帮助",
|
||||
"start_offset": 142,
|
||||
"end_offset": 144,
|
||||
"type": "word",
|
||||
"position": 49
|
||||
},
|
||||
{
|
||||
"token": "他们",
|
||||
"start_offset": 144,
|
||||
"end_offset": 146,
|
||||
"type": "word",
|
||||
"position": 50
|
||||
},
|
||||
{
|
||||
"token": "以",
|
||||
"start_offset": 146,
|
||||
"end_offset": 147,
|
||||
"type": "word",
|
||||
"position": 51
|
||||
},
|
||||
{
|
||||
"token": "截然不同",
|
||||
"start_offset": 147,
|
||||
"end_offset": 151,
|
||||
"type": "word",
|
||||
"position": 52
|
||||
},
|
||||
{
|
||||
"token": "方式",
|
||||
"start_offset": 152,
|
||||
"end_offset": 154,
|
||||
"type": "word",
|
||||
"position": 54
|
||||
},
|
||||
{
|
||||
"token": "探索",
|
||||
"start_offset": 154,
|
||||
"end_offset": 156,
|
||||
"type": "word",
|
||||
"position": 55
|
||||
},
|
||||
{
|
||||
"token": "和",
|
||||
"start_offset": 156,
|
||||
"end_offset": 157,
|
||||
"type": "word",
|
||||
"position": 56
|
||||
},
|
||||
{
|
||||
"token": "分析",
|
||||
"start_offset": 157,
|
||||
"end_offset": 159,
|
||||
"type": "word",
|
||||
"position": 57
|
||||
},
|
||||
{
|
||||
"token": "数据",
|
||||
"start_offset": 159,
|
||||
"end_offset": 161,
|
||||
"type": "word",
|
||||
"position": 58
|
||||
}
|
||||
]
|
||||
}
|
||||
--------------------------------------------------
|
||||
// TESTRESPONSE
|
|
@ -0,0 +1,61 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
|
||||
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import static java.util.Collections.singletonMap;
|
||||
|
||||
public class SmartChineseStopTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
private static final Map<String, Set<?>> NAMED_STOP_WORDS = singletonMap("_smartcn_", SmartChineseAnalyzer.getDefaultStopSet());
|
||||
|
||||
private final CharArraySet stopWords;
|
||||
|
||||
private final boolean ignoreCase;
|
||||
|
||||
private final boolean removeTrailing;
|
||||
|
||||
public SmartChineseStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
this.ignoreCase = settings.getAsBoolean("ignore_case", false);
|
||||
this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
|
||||
this.stopWords = Analysis.parseWords(env, settings, "stopwords",
|
||||
SmartChineseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
if (removeTrailing) {
|
||||
return new StopFilter(tokenStream, stopWords);
|
||||
} else {
|
||||
return new SuggestStopFilter(tokenStream, stopWords);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.elasticsearch.index.analysis.AnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.SmartChineseAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.SmartChineseNoOpTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SmartChineseStopTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SmartChineseTokenizerTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
|
@ -38,15 +39,18 @@ import static java.util.Collections.singletonMap;
|
|||
public class AnalysisSmartChinesePlugin extends Plugin implements AnalysisPlugin {
|
||||
@Override
|
||||
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
|
||||
// This is a noop token filter; it's here for backwards compat before we had "smartcn_tokenizer"
|
||||
return singletonMap("smartcn_word", SmartChineseNoOpTokenFilterFactory::new);
|
||||
Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters = new HashMap<>();
|
||||
tokenFilters.put("smartcn_stop", SmartChineseStopTokenFilterFactory::new);
|
||||
// TODO: deprecate and remove, this is a noop token filter; it's here for backwards compat before we had "smartcn_tokenizer"
|
||||
tokenFilters.put("smartcn_word", SmartChineseNoOpTokenFilterFactory::new);
|
||||
return tokenFilters;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
|
||||
Map<String, AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
|
||||
extra.put("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory::new);
|
||||
// This is an alias to "smartcn_tokenizer"; it's here for backwards compat
|
||||
// TODO: deprecate and remove, this is an alias to "smartcn_tokenizer"; it's here for backwards compat
|
||||
extra.put("smartcn_sentence", SmartChineseTokenizerTokenizerFactory::new);
|
||||
return extra;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue