Improve documentation for smart_cn analyzer (#42822)

Mayya Sharipova 2019-06-10 06:26:56 -04:00
parent 9def454ea9
commit 81a3b6e2fe
3 changed files with 478 additions and 5 deletions


@@ -17,7 +17,415 @@ include::install_remove.asciidoc[]
[float]
==== `smartcn` tokenizer and token filter
The plugin provides the `smartcn` analyzer, `smartcn_tokenizer` tokenizer, and
`smartcn_stop` token filter, which are not configurable.

NOTE: The `smartcn_word` token filter and `smartcn_sentence` tokenizer have been deprecated.

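To verify that the plugin is installed, the analyzer can be exercised directly
through the `_analyze` API. The request below is a minimal illustration (the
sample text is arbitrary):

[source,js]
----------------------------------------------------
GET _analyze
{
  "analyzer": "smartcn",
  "text": "今天天气很好"
}
----------------------------------------------------
// CONSOLE
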
==== Reimplementing and extending the analyzers

The `smartcn` analyzer could be reimplemented as a `custom` analyzer that can
then be extended and configured as follows:

[source,js]
----------------------------------------------------
PUT smartcn_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_smartcn": {
          "tokenizer": "smartcn_tokenizer",
          "filter": [
            "porter_stem",
            "smartcn_stop"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
// CONSOLE
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: smartcn_example, first: smartcn, second: rebuilt_smartcn}\nendyaml\n/]
[[analysis-smartcn_stop]]
==== `smartcn_stop` token filter

The `smartcn_stop` token filter filters out stopwords defined by the `smartcn`
analyzer (`_smartcn_`), as well as any other custom stopwords specified by the
user. This filter only supports the predefined `_smartcn_` stopwords list. If
you want to use a different predefined list, use the
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
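For example, to keep the `smartcn_tokenizer` but apply the predefined
`_english_` stopwords list, you could configure the generic `stop` filter along
these lines (an illustrative sketch; the index and filter names here are made
up for the example):

[source,js]
--------------------------------------------------
PUT smartcn_english_stop_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "smartcn_with_english_stop": {
          "tokenizer": "smartcn_tokenizer",
          "filter": [
            "my_english_stop"
          ]
        }
      },
      "filter": {
        "my_english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

The following example demonstrates the `smartcn_stop` filter itself, extended
with custom stopwords on top of the `_smartcn_` list:
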
[source,js]
--------------------------------------------------
PUT smartcn_example
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "smartcn_with_stop": {
            "tokenizer": "smartcn_tokenizer",
            "filter": [
              "porter_stem",
              "my_smartcn_stop"
            ]
          }
        },
        "filter": {
          "my_smartcn_stop": {
            "type": "smartcn_stop",
            "stopwords": [
              "_smartcn_",
              "stack",
              "的"
            ]
          }
        }
      }
    }
  }
}

GET smartcn_example/_analyze
{
  "analyzer": "smartcn_with_stop",
  "text": "哈喽,我们是 Elastic 我们是 Elastic Stack(Elasticsearch、Kibana、Beats 和 Logstash)的开发公司。从股票行情到 Twitter 消息流,从 Apache 日志到 WordPress 博文,我们可以帮助人们体验搜索的强大力量,帮助他们以截然不同的方式探索和分析数据"
}
--------------------------------------------------
// CONSOLE
The above request returns:
[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "哈",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "喽",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "我们",
      "start_offset": 3,
      "end_offset": 5,
      "type": "word",
      "position": 3
    },
    {
      "token": "是",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 4
    },
    {
      "token": "elast",
      "start_offset": 7,
      "end_offset": 14,
      "type": "word",
      "position": 5
    },
    {
      "token": "我们",
      "start_offset": 17,
      "end_offset": 19,
      "type": "word",
      "position": 6
    },
    {
      "token": "是",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 7
    },
    {
      "token": "elast",
      "start_offset": 21,
      "end_offset": 28,
      "type": "word",
      "position": 8
    },
    {
      "token": "elasticsearch",
      "start_offset": 35,
      "end_offset": 48,
      "type": "word",
      "position": 11
    },
    {
      "token": "kibana",
      "start_offset": 49,
      "end_offset": 55,
      "type": "word",
      "position": 13
    },
    {
      "token": "beat",
      "start_offset": 56,
      "end_offset": 61,
      "type": "word",
      "position": 15
    },
    {
      "token": "和",
      "start_offset": 62,
      "end_offset": 63,
      "type": "word",
      "position": 16
    },
    {
      "token": "logstash",
      "start_offset": 64,
      "end_offset": 72,
      "type": "word",
      "position": 17
    },
    {
      "token": "开发",
      "start_offset": 74,
      "end_offset": 76,
      "type": "word",
      "position": 20
    },
    {
      "token": "公司",
      "start_offset": 76,
      "end_offset": 78,
      "type": "word",
      "position": 21
    },
    {
      "token": "从",
      "start_offset": 79,
      "end_offset": 80,
      "type": "word",
      "position": 23
    },
    {
      "token": "股票",
      "start_offset": 80,
      "end_offset": 82,
      "type": "word",
      "position": 24
    },
    {
      "token": "行情",
      "start_offset": 82,
      "end_offset": 84,
      "type": "word",
      "position": 25
    },
    {
      "token": "到",
      "start_offset": 84,
      "end_offset": 85,
      "type": "word",
      "position": 26
    },
    {
      "token": "twitter",
      "start_offset": 86,
      "end_offset": 93,
      "type": "word",
      "position": 27
    },
    {
      "token": "消息",
      "start_offset": 94,
      "end_offset": 96,
      "type": "word",
      "position": 28
    },
    {
      "token": "流",
      "start_offset": 96,
      "end_offset": 97,
      "type": "word",
      "position": 29
    },
    {
      "token": "从",
      "start_offset": 98,
      "end_offset": 99,
      "type": "word",
      "position": 31
    },
    {
      "token": "apach",
      "start_offset": 100,
      "end_offset": 106,
      "type": "word",
      "position": 32
    },
    {
      "token": "日志",
      "start_offset": 107,
      "end_offset": 109,
      "type": "word",
      "position": 33
    },
    {
      "token": "到",
      "start_offset": 109,
      "end_offset": 110,
      "type": "word",
      "position": 34
    },
    {
      "token": "wordpress",
      "start_offset": 111,
      "end_offset": 120,
      "type": "word",
      "position": 35
    },
    {
      "token": "博",
      "start_offset": 121,
      "end_offset": 122,
      "type": "word",
      "position": 36
    },
    {
      "token": "文",
      "start_offset": 122,
      "end_offset": 123,
      "type": "word",
      "position": 37
    },
    {
      "token": "我们",
      "start_offset": 124,
      "end_offset": 126,
      "type": "word",
      "position": 39
    },
    {
      "token": "可以",
      "start_offset": 126,
      "end_offset": 128,
      "type": "word",
      "position": 40
    },
    {
      "token": "帮助",
      "start_offset": 128,
      "end_offset": 130,
      "type": "word",
      "position": 41
    },
    {
      "token": "人们",
      "start_offset": 130,
      "end_offset": 132,
      "type": "word",
      "position": 42
    },
    {
      "token": "体验",
      "start_offset": 132,
      "end_offset": 134,
      "type": "word",
      "position": 43
    },
    {
      "token": "搜索",
      "start_offset": 134,
      "end_offset": 136,
      "type": "word",
      "position": 44
    },
    {
      "token": "强大",
      "start_offset": 137,
      "end_offset": 139,
      "type": "word",
      "position": 46
    },
    {
      "token": "力量",
      "start_offset": 139,
      "end_offset": 141,
      "type": "word",
      "position": 47
    },
    {
      "token": "帮助",
      "start_offset": 142,
      "end_offset": 144,
      "type": "word",
      "position": 49
    },
    {
      "token": "他们",
      "start_offset": 144,
      "end_offset": 146,
      "type": "word",
      "position": 50
    },
    {
      "token": "以",
      "start_offset": 146,
      "end_offset": 147,
      "type": "word",
      "position": 51
    },
    {
      "token": "截然不同",
      "start_offset": 147,
      "end_offset": 151,
      "type": "word",
      "position": 52
    },
    {
      "token": "方式",
      "start_offset": 152,
      "end_offset": 154,
      "type": "word",
      "position": 54
    },
    {
      "token": "探索",
      "start_offset": 154,
      "end_offset": 156,
      "type": "word",
      "position": 55
    },
    {
      "token": "和",
      "start_offset": 156,
      "end_offset": 157,
      "type": "word",
      "position": 56
    },
    {
      "token": "分析",
      "start_offset": 157,
      "end_offset": 159,
      "type": "word",
      "position": 57
    },
    {
      "token": "数据",
      "start_offset": 159,
      "end_offset": 161,
      "type": "word",
      "position": 58
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE
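
Note that the custom stopwords take effect: neither `stack` nor `的` appears in
the token stream, while terms that are not stopwords, such as `是` and `和`, are
kept.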


@@ -0,0 +1,61 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.util.Map;
import java.util.Set;

import static java.util.Collections.singletonMap;

public class SmartChineseStopTokenFilterFactory extends AbstractTokenFilterFactory {
    // The only named stopwords list this filter supports: Lucene's default
    // stop set for the SmartChineseAnalyzer, referenced as "_smartcn_".
    private static final Map<String, Set<?>> NAMED_STOP_WORDS = singletonMap("_smartcn_", SmartChineseAnalyzer.getDefaultStopSet());

    private final CharArraySet stopWords;
    private final boolean ignoreCase;
    private final boolean removeTrailing;

    public SmartChineseStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
        // Resolve the "stopwords" setting: the named "_smartcn_" list, a
        // user-supplied word list, or the default smartcn stop set if unset.
        this.stopWords = Analysis.parseWords(env, settings, "stopwords",
            SmartChineseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (removeTrailing) {
            return new StopFilter(tokenStream, stopWords);
        } else {
            // SuggestStopFilter keeps a trailing stopword when no token
            // separator follows it, which is useful for suggesters.
            return new SuggestStopFilter(tokenStream, stopWords);
        }
    }
}


@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.SmartChineseAnalyzerProvider;
import org.elasticsearch.index.analysis.SmartChineseNoOpTokenFilterFactory;
import org.elasticsearch.index.analysis.SmartChineseStopTokenFilterFactory;
import org.elasticsearch.index.analysis.SmartChineseTokenizerTokenizerFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -38,15 +39,18 @@ import static java.util.Collections.singletonMap;
public class AnalysisSmartChinesePlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters = new HashMap<>();
        tokenFilters.put("smartcn_stop", SmartChineseStopTokenFilterFactory::new);
        // TODO: deprecate and remove, this is a noop token filter; it's here for backwards compat before we had "smartcn_tokenizer"
        tokenFilters.put("smartcn_word", SmartChineseNoOpTokenFilterFactory::new);
        return tokenFilters;
    }

    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
        extra.put("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory::new);
        // TODO: deprecate and remove, this is an alias to "smartcn_tokenizer"; it's here for backwards compat
        extra.put("smartcn_sentence", SmartChineseTokenizerTokenizerFactory::new);
        return extra;
    }