Deprecate smartcn_word
Looks like `WordTokenFilter` has been [deprecated in Lucene 4.8](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/WordTokenFilter.html), and the javadoc indicates that only the [HMMChineseTokenizer](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.html) will be supported. We need to deprecate `smartcn_word` and `smartcn_sentence`. We add `smartcn_tokenizer`, which does both things. Closes #22. (cherry picked from commit 64dcb9b)
This commit is contained in:
parent
45dfe9abb6
commit
d063fe6019
|
@ -20,7 +20,9 @@ Please read documentation relative to the version you are using:
|
||||||
|
|
||||||
* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-smartcn/blob/master/README.md)
|
* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-smartcn/blob/master/README.md)
|
||||||
|
|
||||||
The plugin includes the `smartcn` analyzer, `smartcn_sentence` tokenizer, and `smartcn_word` token filter.
|
The plugin includes the `smartcn` analyzer and `smartcn_tokenizer` tokenizer.
|
||||||
|
|
||||||
|
Note that the `smartcn_word` token filter and the `smartcn_sentence` tokenizer have been deprecated.
|
||||||
|
|
||||||
License
|
License
|
||||||
-------
|
-------
|
||||||
|
|
|
@ -30,11 +30,14 @@ public class SmartChineseAnalysisBinderProcessor extends AnalysisModule.Analysis
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void processTokenizers(TokenizersBindings tokenizersBindings) {
|
public void processTokenizers(TokenizersBindings tokenizersBindings) {
|
||||||
|
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
|
||||||
tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseSentenceTokenizerFactory.class);
|
tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseSentenceTokenizerFactory.class);
|
||||||
|
tokenizersBindings.processTokenizer("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
|
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
|
||||||
|
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
|
||||||
tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseWordTokenFilterFactory.class);
|
tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseWordTokenFilterFactory.class);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,7 +30,9 @@ import org.elasticsearch.index.settings.IndexSettings;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* SentenceTokenizer has been deprecated in Lucene 4.8
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class SmartChineseSentenceTokenizerFactory extends AbstractTokenizerFactory {
|
public class SmartChineseSentenceTokenizerFactory extends AbstractTokenizerFactory {
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
|
||||||
|
import org.elasticsearch.common.inject.Inject;
|
||||||
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
public class SmartChineseTokenizerTokenizerFactory extends AbstractTokenizerFactory {
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public SmartChineseTokenizerTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||||
|
super(index, indexSettings, name, settings);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Tokenizer create(Reader reader) {
|
||||||
|
return new HMMChineseTokenizer(reader);
|
||||||
|
}
|
||||||
|
}
|
|
@ -28,7 +28,9 @@ import org.elasticsearch.index.Index;
|
||||||
import org.elasticsearch.index.settings.IndexSettings;
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* WordTokenFilter has been deprecated in Lucene 4.8
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class SmartChineseWordTokenFilterFactory extends AbstractTokenFilterFactory {
|
public class SmartChineseWordTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.indices.analysis.smartcn;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
|
||||||
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
|
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
|
||||||
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
|
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
|
||||||
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
|
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
|
||||||
|
@ -47,6 +48,7 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
|
||||||
indicesAnalysisService.analyzerProviderFactories().put("smartcn", new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer(Lucene.ANALYZER_VERSION)));
|
indicesAnalysisService.analyzerProviderFactories().put("smartcn", new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer(Lucene.ANALYZER_VERSION)));
|
||||||
|
|
||||||
// Register smartcn_word token filter
|
// Register smartcn_word token filter
|
||||||
|
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
|
||||||
indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
||||||
@Override public String name() {
|
@Override public String name() {
|
||||||
return "smartcn_word";
|
return "smartcn_word";
|
||||||
|
@ -70,6 +72,18 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
// Register smartcn_tokenizer tokenizer
|
||||||
|
indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return "smartcn_tokenizer";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Tokenizer create(Reader reader) {
|
||||||
|
return new HMMChineseTokenizer(reader);
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,10 +53,7 @@ public class SimpleSmartChineseAnalysisTests extends ElasticsearchTestCase {
|
||||||
|
|
||||||
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
|
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
|
||||||
|
|
||||||
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_sentence");
|
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_tokenizer");
|
||||||
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseSentenceTokenizerFactory.class));
|
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseTokenizerTokenizerFactory.class));
|
||||||
|
|
||||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("smartcn_word");
|
|
||||||
MatcherAssert.assertThat(filterFactory, instanceOf(SmartChineseWordTokenFilterFactory.class));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,20 +54,10 @@ public class SimpleSmartChineseIntegrationTests extends ElasticsearchIntegration
|
||||||
@Test
|
@Test
|
||||||
public void testSmartcnTokenizer() throws ExecutionException, InterruptedException {
|
public void testSmartcnTokenizer() throws ExecutionException, InterruptedException {
|
||||||
AnalyzeResponse response = client().admin().indices()
|
AnalyzeResponse response = client().admin().indices()
|
||||||
.prepareAnalyze("叻出色").setTokenizer("smartcn_sentence")
|
.prepareAnalyze("叻出色").setTokenizer("smartcn_tokenizer")
|
||||||
.execute().get();
|
.execute().get();
|
||||||
|
|
||||||
assertThat(response, notNullValue());
|
assertThat(response, notNullValue());
|
||||||
assertThat(response.getTokens().size(), is(1));
|
assertThat(response.getTokens().size(), is(2));
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testSmartcnTokenFilter() throws ExecutionException, InterruptedException {
|
|
||||||
AnalyzeResponse response = client().admin().indices()
|
|
||||||
.prepareAnalyze("叻出色").setTokenFilters("smartcn_word")
|
|
||||||
.execute().get();
|
|
||||||
|
|
||||||
assertThat(response, notNullValue());
|
|
||||||
assertThat(response.getTokens().size(), is(3));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue