Deprecate smartcn_word
Looks like `WordTokenFilter` has been [deprecated in Lucene 4.8](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/WordTokenFilter.html) and, judging by the javadoc, only the [HMMChineseTokenizer](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.html) will be supported going forward. We need to deprecate `smartcn_word` and `smartcn_sentence`. We add `smartcn_tokenizer`, which does both things. Closes #22. (cherry picked from commit 64dcb9b)
This commit is contained in:
parent
45dfe9abb6
commit
d063fe6019
|
@ -20,7 +20,9 @@ Please read documentation relative to the version you are using:
|
|||
|
||||
* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-smartcn/blob/master/README.md)
|
||||
|
||||
The plugin includes the `smartcn` analyzer, `smartcn_sentence` tokenizer, and `smartcn_word` token filter.
|
||||
The plugin includes the `smartcn` analyzer and `smartcn_tokenizer` tokenizer.
|
||||
|
||||
Note that the `smartcn_word` token filter and the `smartcn_sentence` tokenizer have been deprecated.
|
||||
|
||||
License
|
||||
-------
|
||||
|
|
|
@ -30,11 +30,14 @@ public class SmartChineseAnalysisBinderProcessor extends AnalysisModule.Analysis
|
|||
|
||||
/**
 * Registers this plugin's tokenizers with the analysis module.
 * <p>
 * Registers both the deprecated {@code smartcn_sentence} tokenizer (kept for
 * backward compatibility) and its replacement {@code smartcn_tokenizer},
 * whose factory creates Lucene's {@code HMMChineseTokenizer}.
 *
 * @param tokenizersBindings bindings to register the tokenizer factories against
 */
@Override
public void processTokenizers(TokenizersBindings tokenizersBindings) {
    // Deprecated alias, kept so existing mappings keep working.
    // TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
    tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseSentenceTokenizerFactory.class);
    // Replacement tokenizer backed by Lucene's HMMChineseTokenizer.
    tokenizersBindings.processTokenizer("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory.class);
}
|
||||
|
||||
/**
 * Registers this plugin's token filters with the analysis module.
 * <p>
 * Only the deprecated {@code smartcn_word} filter is registered, and only for
 * backward compatibility; no replacement filter exists — use the
 * {@code smartcn_tokenizer} tokenizer instead.
 *
 * @param tokenFiltersBindings bindings to register the token filter factories against
 */
@Override
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
    // TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
    tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseWordTokenFilterFactory.class);
}
|
||||
}
|
||||
|
|
|
@ -30,7 +30,9 @@ import org.elasticsearch.index.settings.IndexSettings;
|
|||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* SentenceTokenizer has been deprecated in Lucene 4.8
|
||||
*/
|
||||
@Deprecated
|
||||
public class SmartChineseSentenceTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
@Inject
|
||||
|
|
|
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.io.Reader;

/**
 * Tokenizer factory for the {@code smartcn_tokenizer} tokenizer, which wraps
 * Lucene's {@link HMMChineseTokenizer}. This is the replacement for the
 * deprecated {@code smartcn_sentence} tokenizer / {@code smartcn_word}
 * token filter combination.
 */
public class SmartChineseTokenizerTokenizerFactory extends AbstractTokenizerFactory {

    /**
     * Guice/assisted-inject constructor; all arguments are forwarded to
     * {@link AbstractTokenizerFactory}.
     *
     * @param index         the index this factory is created for
     * @param indexSettings the settings of that index
     * @param name          the configured name of the tokenizer
     * @param settings      the tokenizer-specific settings (none are used here)
     */
    @Inject
    public SmartChineseTokenizerTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
    }

    /**
     * Creates a new {@link HMMChineseTokenizer} reading from {@code reader}.
     *
     * @param reader the character stream to tokenize
     * @return a freshly created tokenizer instance
     */
    @Override
    public Tokenizer create(Reader reader) {
        return new HMMChineseTokenizer(reader);
    }
}
|
|
@ -28,7 +28,9 @@ import org.elasticsearch.index.Index;
|
|||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
/**
|
||||
* WordTokenFilter has been deprecated in Lucene 4.8
|
||||
*/
|
||||
@Deprecated
|
||||
public class SmartChineseWordTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
@Inject
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.indices.analysis.smartcn;
|
|||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
|
||||
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
|
||||
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
|
||||
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
|
||||
|
@ -47,6 +48,7 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
|
|||
indicesAnalysisService.analyzerProviderFactories().put("smartcn", new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer(Lucene.ANALYZER_VERSION)));
|
||||
|
||||
// Register smartcn_word token filter
|
||||
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
|
||||
indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
||||
@Override public String name() {
|
||||
return "smartcn_word";
|
||||
|
@ -70,6 +72,18 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
|
|||
}
|
||||
}));
|
||||
|
||||
// Register smartcn_sentence tokenizer
|
||||
indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return "smartcn_tokenizer";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader reader) {
|
||||
return new HMMChineseTokenizer(reader);
|
||||
}
|
||||
}));
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,10 +53,7 @@ public class SimpleSmartChineseAnalysisTests extends ElasticsearchTestCase {
|
|||
|
||||
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
|
||||
|
||||
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_sentence");
|
||||
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseSentenceTokenizerFactory.class));
|
||||
|
||||
TokenFilterFactory filterFactory = analysisService.tokenFilter("smartcn_word");
|
||||
MatcherAssert.assertThat(filterFactory, instanceOf(SmartChineseWordTokenFilterFactory.class));
|
||||
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_tokenizer");
|
||||
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseTokenizerTokenizerFactory.class));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -54,20 +54,10 @@ public class SimpleSmartChineseIntegrationTests extends ElasticsearchIntegration
|
|||
@Test
|
||||
public void testSmartcnTokenizer() throws ExecutionException, InterruptedException {
|
||||
AnalyzeResponse response = client().admin().indices()
|
||||
.prepareAnalyze("叻出色").setTokenizer("smartcn_sentence")
|
||||
.prepareAnalyze("叻出色").setTokenizer("smartcn_tokenizer")
|
||||
.execute().get();
|
||||
|
||||
assertThat(response, notNullValue());
|
||||
assertThat(response.getTokens().size(), is(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSmartcnTokenFilter() throws ExecutionException, InterruptedException {
|
||||
AnalyzeResponse response = client().admin().indices()
|
||||
.prepareAnalyze("叻出色").setTokenFilters("smartcn_word")
|
||||
.execute().get();
|
||||
|
||||
assertThat(response, notNullValue());
|
||||
assertThat(response.getTokens().size(), is(3));
|
||||
assertThat(response.getTokens().size(), is(2));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue