Deprecate smartcn_word

It looks like `WordTokenFilter` has been [deprecated in Lucene 4.8](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/WordTokenFilter.html), and judging from the javadoc, only [HMMChineseTokenizer](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.html) will be supported going forward.

We need to deprecate `smartcn_word` and `smartcn_sentence`, and add a new `smartcn_tokenizer` tokenizer that does both jobs.
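
For reference, here is a minimal standalone sketch (not part of this commit) of the Lucene 4.8 tokenizer the plugin now wraps, run against the same sample text as the plugin's tests; the class name `HMMChineseTokenizerDemo` is made up for illustration:

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class HMMChineseTokenizerDemo {
    public static void main(String[] args) throws IOException {
        // In Lucene 4.8 the tokenizer is still constructed with a Reader,
        // just like the new SmartChineseTokenizerTokenizerFactory in this commit does.
        Tokenizer tokenizer = new HMMChineseTokenizer(new StringReader("叻出色"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```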

 Closes #22.

(cherry picked from commit 64dcb9b)
David Pilato 2014-06-27 16:12:13 +02:00
parent 45dfe9abb6
commit d063fe6019
8 changed files with 71 additions and 18 deletions

README.md

@@ -20,7 +20,9 @@ Please read documentation relative to the version you are using:
* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-smartcn/blob/master/README.md)
The plugin includes the `smartcn` analyzer, `smartcn_sentence` tokenizer, and `smartcn_word` token filter.
The plugin includes the `smartcn` analyzer and `smartcn_tokenizer` tokenizer.
Note that `smartcn_word` token filter and `smartcn_sentence` have been deprecated.
License
-------

SmartChineseAnalysisBinderProcessor.java

@@ -30,11 +30,14 @@ public class SmartChineseAnalysisBinderProcessor extends AnalysisModule.Analysis
@Override
public void processTokenizers(TokenizersBindings tokenizersBindings) {
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseSentenceTokenizerFactory.class);
tokenizersBindings.processTokenizer("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory.class);
}
@Override
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseWordTokenFilterFactory.class);
}
}
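
With these bindings in place, the new name can be referenced from index analysis settings. Below is a hedged sketch (not part of this commit) of such settings built with the 1.x-era `ImmutableSettings` builder; the analyzer name `my_smartcn` is made up for illustration:

```java
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class SmartcnSettingsSketch {
    public static void main(String[] args) {
        // Index-level analysis settings referencing the tokenizer name bound above.
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.analyzer.my_smartcn.type", "custom")
                .put("index.analysis.analyzer.my_smartcn.tokenizer", "smartcn_tokenizer")
                .build();
        System.out.println(settings.getAsMap());
    }
}
```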

SmartChineseSentenceTokenizerFactory.java

@@ -30,7 +30,9 @@ import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
/**
* SentenceTokenizer has been deprecated in Lucene 4.8
*/
@Deprecated
public class SmartChineseSentenceTokenizerFactory extends AbstractTokenizerFactory {
@Inject

SmartChineseTokenizerTokenizerFactory.java

@@ -0,0 +1,43 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
public class SmartChineseTokenizerTokenizerFactory extends AbstractTokenizerFactory {
@Inject
public SmartChineseTokenizerTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
}
@Override
public Tokenizer create(Reader reader) {
return new HMMChineseTokenizer(reader);
}
}

SmartChineseWordTokenFilterFactory.java

@@ -28,7 +28,9 @@ import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
/**
* WordTokenFilter has been deprecated in Lucene 4.8
*/
@Deprecated
public class SmartChineseWordTokenFilterFactory extends AbstractTokenFilterFactory {
@Inject

SmartChineseIndicesAnalysis.java

@@ -21,6 +21,7 @@ package org.elasticsearch.indices.analysis.smartcn;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
@@ -47,6 +48,7 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
indicesAnalysisService.analyzerProviderFactories().put("smartcn", new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer(Lucene.ANALYZER_VERSION)));
// Register smartcn_word token filter
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override public String name() {
return "smartcn_word";
@@ -70,6 +72,18 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
}
}));
// Register smartcn_sentence tokenizer
indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
@Override
public String name() {
return "smartcn_tokenizer";
}
@Override
public Tokenizer create(Reader reader) {
return new HMMChineseTokenizer(reader);
}
}));
}
}
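
As a rough illustration (not from this commit), the pre-built `smartcn` analyzer registered above can also be exercised directly through the Lucene 4.8 API; the class and field names in this sketch are made up:

```java
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SmartcnAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_48);
        // "content" is an arbitrary field name; it does not affect tokenization here.
        TokenStream ts = analyzer.tokenStream("content", "叻出色");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
    }
}
```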

SimpleSmartChineseAnalysisTests.java

@@ -53,10 +53,7 @@ public class SimpleSmartChineseAnalysisTests extends ElasticsearchTestCase {
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_sentence");
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseSentenceTokenizerFactory.class));
TokenFilterFactory filterFactory = analysisService.tokenFilter("smartcn_word");
MatcherAssert.assertThat(filterFactory, instanceOf(SmartChineseWordTokenFilterFactory.class));
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_tokenizer");
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseTokenizerTokenizerFactory.class));
}
}

SimpleSmartChineseIntegrationTests.java

@@ -54,20 +54,10 @@ public class SimpleSmartChineseIntegrationTests extends ElasticsearchIntegration
@Test
public void testSmartcnTokenizer() throws ExecutionException, InterruptedException {
AnalyzeResponse response = client().admin().indices()
.prepareAnalyze("叻出色").setTokenizer("smartcn_sentence")
.prepareAnalyze("叻出色").setTokenizer("smartcn_tokenizer")
.execute().get();
assertThat(response, notNullValue());
assertThat(response.getTokens().size(), is(1));
}
@Test
public void testSmartcnTokenFilter() throws ExecutionException, InterruptedException {
AnalyzeResponse response = client().admin().indices()
.prepareAnalyze("叻出色").setTokenFilters("smartcn_word")
.execute().get();
assertThat(response, notNullValue());
assertThat(response.getTokens().size(), is(3));
assertThat(response.getTokens().size(), is(2));
}
}