Deprecate smartcn_word

It looks like `WordTokenFilter` has been [deprecated in Lucene 4.8](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/WordTokenFilter.html), and judging from the javadoc, only [HMMChineseTokenizer](http://lucene.apache.org/core/4_8_0/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizer.html) will be supported going forward.

We need to deprecate `smartcn_word` and `smartcn_sentence`, and add a new `smartcn_tokenizer` tokenizer that does both jobs.
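
For reference, here is a minimal standalone sketch (not part of this commit) of the Lucene 4.8 tokenizer the plugin now wraps, run against the same sample text as the plugin's tests; the class name `HMMChineseTokenizerDemo` is made up for illustration:

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class HMMChineseTokenizerDemo {
    public static void main(String[] args) throws IOException {
        // In Lucene 4.8 the tokenizer is still constructed with a Reader,
        // just like the new SmartChineseTokenizerTokenizerFactory in this commit does.
        Tokenizer tokenizer = new HMMChineseTokenizer(new StringReader("叻出色"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```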

 Closes #22.

(cherry picked from commit 64dcb9b)
David Pilato 2014-06-27 16:12:13 +02:00
parent 45dfe9abb6
commit d063fe6019
8 changed files with 71 additions and 18 deletions

README.md

@@ -20,7 +20,9 @@ Please read documentation relative to the version you are using:
* [3.0.0-SNAPSHOT](https://github.com/elasticsearch/elasticsearch-analysis-smartcn/blob/master/README.md)
The plugin includes the `smartcn` analyzer, `smartcn_sentence` tokenizer, and `smartcn_word` token filter.
The plugin includes the `smartcn` analyzer and `smartcn_tokenizer` tokenizer.
Note that `smartcn_word` token filter and `smartcn_sentence` have been deprecated.
License
-------

SmartChineseAnalysisBinderProcessor.java

@@ -30,11 +30,14 @@ public class SmartChineseAnalysisBinderProcessor extends AnalysisModule.Analysis
@Override
public void processTokenizers(TokenizersBindings tokenizersBindings) {
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseSentenceTokenizerFactory.class);
tokenizersBindings.processTokenizer("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory.class);
}
@Override
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseWordTokenFilterFactory.class);
}
}
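
With these bindings in place, the new name can be referenced from index analysis settings. Below is a hedged sketch (not part of this commit) of such settings built with the 1.x-era `ImmutableSettings` builder; the analyzer name `my_smartcn` is made up for illustration:

```java
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class SmartcnSettingsSketch {
    public static void main(String[] args) {
        // Index-level analysis settings referencing the tokenizer name bound above.
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.analyzer.my_smartcn.type", "custom")
                .put("index.analysis.analyzer.my_smartcn.tokenizer", "smartcn_tokenizer")
                .build();
        System.out.println(settings.getAsMap());
    }
}
```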

SmartChineseSentenceTokenizerFactory.java

@@ -30,7 +30,9 @@ import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
/**
* SentenceTokenizer has been deprecated in Lucene 4.8
*/
@Deprecated
public class SmartChineseSentenceTokenizerFactory extends AbstractTokenizerFactory {
@Inject

SmartChineseTokenizerTokenizerFactory.java

@@ -0,0 +1,43 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
public class SmartChineseTokenizerTokenizerFactory extends AbstractTokenizerFactory {
@Inject
public SmartChineseTokenizerTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
}
@Override
public Tokenizer create(Reader reader) {
return new HMMChineseTokenizer(reader);
}
}

SmartChineseWordTokenFilterFactory.java

@@ -28,7 +28,9 @@ import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
/**
* WordTokenFilter has been deprecated in Lucene 4.8
*/
@Deprecated
public class SmartChineseWordTokenFilterFactory extends AbstractTokenFilterFactory {
@Inject

SmartChineseIndicesAnalysis.java

@@ -21,6 +21,7 @@ package org.elasticsearch.indices.analysis.smartcn;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
@@ -47,6 +48,7 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
indicesAnalysisService.analyzerProviderFactories().put("smartcn", new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer(Lucene.ANALYZER_VERSION)));
// Register smartcn_word token filter
// TODO Remove it in 2.3.0 (was deprecated: see https://github.com/elasticsearch/elasticsearch-analysis-smartcn/issues/22)
indicesAnalysisService.tokenFilterFactories().put("smartcn_word", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override public String name() {
return "smartcn_word";
@@ -70,6 +72,18 @@ public class SmartChineseIndicesAnalysis extends AbstractComponent {
}
}));
// Register smartcn_sentence tokenizer
indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
@Override
public String name() {
return "smartcn_tokenizer";
}
@Override
public Tokenizer create(Reader reader) {
return new HMMChineseTokenizer(reader);
}
}));
}
}
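
As a rough illustration (not from this commit), the pre-built `smartcn` analyzer registered above can also be exercised directly through the Lucene 4.8 API; the class and field names in this sketch are made up:

```java
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SmartcnAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_48);
        // "content" is an arbitrary field name; it does not affect tokenization here.
        TokenStream ts = analyzer.tokenStream("content", "叻出色");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
    }
}
```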

SimpleSmartChineseAnalysisTests.java

@@ -53,10 +53,7 @@ public class SimpleSmartChineseAnalysisTests extends ElasticsearchTestCase {
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_sentence");
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseSentenceTokenizerFactory.class));
TokenFilterFactory filterFactory = analysisService.tokenFilter("smartcn_word");
MatcherAssert.assertThat(filterFactory, instanceOf(SmartChineseWordTokenFilterFactory.class));
TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_tokenizer");
MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseTokenizerTokenizerFactory.class));
}
}

SimpleSmartChineseIntegrationTests.java

@@ -54,20 +54,10 @@ public class SimpleSmartChineseIntegrationTests extends ElasticsearchIntegration
@Test
public void testSmartcnTokenizer() throws ExecutionException, InterruptedException {
AnalyzeResponse response = client().admin().indices()
.prepareAnalyze("叻出色").setTokenizer("smartcn_sentence")
.prepareAnalyze("叻出色").setTokenizer("smartcn_tokenizer")
.execute().get();
assertThat(response, notNullValue());
assertThat(response.getTokens().size(), is(1));
}
@Test
public void testSmartcnTokenFilter() throws ExecutionException, InterruptedException {
AnalyzeResponse response = client().admin().indices()
.prepareAnalyze("叻出色").setTokenFilters("smartcn_word")
.execute().get();
assertThat(response, notNullValue());
assertThat(response.getTokens().size(), is(3));
assertThat(response.getTokens().size(), is(2));
}
}