add ICUNormalizer2CharFilter

ICUNormalizer2CharFilter is included in Lucene 4.8.0, so add a CharFilterFactory for it. The char_filter is registered as "icu_normalizer", which is the same name the existing token_filter uses. Closes #27. (cherry picked from commit 0cbf1b3)
2014-05-28 15:46:59 +02:00 · 2014-05-28 15:46:59 +02:00 · f1eae455fb
parent f068ef88a4
commit f1eae455fb
5 changed files with 205 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -184,6 +184,31 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://
 ```
 ICU Normalization CharFilter
 -----------------
 Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization).
 It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
 The `name` parameter selects the normalization to apply and accepts `nfc`, `nfkc`, or `nfkc_cf` (the default).
 The `mode` parameter accepts `compose` (the default) or `decompose`.
 Use `decompose` with `nfc` or `nfkc` to get `nfd` or `nfkd`, respectively.
 Here are some sample settings:
 ```js
 {
    "index" : {
        "analysis" : {
            "analyzer" : {
                "collation" : {
                    "tokenizer" : "keyword",
                    "char_filter" : ["icu_normalizer"]
                }
            }
        }
    }
 }
 ```
 License
 -------
--- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java
@ -23,6 +23,11 @@ package org.elasticsearch.index.analysis;
 */
 public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
    /**
     * Registers the ICU normalizer char filter factory with the given bindings.
     */
    @Override
    public void processCharFilters(CharFiltersBindings charFiltersBindings) {
        // type name under which the factory class is registered
        final String filterType = "icu_normalizer";
        charFiltersBindings.processCharFilter(filterType, IcuNormalizerCharFilterFactory.class);
    }
    @Override
    public void processTokenizers(TokenizersBindings tokenizersBindings) {
        tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);
--- a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java
@ -0,0 +1,63 @@
 /*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import com.ibm.icu.text.Normalizer2;
 import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
 import java.io.Reader;
 /**
 * Char filter factory that wraps readers in an
 * {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize characters.
 * <p/>
 * <p>The <tt>name</tt> setting provides the type of normalization to perform. Default is <tt>nfkc_cf</tt>.</p>
 * <p>The <tt>mode</tt> setting can be 'compose' or 'decompose'. A missing or unrecognized value
 * is treated as compose.</p>
 */
 public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {
    /** Normalizer built once from the filter settings and handed to every filter this factory creates. */
    private final Normalizer2 normalizer;
    /**
     * Builds the normalizer from the filter's settings.
     *
     * @param index         the index this factory belongs to
     * @param indexSettings the index-level settings
     * @param name          the name the char filter is registered under
     * @param settings      the filter's own settings; <tt>name</tt> and <tt>mode</tt> are read from it
     */
    @Inject
    public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        // The "name" setting is the normalization method, not the registered filter name passed to super,
        // so it is kept in a distinctly named local instead of a field called "name".
        String method = settings.get("name", "nfkc_cf");
        // Only an explicit "decompose" selects DECOMPOSE; "compose", a missing value, or anything else composes.
        Normalizer2.Mode mode = "decompose".equals(settings.get("mode"))
            ? Normalizer2.Mode.DECOMPOSE
            : Normalizer2.Mode.COMPOSE;
        // Passing null as the data stream asks ICU to load its own data for the named method.
        this.normalizer = Normalizer2.getInstance(null, method, mode);
    }
    /**
     * Wraps the given reader so that its characters are normalized as they are read.
     *
     * @param reader the reader to wrap
     * @return a new {@link ICUNormalizer2CharFilter} over <tt>reader</tt> using the configured normalizer
     */
    @Override
    public Reader create(Reader reader) {
        return new ICUNormalizer2CharFilter(reader, normalizer);
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
@ -65,5 +65,8 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase {
        filterFactory = analysisService.tokenFilter("icu_transform");
        assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
        CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer");
        assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java
@ -0,0 +1,109 @@
 /*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import com.ibm.icu.text.Normalizer2;
 import org.apache.lucene.analysis.CharFilter;
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.inject.ModulesBuilder;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.EnvironmentModule;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNameModule;
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 import org.elasticsearch.test.ElasticsearchTestCase;
 import org.junit.Test;
 import java.io.StringReader;
 /**
 * Tests that the <tt>icu_normalizer</tt> char filter produces the same output as applying the
 * equivalent {@link Normalizer2} directly, both incrementally (via offset correction) and in total.
 */
 public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase {
    /** Mixed-script sample text shared by all tests. */
    private static final String INPUT = "ʰ㌰゙5℃№㈱㌘，バッファーの正規化のテスト．㋐㋑㋒㋓㋔ｶｷｸｹｺｻﾞｼﾞｽﾞｾﾞｿﾞg̈각/각நிเกषिchkʷक्षि";
    /** With only the type configured, the filter must behave like nfkc_cf in compose mode. */
    @Test
    public void testDefaultSetting() throws Exception {
        Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
            .build();
        assertFilterMatchesNormalizer(settings, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
    }
    /** Explicit <tt>name</tt> and <tt>mode</tt> settings must select the matching normalizer. */
    @Test
    public void testNameAndModeSetting() throws Exception {
        Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
            .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
            .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
            .build();
        assertFilterMatchesNormalizer(settings, Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE));
    }
    /**
     * Reads {@link #INPUT} through the <tt>myNormalizerChar</tt> char filter built from <tt>settings</tt>
     * in small chunks and checks the output against <tt>normalizer</tt>.
     *
     * @param settings   index settings defining the <tt>myNormalizerChar</tt> char filter
     * @param normalizer the reference normalizer the filter output is compared with
     */
    private void assertFilterMatchesNormalizer(Settings settings, Normalizer2 normalizer) throws Exception {
        AnalysisService analysisService = createAnalysisService(new Index("test"), settings);
        CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
        CharFilter filtered = (CharFilter) charFilterFactory.create(new StringReader(INPUT));
        try {
            // a small buffer forces several reads, so the offset check runs at many positions
            char[] buffer = new char[10];
            StringBuilder output = new StringBuilder();
            int length;
            while ((length = filtered.read(buffer)) != -1) {
                output.append(buffer, 0, length);
                // the output so far must equal the normalization of the input prefix it maps back to
                String consumed = INPUT.substring(0, filtered.correctOffset(output.length()));
                assertEquals(normalizer.normalize(consumed), output.toString());
            }
            assertEquals(normalizer.normalize(INPUT), output.toString());
        } finally {
            filtered.close();
        }
    }
    /** Builds an {@link AnalysisService} for <tt>index</tt> with the ICU analysis processor registered. */
    private AnalysisService createAnalysisService(Index index, Settings settings) {
        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
        Injector injector = new ModulesBuilder().add(
            new IndexSettingsModule(index, settings),
            new IndexNameModule(index),
            new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);
        return injector.getInstance(AnalysisService.class);
    }
 }