add ICUNormalizer2CharFilter

ICUNormalizer2CharFilter is included in Lucene 4.8.0, so add a CharFilterFactory for it. The char_filter is registered under the name "icu_normalizer", which is the same name the existing token_filter uses. Closes #27. (cherry picked from commit 0cbf1b3)
2025-03-09 14:34:43 +00:00 · 2014-05-28 15:46:59 +02:00 · 2014-05-28 15:46:59 +02:00 · f1eae455fb
commit f1eae455fb
parent f068ef88a4
5 changed files with 205 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -184,6 +184,31 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://
 ```


+ICU Normalization CharFilter
+-----------------
+
+Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization).
+It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
+The `name` parameter selects the normalization to apply and accepts the following values: `nfc`, `nfkc`, and `nfkc_cf` (the default).
+The `mode` parameter accepts the following values: `compose` (the default) and `decompose`.
+Use `decompose` with `nfc` or `nfkc`, to get `nfd` or `nfkd`, respectively.
+Here are some sample settings:
+
+```js
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "collation" : {
+                    "tokenizer" : "keyword",
+                    "char_filter" : ["icu_normalizer"]
+                }
+            }
+        }
+    }
+}
+```
+
 License
 -------

--- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java
@ -23,6 +23,11 @@ package org.elasticsearch.index.analysis;
 */
 public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {

+    /**
+     * Registers the ICU normalizer char filter factory under the name "icu_normalizer".
+     */
+    @Override
+    public void processCharFilters(CharFiltersBindings charFiltersBindings) {
+        final Class<IcuNormalizerCharFilterFactory> factoryType = IcuNormalizerCharFilterFactory.class;
+        charFiltersBindings.processCharFilter("icu_normalizer", factoryType);
+    }
+
    @Override
    public void processTokenizers(TokenizersBindings tokenizersBindings) {
        tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);
--- a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java
@ -0,0 +1,63 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+
+import com.ibm.icu.text.Normalizer2;
+import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.io.Reader;
+
+
+/**
+ * Factory for {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter}, which normalizes
+ * characters before they reach the tokenizer.
+ * <p/>
+ * <p>The <tt>name</tt> setting selects the type of normalization to perform. Default is nfkc_cf.</p>
+ * <p>The <tt>mode</tt> setting can be 'compose' or 'decompose'. Any other value, or no value at all,
+ * is treated as compose.</p>
+ */
+public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {
+
+    /** Value of the <tt>name</tt> setting, i.e. the normalization name handed to ICU. */
+    private final String name;
+
+    /** Normalizer built once from the settings and handed to every char filter created here. */
+    private final Normalizer2 normalizer;
+
+
+    /**
+     * Reads the <tt>name</tt> and <tt>mode</tt> settings and builds the normalizer from them.
+     *
+     * @param index         the index this factory belongs to
+     * @param indexSettings the index level settings
+     * @param name          the name the char filter is configured under
+     * @param settings      the settings of this char filter (<tt>name</tt>, <tt>mode</tt>)
+     */
+    @Inject
+    public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name);
+        this.name = settings.get("name", "nfkc_cf");
+
+        // Only an explicit "decompose" selects decomposition; a missing or unknown mode means compose.
+        final Normalizer2.Mode normalizerMode;
+        if ("decompose".equals(settings.get("mode"))) {
+            normalizerMode = Normalizer2.Mode.DECOMPOSE;
+        } else {
+            normalizerMode = Normalizer2.Mode.COMPOSE;
+        }
+
+        this.normalizer = Normalizer2.getInstance(null, this.name, normalizerMode);
+    }
+
+    /**
+     * Wraps the given reader in a char filter that applies the configured normalizer.
+     *
+     * @param reader the character stream to normalize
+     * @return the normalizing reader
+     */
+    @Override
+    public Reader create(Reader reader) {
+        return new ICUNormalizer2CharFilter(reader, this.normalizer);
+    }
+}
--- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java
@ -65,5 +65,8 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase {

        filterFactory = analysisService.tokenFilter("icu_transform");
        assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
+
+        CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer");
+        assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java
@ -0,0 +1,109 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import com.ibm.icu.text.Normalizer2;
+import org.apache.lucene.analysis.CharFilter;
+import org.elasticsearch.common.inject.Injector;
+import org.elasticsearch.common.inject.ModulesBuilder;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.SettingsModule;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.EnvironmentModule;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNameModule;
+import org.elasticsearch.index.settings.IndexSettingsModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisService;
+import org.elasticsearch.test.ElasticsearchTestCase;
+import org.junit.Test;
+
+import java.io.StringReader;
+
+/**
+ * Tests that the char filter created through the "icu_normalizer" type produces the same text as a
+ * {@link Normalizer2} built directly with the corresponding name and mode, and that its corrected
+ * offsets map back onto the original input.
+ */
+public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase {
+
+    /** Sample text shared by all tests. */
+    private static final String INPUT = "ʰ㌰゙5℃№㈱㌘，バッファーの正規化のテスト．㋐㋑㋒㋓㋔ｶｷｸｹｺｻﾞｼﾞｽﾞｾﾞｿﾞg̈각/각நிเกषिchkʷक्षि";
+
+    /** Without explicit settings the filter must behave like nfkc_cf in compose mode. */
+    @Test
+    public void testDefaultSetting() throws Exception {
+        Settings settings = ImmutableSettings.settingsBuilder()
+            .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
+            .build();
+
+        assertNormalizesLike(settings, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+    }
+
+
+    /** The <tt>name</tt> and <tt>mode</tt> settings must be passed on to the normalizer. */
+    @Test
+    public void testNameAndModeSetting() throws Exception {
+        Settings settings = ImmutableSettings.settingsBuilder()
+            .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
+            .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
+            .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
+            .build();
+
+        assertNormalizesLike(settings, Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE));
+    }
+
+    /**
+     * Builds the "myNormalizerChar" char filter from the given settings, reads {@link #INPUT} through
+     * it in small chunks and checks the result against the reference normalizer.
+     *
+     * @param settings   index settings defining the "myNormalizerChar" char filter
+     * @param normalizer reference normalizer the char filter output is compared with
+     */
+    private void assertNormalizesLike(Settings settings, Normalizer2 normalizer) throws Exception {
+        AnalysisService analysisService = createAnalysisService(new Index("test"), settings);
+        CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
+
+        CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(INPUT));
+        // The buffer is shorter than the input, so the text is consumed over several reads.
+        char[] tempBuff = new char[10];
+        StringBuilder output = new StringBuilder();
+        int length;
+        while ((length = inputReader.read(tempBuff)) != -1) {
+            output.append(tempBuff, 0, length);
+            // After each read, the output so far must equal the normalization of the input prefix
+            // that the corrected offset points at.
+            String consumedInput = INPUT.substring(0, inputReader.correctOffset(output.length()));
+            assertEquals(normalizer.normalize(consumedInput), output.toString());
+        }
+        assertEquals(normalizer.normalize(INPUT), output.toString());
+    }
+
+    /**
+     * Creates an {@link AnalysisService} for the given index and settings with the ICU analysis
+     * components registered through {@link IcuAnalysisBinderProcessor}.
+     */
+    private AnalysisService createAnalysisService(Index index, Settings settings) {
+        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
+        Injector injector = new ModulesBuilder().add(
+            new IndexSettingsModule(index, settings),
+            new IndexNameModule(index),
+            new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
+            .createChildInjector(parentInjector);
+
+        return injector.getInstance(AnalysisService.class);
+    }
+}