From f1eae455fb8bc12f007245e8f70ece7c95f823a0 Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Wed, 28 May 2014 15:46:59 +0200 Subject: [PATCH] add ICUNormalizer2CharFilter Included ICUNormalizer2Charfilter in Lucene 4.8.0. Add CharFilterFactory. Now, char_filter name is "icu_normalizer", however token_filter name is same name. Closes #27. (cherry picked from commit 0cbf1b3) --- README.md | 25 ++++ .../analysis/IcuAnalysisBinderProcessor.java | 5 + .../IcuNormalizerCharFilterFactory.java | 63 ++++++++++ .../analysis/SimpleIcuAnalysisTests.java | 3 + .../SimpleIcuNormalizerCharFilterTests.java | 109 ++++++++++++++++++ 5 files changed, 205 insertions(+) create mode 100644 src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java diff --git a/README.md b/README.md index cc528c013cb..0cc22a6b7f8 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,31 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http:// ``` +ICU Normalization CharFilter +----------------- + +Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization). +It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings. +The `name` parameter selects the normalization form and accepts `nfc`, `nfkc`, or `nfkc_cf` (default: `nfkc_cf`). +The `mode` parameter accepts `compose` or `decompose` (default: `compose`). +Use `decompose` with `nfc` or `nfkc` to get `nfd` or `nfkd`, respectively.
+Here are sample settings: + +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "char_filter" : ["icu_normalizer"] + } + } + } + } +} +``` + License ------- diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java index f23d32b5c74..8db169b9318 100644 --- a/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java +++ b/src/main/java/org/elasticsearch/index/analysis/IcuAnalysisBinderProcessor.java @@ -23,6 +23,11 @@ package org.elasticsearch.index.analysis; */ public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + @Override + public void processCharFilters(CharFiltersBindings charFiltersBindings) { + charFiltersBindings.processCharFilter("icu_normalizer", IcuNormalizerCharFilterFactory.class); + } + @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class); diff --git a/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java new file mode 100644 index 00000000000..337461c5095 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java @@ -0,0 +1,63 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + + +import com.ibm.icu.text.Normalizer2; +import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.Reader; + + +/** + * Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize character. + *

+ *

The {@code name} setting selects the type of normalization to perform: {@code nfc}, {@code nfkc}, or {@code nfkc_cf} (the default).

+ *

The {@code mode} setting can be {@code compose} or {@code decompose}; a missing or unrecognized value falls back to {@code compose}.

+ */ +public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory { + + private final String name; + + private final Normalizer2 normalizer; + + + @Inject + public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name); + this.name = settings.get("name", "nfkc_cf"); + String mode = settings.get("mode"); + if (!"compose".equals(mode) && !"decompose".equals(mode)) { + mode = "compose"; + } + this.normalizer = Normalizer2.getInstance( + null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); + } + + @Override + public Reader create(Reader reader) { + return new ICUNormalizer2CharFilter(reader, normalizer); + } +} diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java index 99f46ebffbe..e12db59c6a8 100644 --- a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuAnalysisTests.java @@ -65,5 +65,8 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase { filterFactory = analysisService.tokenFilter("icu_transform"); assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); + + CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer"); + assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class)); } } diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java new file mode 100644 index 00000000000..c4cbb945e4c --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleIcuNormalizerCharFilterTests.java @@ -0,0 +1,109 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import com.ibm.icu.text.Normalizer2; +import org.apache.lucene.analysis.CharFilter; +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; + +import java.io.StringReader; + +/** + * Test + */ +public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase { + + @Test + public void testDefaultSetting() throws Exception { + + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") + .build(); + AnalysisService 
analysisService = createAnalysisService(index, settings); + CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar"); + + String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि"; + Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE); + String expectedOutput = normalizer.normalize(input); + CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input)); + char[] tempBuff = new char[10]; + StringBuilder output = new StringBuilder(); + while (true) { + int length = inputReader.read(tempBuff); + if (length == -1) break; + output.append(tempBuff, 0, length); + assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length())))); + } + assertEquals(expectedOutput, output.toString()); + } + + + @Test + public void testNameAndModeSetting() throws Exception { + + Index index = new Index("test"); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer") + .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc") + .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose") + .build(); + AnalysisService analysisService = createAnalysisService(index, settings); + CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar"); + + String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि"; + Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE); + String expectedOutput = normalizer.normalize(input); + CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input)); + char[] tempBuff = new char[10]; + StringBuilder output = new StringBuilder(); + while (true) { + int length = inputReader.read(tempBuff); + if (length == -1) break; + output.append(tempBuff, 0, length); + assertEquals(output.toString(), 
normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length())))); + } + assertEquals(expectedOutput, output.toString()); + } + + private AnalysisService createAnalysisService(Index index, Settings settings) { + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, settings), + new IndexNameModule(index), + new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + return injector.getInstance(AnalysisService.class); + } +}