add ICUNormalizer2CharFilter

Included ICUNormalizer2Charfilter in Lucene 4.8.0.
Add CharFilterFactory.

Now, char_filter name is "icu_normalizer", however token_filter name is same name.

Closes #27.

(cherry picked from commit 0cbf1b3)
This commit is contained in:
Jun Ohtani 2014-05-28 15:46:59 +02:00 committed by David Pilato
parent f068ef88a4
commit f1eae455fb
5 changed files with 205 additions and 0 deletions

View File

@ -184,6 +184,31 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://
``` ```
ICU Normalization CharFilter
-----------------
Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization).
It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`.
Allows for the mode parameter to be provided which can include the following values: `compose` and `decompose`.
Use `decompose` with `nfc` or `nfkc`, to get `nfd` or `nfkd`, respectively.
Here is a sample settings:
```js
{
"index" : {
"analysis" : {
"analyzer" : {
"collation" : {
"tokenizer" : "keyword",
"char_filter" : ["icu_normalizer"]
}
}
}
}
}
```
License License
------- -------

View File

@ -23,6 +23,11 @@ package org.elasticsearch.index.analysis;
*/ */
public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
@Override
public void processCharFilters(CharFiltersBindings charFiltersBindings) {
charFiltersBindings.processCharFilter("icu_normalizer", IcuNormalizerCharFilterFactory.class);
}
@Override @Override
public void processTokenizers(TokenizersBindings tokenizersBindings) { public void processTokenizers(TokenizersBindings tokenizersBindings) {
tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class); tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);

View File

@ -0,0 +1,63 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
/**
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize character.
* <p/>
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
*/
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {
private final String name;
private final Normalizer2 normalizer;
@Inject
public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name);
this.name = settings.get("name", "nfkc_cf");
String mode = settings.get("mode");
if (!"compose".equals(mode) && !"decompose".equals(mode)) {
mode = "compose";
}
this.normalizer = Normalizer2.getInstance(
null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
}
@Override
public Reader create(Reader reader) {
return new ICUNormalizer2CharFilter(reader, normalizer);
}
}

View File

@ -65,5 +65,8 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase {
filterFactory = analysisService.tokenFilter("icu_transform"); filterFactory = analysisService.tokenFilter("icu_transform");
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer");
assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
} }
} }

View File

@ -0,0 +1,109 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.CharFilter;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.StringReader;
/**
* Test
*/
public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase {
@Test
public void testDefaultSetting() throws Exception {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
String input = "ʰ㌰゙5℃№㈱㌘バッファーの正規化のテスト㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
String expectedOutput = normalizer.normalize(input);
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
char[] tempBuff = new char[10];
StringBuilder output = new StringBuilder();
while (true) {
int length = inputReader.read(tempBuff);
if (length == -1) break;
output.append(tempBuff, 0, length);
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
}
assertEquals(expectedOutput, output.toString());
}
@Test
public void testNameAndModeSetting() throws Exception {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
.put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
.put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
String input = "ʰ㌰゙5℃№㈱㌘バッファーの正規化のテスト㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
String expectedOutput = normalizer.normalize(input);
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
char[] tempBuff = new char[10];
StringBuilder output = new StringBuilder();
while (true) {
int length = inputReader.read(tempBuff);
if (length == -1) break;
output.append(tempBuff, 0, length);
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
}
assertEquals(expectedOutput, output.toString());
}
private AnalysisService createAnalysisService(Index index, Settings settings) {
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(index, settings),
new IndexNameModule(index),
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
.createChildInjector(parentInjector);
return injector.getInstance(AnalysisService.class);
}
}