add ICUNormalizer2CharFilter
ICUNormalizer2CharFilter is included in Lucene 4.8.0, so add a CharFilterFactory for it. The char_filter is registered as "icu_normalizer", which is the same name the existing token_filter uses. Closes #27. (cherry picked from commit 0cbf1b3)
This commit is contained in:
parent
f068ef88a4
commit
f1eae455fb
25
README.md
25
README.md
|
@ -184,6 +184,31 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
ICU Normalization CharFilter
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization).
|
||||||
|
It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
|
||||||
|
The `name` parameter selects the normalization form and accepts the following values: `nfc`, `nfkc`, and `nfkc_cf` (the default).
|
||||||
|
The `mode` parameter selects the normalization direction and accepts the following values: `compose` (the default) and `decompose`.
|
||||||
|
Use `decompose` with `nfc` or `nfkc`, to get `nfd` or `nfkd`, respectively.
|
||||||
|
Here is a sample configuration:
|
||||||
|
|
||||||
|
```js
|
||||||
|
{
|
||||||
|
"index" : {
|
||||||
|
"analysis" : {
|
||||||
|
"analyzer" : {
|
||||||
|
"collation" : {
|
||||||
|
"tokenizer" : "keyword",
|
||||||
|
"char_filter" : ["icu_normalizer"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
License
|
License
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,11 @@ package org.elasticsearch.index.analysis;
|
||||||
*/
|
*/
|
||||||
public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
|
public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void processCharFilters(CharFiltersBindings charFiltersBindings) {
|
||||||
|
charFiltersBindings.processCharFilter("icu_normalizer", IcuNormalizerCharFilterFactory.class);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void processTokenizers(TokenizersBindings tokenizersBindings) {
|
public void processTokenizers(TokenizersBindings tokenizersBindings) {
|
||||||
tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);
|
tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
|
||||||
|
import com.ibm.icu.text.Normalizer2;
|
||||||
|
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
|
||||||
|
import org.elasticsearch.common.inject.Inject;
|
||||||
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize character.
|
||||||
|
* <p/>
|
||||||
|
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
|
||||||
|
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
|
||||||
|
*/
|
||||||
|
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {
|
||||||
|
|
||||||
|
private final String name;
|
||||||
|
|
||||||
|
private final Normalizer2 normalizer;
|
||||||
|
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||||
|
super(index, indexSettings, name);
|
||||||
|
this.name = settings.get("name", "nfkc_cf");
|
||||||
|
String mode = settings.get("mode");
|
||||||
|
if (!"compose".equals(mode) && !"decompose".equals(mode)) {
|
||||||
|
mode = "compose";
|
||||||
|
}
|
||||||
|
this.normalizer = Normalizer2.getInstance(
|
||||||
|
null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Reader create(Reader reader) {
|
||||||
|
return new ICUNormalizer2CharFilter(reader, normalizer);
|
||||||
|
}
|
||||||
|
}
|
|
@ -65,5 +65,8 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase {
|
||||||
|
|
||||||
filterFactory = analysisService.tokenFilter("icu_transform");
|
filterFactory = analysisService.tokenFilter("icu_transform");
|
||||||
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
|
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
|
||||||
|
|
||||||
|
CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer");
|
||||||
|
assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,109 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.Normalizer2;
|
||||||
|
import org.apache.lucene.analysis.CharFilter;
|
||||||
|
import org.elasticsearch.common.inject.Injector;
|
||||||
|
import org.elasticsearch.common.inject.ModulesBuilder;
|
||||||
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.common.settings.SettingsModule;
|
||||||
|
import org.elasticsearch.env.Environment;
|
||||||
|
import org.elasticsearch.env.EnvironmentModule;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.IndexNameModule;
|
||||||
|
import org.elasticsearch.index.settings.IndexSettingsModule;
|
||||||
|
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
|
||||||
|
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
|
||||||
|
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test
|
||||||
|
*/
|
||||||
|
public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDefaultSetting() throws Exception {
|
||||||
|
|
||||||
|
Index index = new Index("test");
|
||||||
|
Settings settings = ImmutableSettings.settingsBuilder()
|
||||||
|
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
|
||||||
|
.build();
|
||||||
|
AnalysisService analysisService = createAnalysisService(index, settings);
|
||||||
|
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
|
||||||
|
|
||||||
|
String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
|
||||||
|
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
|
||||||
|
String expectedOutput = normalizer.normalize(input);
|
||||||
|
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
|
||||||
|
char[] tempBuff = new char[10];
|
||||||
|
StringBuilder output = new StringBuilder();
|
||||||
|
while (true) {
|
||||||
|
int length = inputReader.read(tempBuff);
|
||||||
|
if (length == -1) break;
|
||||||
|
output.append(tempBuff, 0, length);
|
||||||
|
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
|
||||||
|
}
|
||||||
|
assertEquals(expectedOutput, output.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNameAndModeSetting() throws Exception {
|
||||||
|
|
||||||
|
Index index = new Index("test");
|
||||||
|
Settings settings = ImmutableSettings.settingsBuilder()
|
||||||
|
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
|
||||||
|
.put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
|
||||||
|
.put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
|
||||||
|
.build();
|
||||||
|
AnalysisService analysisService = createAnalysisService(index, settings);
|
||||||
|
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
|
||||||
|
|
||||||
|
String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
|
||||||
|
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
|
||||||
|
String expectedOutput = normalizer.normalize(input);
|
||||||
|
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
|
||||||
|
char[] tempBuff = new char[10];
|
||||||
|
StringBuilder output = new StringBuilder();
|
||||||
|
while (true) {
|
||||||
|
int length = inputReader.read(tempBuff);
|
||||||
|
if (length == -1) break;
|
||||||
|
output.append(tempBuff, 0, length);
|
||||||
|
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
|
||||||
|
}
|
||||||
|
assertEquals(expectedOutput, output.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
private AnalysisService createAnalysisService(Index index, Settings settings) {
|
||||||
|
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
|
||||||
|
Injector injector = new ModulesBuilder().add(
|
||||||
|
new IndexSettingsModule(index, settings),
|
||||||
|
new IndexNameModule(index),
|
||||||
|
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
|
||||||
|
.createChildInjector(parentInjector);
|
||||||
|
|
||||||
|
return injector.getInstance(AnalysisService.class);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue