add ICUNormalizer2CharFilter
ICUNormalizer2CharFilter is included in Lucene 4.8.0, so add a CharFilterFactory for it. The char_filter is registered under the name "icu_normalizer", which is the same name the token_filter uses. Closes #27. (cherry picked from commit 0cbf1b3)
This commit is contained in:
parent
f068ef88a4
commit
f1eae455fb
25
README.md
25
README.md
|
@ -184,6 +184,31 @@ Breaks text into words according to [UAX #29: Unicode Text Segmentation](http://
|
|||
```
|
||||
|
||||
|
||||
ICU Normalization CharFilter
|
||||
-----------------
|
||||
|
||||
Normalizes characters as explained [here](http://userguide.icu-project.org/transforms/normalization).
|
||||
It registers itself by default under `icu_normalizer` or `icuNormalizer` using the default settings.
|
||||
Allows for the name parameter to be provided which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`.
|
||||
Allows for the mode parameter to be provided which can include the following values: `compose` and `decompose`.
|
||||
Use `decompose` with `nfc` or `nfkc`, to get `nfd` or `nfkd`, respectively.
|
||||
Here are sample settings:
|
||||
|
||||
```js
|
||||
{
|
||||
"index" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"collation" : {
|
||||
"tokenizer" : "keyword",
|
||||
"char_filter" : ["icu_normalizer"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
|
|
|
@ -23,6 +23,11 @@ package org.elasticsearch.index.analysis;
|
|||
*/
|
||||
public class IcuAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
|
||||
|
||||
@Override
|
||||
public void processCharFilters(CharFiltersBindings charFiltersBindings) {
|
||||
charFiltersBindings.processCharFilter("icu_normalizer", IcuNormalizerCharFilterFactory.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processTokenizers(TokenizersBindings tokenizersBindings) {
|
||||
tokenizersBindings.processTokenizer("icu_tokenizer", IcuTokenizerFactory.class);
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
|
||||
/**
|
||||
* Uses the {@link org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter} to normalize character.
|
||||
* <p/>
|
||||
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
|
||||
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
|
||||
*/
|
||||
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {
|
||||
|
||||
private final String name;
|
||||
|
||||
private final Normalizer2 normalizer;
|
||||
|
||||
|
||||
@Inject
|
||||
public IcuNormalizerCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name);
|
||||
this.name = settings.get("name", "nfkc_cf");
|
||||
String mode = settings.get("mode");
|
||||
if (!"compose".equals(mode) && !"decompose".equals(mode)) {
|
||||
mode = "compose";
|
||||
}
|
||||
this.normalizer = Normalizer2.getInstance(
|
||||
null, this.name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader create(Reader reader) {
|
||||
return new ICUNormalizer2CharFilter(reader, normalizer);
|
||||
}
|
||||
}
|
|
@ -65,5 +65,8 @@ public class SimpleIcuAnalysisTests extends ElasticsearchTestCase {
|
|||
|
||||
filterFactory = analysisService.tokenFilter("icu_transform");
|
||||
assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
|
||||
|
||||
CharFilterFactory charFilterFactory = analysisService.charFilter("icu_normalizer");
|
||||
assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.elasticsearch.common.inject.Injector;
|
||||
import org.elasticsearch.common.inject.ModulesBuilder;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.settings.SettingsModule;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.EnvironmentModule;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.IndexNameModule;
|
||||
import org.elasticsearch.index.settings.IndexSettingsModule;
|
||||
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
|
||||
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
|
||||
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
/**
|
||||
* Test
|
||||
*/
|
||||
public class SimpleIcuNormalizerCharFilterTests extends ElasticsearchTestCase {
|
||||
|
||||
@Test
|
||||
public void testDefaultSetting() throws Exception {
|
||||
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(index, settings);
|
||||
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
|
||||
|
||||
String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
|
||||
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
|
||||
String expectedOutput = normalizer.normalize(input);
|
||||
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
|
||||
char[] tempBuff = new char[10];
|
||||
StringBuilder output = new StringBuilder();
|
||||
while (true) {
|
||||
int length = inputReader.read(tempBuff);
|
||||
if (length == -1) break;
|
||||
output.append(tempBuff, 0, length);
|
||||
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
|
||||
}
|
||||
assertEquals(expectedOutput, output.toString());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testNameAndModeSetting() throws Exception {
|
||||
|
||||
Index index = new Index("test");
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
|
||||
.put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
|
||||
.put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
|
||||
.build();
|
||||
AnalysisService analysisService = createAnalysisService(index, settings);
|
||||
CharFilterFactory charFilterFactory = analysisService.charFilter("myNormalizerChar");
|
||||
|
||||
String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
|
||||
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
|
||||
String expectedOutput = normalizer.normalize(input);
|
||||
CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
|
||||
char[] tempBuff = new char[10];
|
||||
StringBuilder output = new StringBuilder();
|
||||
while (true) {
|
||||
int length = inputReader.read(tempBuff);
|
||||
if (length == -1) break;
|
||||
output.append(tempBuff, 0, length);
|
||||
assertEquals(output.toString(), normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
|
||||
}
|
||||
assertEquals(expectedOutput, output.toString());
|
||||
}
|
||||
|
||||
private AnalysisService createAnalysisService(Index index, Settings settings) {
|
||||
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
|
||||
Injector injector = new ModulesBuilder().add(
|
||||
new IndexSettingsModule(index, settings),
|
||||
new IndexNameModule(index),
|
||||
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
|
||||
.createChildInjector(parentInjector);
|
||||
|
||||
return injector.getInstance(AnalysisService.class);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue