From 98b2544f2a6e0ed356f329e446ea1da4689329db Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Thu, 7 Nov 2013 11:22:50 +0900 Subject: [PATCH] Add JapaneseIterationMarkCharFilter support Currently, Kuromoji has JapaneseIterationMarkCharFilter. Add IterationMarkCharFilter to analysis-kuromoji. Closes #7. --- README.md | 13 +++++ ...uromojiIterationMarkCharFilterFactory.java | 29 ++++++++++ .../analysis/KuromojiIndicesAnalysis.java | 24 ++++++-- .../kuromoji/AnalysisKuromojiPlugin.java | 1 + .../index/analysis/KuromojiAnalysisTests.java | 56 ++++++++++++++++++- .../index/analysis/kuromoji_analysis.json | 18 +++++- 6 files changed, 133 insertions(+), 8 deletions(-) create mode 100644 src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java diff --git a/README.md b/README.md index d45c9c8208f..7df37ec4675 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter. | name | type | |-------------------------|-------------| +| kuromoji_iteration_mark | charfilter | | kuromoji | analyzer | | kuromoji_tokenizer | tokenizer | | kuromoji_baseform | tokenfilter | @@ -49,6 +50,18 @@ This analyzer is the following tokenizer and tokenfilter combination. * `kuromoji_stemmer` : Kuromiji Katakana Stemmer Filter(TokenFilter) * `lowercase` : LowerCase Filter (TokenFilter) +## CharFilter : kuromoji_iteration_mark + +A charfilter of type `kuromoji_iteration_mark`. +This charfilter normalizes Japanese horizontal iteration marks (odoriji) to their expanded form. 
+ +The following are settings that can be set for a `kuromoji_iteration_mark` charfilter type: + +| **Setting** | **Description** | **Default value** | +|:----------------|:-------------------------------------------------------------|:------------------| +| normalize_kanji | indicates whether kanji iteration marks should be normalized | `true` | +| normalize_kana | indicates whether kana iteration marks should be normalized | `true` | + ## Tokenizer : kuromoji_tokenizer A tokenizer of type `kuromoji_tokenizer`. diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java new file mode 100644 index 00000000000..8b693793801 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java @@ -0,0 +1,29 @@ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.Reader; + +public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory { + + private final boolean normalizeKanji; + private final boolean normalizeKana; + + @Inject + public KuromojiIterationMarkCharFilterFactory(Index index, @IndexSettings Settings indexSettings, + @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name); + normalizeKanji = settings.getAsBoolean("normalize_kanji", JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT); + normalizeKana = settings.getAsBoolean("normalize_kana", JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT); + } + + @Override + public Reader create(Reader reader) { + return new 
JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana); + } +} diff --git a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java index 613c8b3b232..ea6d2a67077 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java +++ b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java @@ -26,10 +26,7 @@ import org.apache.lucene.util.Version; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory; -import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; -import org.elasticsearch.index.analysis.TokenFilterFactory; -import org.elasticsearch.index.analysis.TokenizerFactory; +import org.elasticsearch.index.analysis.*; import java.io.Reader; @@ -41,9 +38,24 @@ public class KuromojiIndicesAnalysis extends AbstractComponent { @Inject public KuromojiIndicesAnalysis(Settings settings, - IndicesAnalysisService indicesAnalysisService) { + IndicesAnalysisService indicesAnalysisService) { super(settings); + indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark", + new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() { + @Override + public String name() { + return "kuromoji_iteration_mark"; + } + + @Override + public Reader create(Reader reader) { + return new JapaneseIterationMarkCharFilter(reader, + JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT, + JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT); + } + })); + indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { @Override @@ -83,7 +95,7 @@ public class KuromojiIndicesAnalysis extends AbstractComponent { public TokenStream create(TokenStream 
tokenStream) { return new JapanesePartOfSpeechStopFilter(Version.LUCENE_44, tokenStream, JapaneseAnalyzer - .getDefaultStopTags()); + .getDefaultStopTags()); } })); diff --git a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java index 039a2716e57..f3bceaeb350 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java @@ -48,6 +48,7 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin { } public void onModule(AnalysisModule module) { + module.addCharFilter("kuromoji_iteration_mark", KuromojiIterationMarkCharFilterFactory.class); module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class); module.addTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class); module.addTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class); diff --git a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java index 0861911cd50..e49d8ce9301 100644 --- a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java @@ -41,9 +41,9 @@ import org.elasticsearch.test.ElasticsearchTestCase; import org.junit.Test; import java.io.IOException; +import java.io.Reader; import java.io.StringReader; -import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.*; /** @@ -75,6 +75,9 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase { analyzer = analysisService.analyzer("my_analyzer"); assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class)); assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class)); + + CharFilterFactory 
charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark"); + assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class)); } @Test @@ -130,6 +133,41 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase { expected_tokens_katakana = new String[]{"明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"}; assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana); } + @Test + public void testIterationMarkCharFilter() throws IOException { + AnalysisService analysisService = createAnalysisService(); + // test only kanji + CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_im_only_kanji"); + assertNotNull(charFilterFactory); + assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class)); + + String source = "ところゞゝゝ、ジヾが、時々、馬鹿々々しい"; + String expected = "ところゞゝゝ、ジヾが、時時、馬鹿馬鹿しい"; + + assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected); + + // test only kana + + charFilterFactory = analysisService.charFilter("kuromoji_im_only_kana"); + assertNotNull(charFilterFactory); + assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class)); + + expected = "ところどころ、ジジが、時々、馬鹿々々しい"; + + assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected); + + // test default + + charFilterFactory = analysisService.charFilter("kuromoji_im_default"); + assertNotNull(charFilterFactory); + assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class)); + + expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい"; + + assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected); + + + } public AnalysisService createAnalysisService() { Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json").build(); @@ -165,4 +203,20 @@ public class KuromojiAnalysisTests extends 
ElasticsearchTestCase { } assertThat("not all tokens produced", i, equalTo(expected.length)); } + + private void assertCharFilterEquals(Reader filtered, + String expected) throws IOException { + String actual = readFully(filtered); + assertThat(actual, equalTo(expected)); + } + + private String readFully(Reader reader) throws IOException { + StringBuilder buffer = new StringBuilder(); + int ch; + while((ch = reader.read()) != -1){ + buffer.append((char)ch); + } + return buffer.toString(); + } + } diff --git a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json index b0960850e85..478a6c1d293 100644 --- a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json +++ b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json @@ -17,7 +17,23 @@ }, - + + "char_filter":{ + "kuromoji_im_only_kanji":{ + "type":"kuromoji_iteration_mark", + "normalize_kanji":true, + "normalize_kana":false + }, + "kuromoji_im_only_kana":{ + "type":"kuromoji_iteration_mark", + "normalize_kanji":false, + "normalize_kana":true + }, + "kuromoji_im_default":{ + "type":"kuromoji_iteration_mark" + } + }, + "tokenizer" : { "kuromoji" : { "type":"kuromoji_tokenizer"