Add JapaneseIterationMarkCharFilter support

Lucene's Kuromoji module already provides JapaneseIterationMarkCharFilter.
Add IterationMarkCharFilter to analysis-kuromoji.

Closes #7.
This commit is contained in:
Jun Ohtani 2013-11-07 11:22:50 +09:00 committed by David Pilato
parent 4c95a3ef70
commit 98b2544f2a
6 changed files with 133 additions and 8 deletions

View File

@ -25,6 +25,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
| name | type |
|-------------------------|-------------|
| kuromoji_iteration_mark | charfilter |
| kuromoji | analyzer |
| kuromoji_tokenizer | tokenizer |
| kuromoji_baseform | tokenfilter |
@ -49,6 +50,18 @@ This analyzer is the following tokenizer and tokenfilter combination.
* `kuromoji_stemmer` : Kuromoji Katakana Stemmer Filter(TokenFilter)
* `lowercase` : LowerCase Filter (TokenFilter)
## CharFilter : kuromoji_iteration_mark
A charfilter of type `kuromoji_iteration_mark`.
This charfilter normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
The following are the settings that can be set for a `kuromoji_iteration_mark` charfilter type:
| **Setting** | **Description** | **Default value** |
|:----------------|:-------------------------------------------------------------|:------------------|
| normalize_kanji | indicates whether kanji iteration marks should be normalized | `true` |
| normalize_kana | indicates whether kana iteration marks should be normalized | `true` |
## Tokenizer : kuromoji_tokenizer
A tokenizer of type `kuromoji_tokenizer`.

View File

@ -0,0 +1,29 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
/**
 * Char filter factory registered for the {@code kuromoji_iteration_mark} type.
 * <p>
 * It wraps incoming readers in a Lucene {@link JapaneseIterationMarkCharFilter}.
 * The two flags handed to that filter are read once, at construction time, from
 * the {@code normalize_kanji} and {@code normalize_kana} settings; the matching
 * {@code NORMALIZE_*_DEFAULT} constants of the Lucene filter are supplied as the
 * fallback values for those lookups.
 */
public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory {

    /** Value of the {@code normalize_kanji} setting (or its fallback). */
    private final boolean normalizeKanji;

    /** Value of the {@code normalize_kana} setting (or its fallback). */
    private final boolean normalizeKana;

    /**
     * @param index         index this factory belongs to
     * @param indexSettings index-level settings, forwarded to the super constructor
     * @param name          name under which this char filter is configured
     * @param settings      settings of this char filter instance
     */
    @Inject
    public KuromojiIterationMarkCharFilterFactory(Index index, @IndexSettings Settings indexSettings,
                                                  @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        this.normalizeKanji = settings.getAsBoolean(
                "normalize_kanji", JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
        this.normalizeKana = settings.getAsBoolean(
                "normalize_kana", JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
    }

    /**
     * Wraps {@code reader} in a {@link JapaneseIterationMarkCharFilter} configured
     * with the flags captured by the constructor.
     *
     * @param reader the character stream to filter
     * @return the filtering reader
     */
    @Override
    public Reader create(Reader reader) {
        final Reader iterationMarkFilter =
                new JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana);
        return iterationMarkFilter;
    }
}

View File

@ -26,10 +26,7 @@ import org.apache.lucene.util.Version;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.analysis.*;
import java.io.Reader;
@ -41,9 +38,24 @@ public class KuromojiIndicesAnalysis extends AbstractComponent {
@Inject
public KuromojiIndicesAnalysis(Settings settings,
IndicesAnalysisService indicesAnalysisService) {
IndicesAnalysisService indicesAnalysisService) {
super(settings);
// Pre-built (node-level) variant of the iteration mark char filter: it always
// uses the Lucene default flags for kanji and kana normalization.
final CharFilterFactory iterationMarkCharFilter = new CharFilterFactory() {
    @Override
    public String name() {
        return "kuromoji_iteration_mark";
    }

    @Override
    public Reader create(Reader reader) {
        final boolean kanji = JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT;
        final boolean kana = JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT;
        return new JapaneseIterationMarkCharFilter(reader, kanji, kana);
    }
};
indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
        new PreBuiltCharFilterFactoryFactory(iterationMarkCharFilter));
indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
@Override
@ -83,7 +95,7 @@ public class KuromojiIndicesAnalysis extends AbstractComponent {
public TokenStream create(TokenStream tokenStream) {
return new JapanesePartOfSpeechStopFilter(Version.LUCENE_44,
tokenStream, JapaneseAnalyzer
.getDefaultStopTags());
.getDefaultStopTags());
}
}));

View File

@ -48,6 +48,7 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {
}
public void onModule(AnalysisModule module) {
module.addCharFilter("kuromoji_iteration_mark", KuromojiIterationMarkCharFilterFactory.class);
module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class);
module.addTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class);
module.addTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class);

View File

@ -41,9 +41,9 @@ import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;
/**
@ -75,6 +75,9 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
analyzer = analysisService.analyzer("my_analyzer");
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
@Test
@ -130,6 +133,41 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
expected_tokens_katakana = new String[]{"明後日", "パーティー", "", "行く", "予定", "", "ある", "図書館", "", "資料", "", "コピー", "", "まし", ""};
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}
/**
 * Runs the same input through the three {@code kuromoji_iteration_mark} char
 * filters declared in the test settings (kanji only, kana only, defaults) and
 * checks the filtered text of each.
 */
@Test
public void testIterationMarkCharFilter() throws IOException {
    final AnalysisService service = createAnalysisService();
    // One shared input containing both kana and kanji iteration marks.
    final String input = "ところゞゝゝ、ジヾが、時々、馬鹿々々しい";

    // test only kanji
    final CharFilterFactory kanjiOnly = service.charFilter("kuromoji_im_only_kanji");
    assertNotNull(kanjiOnly);
    assertThat(kanjiOnly, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
    assertCharFilterEquals(kanjiOnly.create(new StringReader(input)),
            "ところゞゝゝ、ジヾが、時時、馬鹿馬鹿しい");

    // test only kana
    final CharFilterFactory kanaOnly = service.charFilter("kuromoji_im_only_kana");
    assertNotNull(kanaOnly);
    assertThat(kanaOnly, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
    assertCharFilterEquals(kanaOnly.create(new StringReader(input)),
            "ところどころ、ジジが、時々、馬鹿々々しい");

    // test default
    final CharFilterFactory defaults = service.charFilter("kuromoji_im_default");
    assertNotNull(defaults);
    assertThat(defaults, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
    assertCharFilterEquals(defaults.create(new StringReader(input)),
            "ところどころ、ジジが、時時、馬鹿馬鹿しい");
}
public AnalysisService createAnalysisService() {
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json").build();
@ -165,4 +203,20 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
}
assertThat("not all tokens produced", i, equalTo(expected.length));
}
/**
 * Drains {@code filtered} and asserts that the text it produced equals
 * {@code expected}.
 *
 * @param filtered reader whose complete output is compared
 * @param expected the text the reader is expected to yield
 * @throws IOException if reading from {@code filtered} fails
 */
private void assertCharFilterEquals(Reader filtered,
                                    String expected) throws IOException {
    assertThat(readFully(filtered), equalTo(expected));
}
/**
 * Reads {@code reader} one character at a time until it reports end of stream
 * and returns everything that was read. The reader is not closed here.
 *
 * @param reader the source to drain
 * @return the concatenation of all characters read
 * @throws IOException if the underlying read fails
 */
private String readFully(Reader reader) throws IOException {
    final StringBuilder collected = new StringBuilder();
    // read() returns -1 at end of stream; any other value is a single char.
    for (int next = reader.read(); next != -1; next = reader.read()) {
        collected.append((char) next);
    }
    return collected.toString();
}
}

View File

@ -17,7 +17,23 @@
},
"char_filter":{
"kuromoji_im_only_kanji":{
"type":"kuromoji_iteration_mark",
"normalize_kanji":true,
"normalize_kana":false
},
"kuromoji_im_only_kana":{
"type":"kuromoji_iteration_mark",
"normalize_kanji":false,
"normalize_kana":true
},
"kuromoji_im_default":{
"type":"kuromoji_iteration_mark"
}
},
"tokenizer" : {
"kuromoji" : {
"type":"kuromoji_tokenizer"