Add JapaneseIterationMarkCharFilter support
Kuromoji now provides JapaneseIterationMarkCharFilter. Add a corresponding iteration mark char filter to analysis-kuromoji. Closes #7.
This commit is contained in:
parent
4c95a3ef70
commit
98b2544f2a
13
README.md
13
README.md
|
@ -25,6 +25,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
|
|||
|
||||
| name | type |
|
||||
|-------------------------|-------------|
|
||||
| kuromoji_iteration_mark | charfilter |
|
||||
| kuromoji | analyzer |
|
||||
| kuromoji_tokenizer | tokenizer |
|
||||
| kuromoji_baseform | tokenfilter |
|
||||
|
@ -49,6 +50,18 @@ This analyzer is the following tokenizer and tokenfilter combination.
|
|||
* `kuromoji_stemmer` : Kuromoji Katakana Stemmer Filter (TokenFilter)
|
||||
* `lowercase` : LowerCase Filter (TokenFilter)
|
||||
|
||||
## CharFilter : kuromoji_iteration_mark
|
||||
|
||||
A charfilter of type `kuromoji_iteration_mark`.
|
||||
This charfilter normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
|
||||
|
||||
The following are settings that can be set for a `kuromoji_iteration_mark` charfilter type:
|
||||
|
||||
| **Setting** | **Description** | **Default value** |
|
||||
|:----------------|:-------------------------------------------------------------|:------------------|
|
||||
| normalize_kanji | indicates whether kanji iteration marks should be normalized | `true` |
|
||||
| normalize_kana | indicates whether kana iteration marks should be normalized | `true` |
|
||||
|
||||
## Tokenizer : kuromoji_tokenizer
|
||||
|
||||
A tokenizer of type `kuromoji_tokenizer`.
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory {
|
||||
|
||||
private final boolean normalizeKanji;
|
||||
private final boolean normalizeKana;
|
||||
|
||||
@Inject
|
||||
public KuromojiIterationMarkCharFilterFactory(Index index, @IndexSettings Settings indexSettings,
|
||||
@Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name);
|
||||
normalizeKanji = settings.getAsBoolean("normalize_kanji", JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
|
||||
normalizeKana = settings.getAsBoolean("normalize_kana", JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader create(Reader reader) {
|
||||
return new JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana);
|
||||
}
|
||||
}
|
|
@ -26,10 +26,7 @@ import org.apache.lucene.util.Version;
|
|||
import org.elasticsearch.common.component.AbstractComponent;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
|
||||
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.*;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
|
@ -41,9 +38,24 @@ public class KuromojiIndicesAnalysis extends AbstractComponent {
|
|||
|
||||
@Inject
|
||||
public KuromojiIndicesAnalysis(Settings settings,
|
||||
IndicesAnalysisService indicesAnalysisService) {
|
||||
IndicesAnalysisService indicesAnalysisService) {
|
||||
super(settings);
|
||||
|
||||
indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
|
||||
new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return "kuromoji_iteration_mark";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader create(Reader reader) {
|
||||
return new JapaneseIterationMarkCharFilter(reader,
|
||||
JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT,
|
||||
JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
|
||||
}
|
||||
}));
|
||||
|
||||
indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
|
||||
new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
|
||||
@Override
|
||||
|
@ -83,7 +95,7 @@ public class KuromojiIndicesAnalysis extends AbstractComponent {
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new JapanesePartOfSpeechStopFilter(Version.LUCENE_44,
|
||||
tokenStream, JapaneseAnalyzer
|
||||
.getDefaultStopTags());
|
||||
.getDefaultStopTags());
|
||||
}
|
||||
}));
|
||||
|
||||
|
|
|
@ -48,6 +48,7 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {
|
|||
}
|
||||
|
||||
public void onModule(AnalysisModule module) {
|
||||
module.addCharFilter("kuromoji_iteration_mark", KuromojiIterationMarkCharFilterFactory.class);
|
||||
module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class);
|
||||
module.addTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class);
|
||||
module.addTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class);
|
||||
|
|
|
@ -41,9 +41,9 @@ import org.elasticsearch.test.ElasticsearchTestCase;
|
|||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.hamcrest.Matchers.*;
|
||||
|
||||
/**
|
||||
|
@ -75,6 +75,9 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
|||
analyzer = analysisService.analyzer("my_analyzer");
|
||||
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
|
||||
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
|
||||
|
||||
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
|
||||
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -130,6 +133,41 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
|||
expected_tokens_katakana = new String[]{"明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"};
|
||||
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
|
||||
}
|
||||
@Test
|
||||
public void testIterationMarkCharFilter() throws IOException {
|
||||
AnalysisService analysisService = createAnalysisService();
|
||||
// test only kanji
|
||||
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_im_only_kanji");
|
||||
assertNotNull(charFilterFactory);
|
||||
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||
|
||||
String source = "ところゞゝゝ、ジヾが、時々、馬鹿々々しい";
|
||||
String expected = "ところゞゝゝ、ジヾが、時時、馬鹿馬鹿しい";
|
||||
|
||||
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
|
||||
|
||||
// test only kana
|
||||
|
||||
charFilterFactory = analysisService.charFilter("kuromoji_im_only_kana");
|
||||
assertNotNull(charFilterFactory);
|
||||
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||
|
||||
expected = "ところどころ、ジジが、時々、馬鹿々々しい";
|
||||
|
||||
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
|
||||
|
||||
// test default
|
||||
|
||||
charFilterFactory = analysisService.charFilter("kuromoji_im_default");
|
||||
assertNotNull(charFilterFactory);
|
||||
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||
|
||||
expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";
|
||||
|
||||
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
|
||||
|
||||
|
||||
}
|
||||
|
||||
public AnalysisService createAnalysisService() {
|
||||
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json").build();
|
||||
|
@ -165,4 +203,20 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
|||
}
|
||||
assertThat("not all tokens produced", i, equalTo(expected.length));
|
||||
}
|
||||
|
||||
private void assertCharFilterEquals(Reader filtered,
|
||||
String expected) throws IOException {
|
||||
String actual = readFully(filtered);
|
||||
assertThat(actual, equalTo(expected));
|
||||
}
|
||||
|
||||
private String readFully(Reader reader) throws IOException {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
int ch;
|
||||
while((ch = reader.read()) != -1){
|
||||
buffer.append((char)ch);
|
||||
}
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,7 +17,23 @@
|
|||
|
||||
|
||||
},
|
||||
|
||||
|
||||
"char_filter":{
|
||||
"kuromoji_im_only_kanji":{
|
||||
"type":"kuromoji_iteration_mark",
|
||||
"normalize_kanji":true,
|
||||
"normalize_kana":false
|
||||
},
|
||||
"kuromoji_im_only_kana":{
|
||||
"type":"kuromoji_iteration_mark",
|
||||
"normalize_kanji":false,
|
||||
"normalize_kana":true
|
||||
},
|
||||
"kuromoji_im_default":{
|
||||
"type":"kuromoji_iteration_mark"
|
||||
}
|
||||
},
|
||||
|
||||
"tokenizer" : {
|
||||
"kuromoji" : {
|
||||
"type":"kuromoji_tokenizer"
|
||||
|
|
Loading…
Reference in New Issue