Add JapaneseIterationMarkCharFilter support
Currently, Kuromoji has JapaneseIterationMarkCharFilter. Add IterationMarkCharFilter to analysis-kuromoji. Closes #7.
This commit is contained in:
parent
4c95a3ef70
commit
98b2544f2a
13
README.md
13
README.md
|
@ -25,6 +25,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
|
||||||
|
|
||||||
| name | type |
|
| name | type |
|
||||||
|-------------------------|-------------|
|
|-------------------------|-------------|
|
||||||
|
| kuromoji_iteration_mark | charfilter |
|
||||||
| kuromoji | analyzer |
|
| kuromoji | analyzer |
|
||||||
| kuromoji_tokenizer | tokenizer |
|
| kuromoji_tokenizer | tokenizer |
|
||||||
| kuromoji_baseform | tokenfilter |
|
| kuromoji_baseform | tokenfilter |
|
||||||
|
@ -49,6 +50,18 @@ This analyzer is the following tokenizer and tokenfilter combination.
|
||||||
* `kuromoji_stemmer` : Kuromoji Katakana Stemmer Filter(TokenFilter)
|
* `kuromoji_stemmer` : Kuromoji Katakana Stemmer Filter(TokenFilter)
|
||||||
* `lowercase` : LowerCase Filter (TokenFilter)
|
* `lowercase` : LowerCase Filter (TokenFilter)
|
||||||
|
|
||||||
|
## CharFilter : kuromoji_iteration_mark
|
||||||
|
|
||||||
|
A charfilter of type `kuromoji_iteration_mark`.
|
||||||
|
This charfilter normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
|
||||||
|
|
||||||
|
The following are settings that can be set for a `kuromoji_iteration_mark` charfilter type:
|
||||||
|
|
||||||
|
| **Setting** | **Description** | **Default value** |
|
||||||
|
|:----------------|:-------------------------------------------------------------|:------------------|
|
||||||
|
| normalize_kanji | indicates whether kanji iteration marks should be normalized | `true` |
|
||||||
|
| normalize_kana | indicates whether kana iteration marks should be normalized | `true` |
|
||||||
|
|
||||||
## Tokenizer : kuromoji_tokenizer
|
## Tokenizer : kuromoji_tokenizer
|
||||||
|
|
||||||
A tokenizer of type `kuromoji_tokenizer`.
|
A tokenizer of type `kuromoji_tokenizer`.
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;
|
||||||
|
import org.elasticsearch.common.inject.Inject;
|
||||||
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory {
|
||||||
|
|
||||||
|
private final boolean normalizeKanji;
|
||||||
|
private final boolean normalizeKana;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public KuromojiIterationMarkCharFilterFactory(Index index, @IndexSettings Settings indexSettings,
|
||||||
|
@Assisted String name, @Assisted Settings settings) {
|
||||||
|
super(index, indexSettings, name);
|
||||||
|
normalizeKanji = settings.getAsBoolean("normalize_kanji", JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
|
||||||
|
normalizeKana = settings.getAsBoolean("normalize_kana", JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Reader create(Reader reader) {
|
||||||
|
return new JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana);
|
||||||
|
}
|
||||||
|
}
|
|
@ -26,10 +26,7 @@ import org.apache.lucene.util.Version;
|
||||||
import org.elasticsearch.common.component.AbstractComponent;
|
import org.elasticsearch.common.component.AbstractComponent;
|
||||||
import org.elasticsearch.common.inject.Inject;
|
import org.elasticsearch.common.inject.Inject;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
|
import org.elasticsearch.index.analysis.*;
|
||||||
import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
|
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
|
||||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
|
@ -44,6 +41,21 @@ public class KuromojiIndicesAnalysis extends AbstractComponent {
|
||||||
IndicesAnalysisService indicesAnalysisService) {
|
IndicesAnalysisService indicesAnalysisService) {
|
||||||
super(settings);
|
super(settings);
|
||||||
|
|
||||||
|
indicesAnalysisService.charFilterFactories().put("kuromoji_iteration_mark",
|
||||||
|
new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return "kuromoji_iteration_mark";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Reader create(Reader reader) {
|
||||||
|
return new JapaneseIterationMarkCharFilter(reader,
|
||||||
|
JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT,
|
||||||
|
JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
|
||||||
indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
|
indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
|
||||||
new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
|
new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -48,6 +48,7 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onModule(AnalysisModule module) {
|
public void onModule(AnalysisModule module) {
|
||||||
|
module.addCharFilter("kuromoji_iteration_mark", KuromojiIterationMarkCharFilterFactory.class);
|
||||||
module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class);
|
module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class);
|
||||||
module.addTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class);
|
module.addTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class);
|
||||||
module.addTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class);
|
module.addTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class);
|
||||||
|
|
|
@ -41,9 +41,9 @@ import org.elasticsearch.test.ElasticsearchTestCase;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
import static org.hamcrest.MatcherAssert.assertThat;
|
|
||||||
import static org.hamcrest.Matchers.*;
|
import static org.hamcrest.Matchers.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -75,6 +75,9 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
||||||
analyzer = analysisService.analyzer("my_analyzer");
|
analyzer = analysisService.analyzer("my_analyzer");
|
||||||
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
|
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
|
||||||
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
|
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
|
||||||
|
|
||||||
|
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
|
||||||
|
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -130,6 +133,41 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
||||||
expected_tokens_katakana = new String[]{"明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"};
|
expected_tokens_katakana = new String[]{"明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"};
|
||||||
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
|
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
|
||||||
}
|
}
|
||||||
|
@Test
|
||||||
|
public void testIterationMarkCharFilter() throws IOException {
|
||||||
|
AnalysisService analysisService = createAnalysisService();
|
||||||
|
// test only kanji
|
||||||
|
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_im_only_kanji");
|
||||||
|
assertNotNull(charFilterFactory);
|
||||||
|
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||||
|
|
||||||
|
String source = "ところゞゝゝ、ジヾが、時々、馬鹿々々しい";
|
||||||
|
String expected = "ところゞゝゝ、ジヾが、時時、馬鹿馬鹿しい";
|
||||||
|
|
||||||
|
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
|
||||||
|
|
||||||
|
// test only kana
|
||||||
|
|
||||||
|
charFilterFactory = analysisService.charFilter("kuromoji_im_only_kana");
|
||||||
|
assertNotNull(charFilterFactory);
|
||||||
|
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||||
|
|
||||||
|
expected = "ところどころ、ジジが、時々、馬鹿々々しい";
|
||||||
|
|
||||||
|
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
|
||||||
|
|
||||||
|
// test default
|
||||||
|
|
||||||
|
charFilterFactory = analysisService.charFilter("kuromoji_im_default");
|
||||||
|
assertNotNull(charFilterFactory);
|
||||||
|
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||||
|
|
||||||
|
expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";
|
||||||
|
|
||||||
|
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public AnalysisService createAnalysisService() {
|
public AnalysisService createAnalysisService() {
|
||||||
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json").build();
|
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json").build();
|
||||||
|
@ -165,4 +203,20 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
||||||
}
|
}
|
||||||
assertThat("not all tokens produced", i, equalTo(expected.length));
|
assertThat("not all tokens produced", i, equalTo(expected.length));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void assertCharFilterEquals(Reader filtered,
|
||||||
|
String expected) throws IOException {
|
||||||
|
String actual = readFully(filtered);
|
||||||
|
assertThat(actual, equalTo(expected));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String readFully(Reader reader) throws IOException {
|
||||||
|
StringBuilder buffer = new StringBuilder();
|
||||||
|
int ch;
|
||||||
|
while((ch = reader.read()) != -1){
|
||||||
|
buffer.append((char)ch);
|
||||||
|
}
|
||||||
|
return buffer.toString();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,22 @@
|
||||||
|
|
||||||
},
|
},
|
||||||
|
|
||||||
|
"char_filter":{
|
||||||
|
"kuromoji_im_only_kanji":{
|
||||||
|
"type":"kuromoji_iteration_mark",
|
||||||
|
"normalize_kanji":true,
|
||||||
|
"normalize_kana":false
|
||||||
|
},
|
||||||
|
"kuromoji_im_only_kana":{
|
||||||
|
"type":"kuromoji_iteration_mark",
|
||||||
|
"normalize_kanji":false,
|
||||||
|
"normalize_kana":true
|
||||||
|
},
|
||||||
|
"kuromoji_im_default":{
|
||||||
|
"type":"kuromoji_iteration_mark"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
"tokenizer" : {
|
"tokenizer" : {
|
||||||
"kuromoji" : {
|
"kuromoji" : {
|
||||||
"type":"kuromoji_tokenizer"
|
"type":"kuromoji_tokenizer"
|
||||||
|
|
Loading…
Reference in New Issue