Add "ja_stop" filter

* can use a predefined "_japanese_" stop words
 * can not use other predefined stop words
 * upgrade to lucene 5
 * add ja_stop to README

  Closes #45
This commit is contained in:
Jun Ohtani 2014-10-21 18:07:00 +09:00
parent d0f629b0f5
commit 0a0d6fd644
5 changed files with 143 additions and 3 deletions

View File

@ -45,6 +45,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
| kuromoji_part_of_speech | tokenfilter |
| kuromoji_readingform | tokenfilter |
| kuromoji_stemmer | tokenfilter |
| ja_stop | tokenfilter |
Usage
@ -475,6 +476,50 @@ _Response :_
```
## TokenFilter : ja_stop
A token filter of type `ja_stop` that provides the predefined "_japanese_" stop words.
*Note: It only provides "_japanese_". If you want to use other predefined stop words, use the `stop` token filter instead.*
### example
```sh
curl -XPUT 'http://localhost:9200/kuromoji_sample/' -d'
{
"settings": {
"index":{
"analysis":{
"analyzer" : {
"analyzer_with_ja_stop" : {
"tokenizer" : "kuromoji_tokenizer",
"filter" : ["ja_stop"]
}
},
"filter" : {
"ja_stop" : {
"type" : "ja_stop",
"stopwords" : ["_japanese_", "ストップ"]
}
}
}
}
}
}
'
curl -XPOST 'http://localhost:9200/kuromoji_sample/_analyze?analyzer=analyzer_with_ja_stop&pretty' -d 'ストップは消える'
{
"tokens" : [ {
"token" : "消える",
"start_offset" : 5,
"end_offset" : 8,
"type" : "word",
"position" : 3
} ]
}
```
License
-------

View File

@ -0,0 +1,76 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
import org.elasticsearch.common.collect.ImmutableMap;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Set;
/**
 * Token filter factory registered for the {@code ja_stop} filter type.
 *
 * <p>Settings read by this factory:
 * <ul>
 *   <li>{@code stopwords} - the stop word configuration, resolved through
 *       {@code Analysis.parseWords}; the only named set made available here is
 *       {@code _japanese_}, which maps to {@link JapaneseAnalyzer#getDefaultStopSet()}.</li>
 *   <li>{@code ignore_case} - defaults to {@code false}.</li>
 *   <li>{@code remove_trailing} - defaults to {@code true}.</li>
 * </ul>
 */
public class JapaneseStopTokenFilterFactory extends AbstractTokenFilterFactory {

    /** Name under which the default Japanese stop set can be referenced in settings. */
    private static final String JAPANESE_STOP_SET_NAME = "_japanese_";

    private final CharArraySet stopWords;

    private final boolean ignoreCase;

    private final boolean removeTrailing;

    /**
     * Reads the filter settings and resolves the configured stop words.
     *
     * @param index         index this filter belongs to
     * @param indexSettings index-level settings
     * @param env           environment handed to {@code Analysis.parseWords}
     * @param name          name of this filter instance
     * @param settings      settings of this filter instance
     */
    @Inject
    public JapaneseStopTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);

        // The same default set serves both as the parseWords default argument
        // and as the value behind the "_japanese_" name.
        final CharArraySet japaneseDefaults = JapaneseAnalyzer.getDefaultStopSet();
        this.stopWords = Analysis.parseWords(
                env, settings, "stopwords", japaneseDefaults, namedSets(japaneseDefaults), ignoreCase);
    }

    /** Builds the lookup of named stop sets; only {@code _japanese_} is offered. */
    private static ImmutableMap<String, Set<?>> namedSets(Set<?> japaneseDefaults) {
        MapBuilder<String, Set<?>> builder = MapBuilder.<String, Set<?>>newMapBuilder();
        builder.put(JAPANESE_STOP_SET_NAME, japaneseDefaults);
        return builder.immutableMap();
    }

    /**
     * Wraps the given stream in a stop filter.
     *
     * @param tokenStream stream to filter
     * @return a {@link StopFilter} when {@code remove_trailing} is enabled,
     *         otherwise a {@link SuggestStopFilter}
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (!removeTrailing) {
            return new SuggestStopFilter(tokenStream, stopWords);
        }
        return new StopFilter(tokenStream, stopWords);
    }

    /** @return the resolved stop word set used by the filters this factory creates */
    public Set<?> stopWords() {
        return stopWords;
    }

    /** @return the value of the {@code ignore_case} setting */
    public boolean ignoreCase() {
        return ignoreCase;
    }
}

View File

@ -55,5 +55,6 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {
module.addTokenFilter("kuromoji_part_of_speech", KuromojiPartOfSpeechFilterFactory.class);
module.addTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory.class);
module.addTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory.class);
module.addTokenFilter("ja_stop", JapaneseStopTokenFilterFactory.class);
}
}

View File

@ -71,6 +71,9 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));
filterFactory = analysisService.tokenFilter("ja_stop");
assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
@ -80,6 +83,7 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
@Test
@ -172,10 +176,21 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
}
/**
 * Checks that the {@code ja_stop} filter from the test configuration is a
 * {@link JapaneseStopTokenFilterFactory} and that it drops stop words from a
 * tokenized Japanese sentence while keeping the remaining tokens in order.
 */
@Test
public void testJapaneseStopFilterFactory() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("ja_stop");
    assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));

    String source = "私は制限スピードを超える。";
    // The sentence tokenizes to 私 / は / 制限 / スピード / を / 超える.
    // は and を belong to the default Japanese stop set and スピード is the extra
    // stop word configured for "ja_stop" in the test settings, so 私, 制限 and
    // 超える must survive. The first expected token is 私, not an empty string:
    // the filter never emits empty terms.
    String[] expected = new String[]{"私", "制限", "超える"};

    // No user dictionary, punctuation discarded, search mode.
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
public AnalysisService createAnalysisService() {
Settings settings = ImmutableSettings.settingsBuilder()
.loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json")

View File

@ -13,9 +13,12 @@
"kuromoji_ks" : {
"type": "kuromoji_stemmer",
"minimum_length" : 6
},
"ja_stop" : {
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
}
},
"char_filter":{