mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-17 10:25:15 +00:00
Add "ja_stop" filter
* can use a predefined "_japanese_" stop words * can not use other predefined stop words * upgrade to lucene 5 * add ja_stop to README Closes #45
This commit is contained in:
parent
d0f629b0f5
commit
0a0d6fd644
45
README.md
45
README.md
@ -45,6 +45,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
|
||||
| kuromoji_part_of_speech | tokenfilter |
|
||||
| kuromoji_readingform | tokenfilter |
|
||||
| kuromoji_stemmer | tokenfilter |
|
||||
| ja_stop | tokenfilter |
|
||||
|
||||
|
||||
Usage
|
||||
@ -475,6 +476,50 @@ _Response :_
|
||||
```
|
||||
|
||||
|
||||
## TokenFilter : ja_stop
|
||||
|
||||
|
||||
A token filter of type `ja_stop` that provides the predefined "_japanese_" stop words.
|
||||
*Note: It only provides "_japanese_". If you want to use other predefined stop words, use the `stop` token filter instead.*
|
||||
|
||||
### example
|
||||
|
||||
```sh
|
||||
curl -XPUT 'http://localhost:9200/kuromoji_sample/' -d'
|
||||
{
|
||||
"settings": {
|
||||
"index":{
|
||||
"analysis":{
|
||||
"analyzer" : {
|
||||
"analyzer_with_ja_stop" : {
|
||||
"tokenizer" : "kuromoji_tokenizer",
|
||||
"filter" : ["ja_stop"]
|
||||
}
|
||||
},
|
||||
"filter" : {
|
||||
"ja_stop" : {
|
||||
"type" : "ja_stop",
|
||||
"stopwords" : ["_japanese_", "ストップ"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
'
|
||||
|
||||
curl -XPOST 'http://localhost:9200/kuromoji_sample/_analyze?analyzer=analyzer_with_ja_stop&pretty' -d 'ストップは消える'
|
||||
{
|
||||
"tokens" : [ {
|
||||
"token" : "消える",
|
||||
"start_offset" : 5,
|
||||
"end_offset" : 8,
|
||||
"type" : "word",
|
||||
"position" : 3
|
||||
} ]
|
||||
}
|
||||
```
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
|
@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
|
||||
import org.elasticsearch.common.collect.ImmutableMap;
|
||||
import org.elasticsearch.common.collect.MapBuilder;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
public class JapaneseStopTokenFilterFactory extends AbstractTokenFilterFactory{
|
||||
|
||||
|
||||
private final CharArraySet stopWords;
|
||||
|
||||
private final boolean ignoreCase;
|
||||
|
||||
private final boolean removeTrailing;
|
||||
|
||||
@Inject
|
||||
public JapaneseStopTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name, settings);
|
||||
this.ignoreCase = settings.getAsBoolean("ignore_case", false);
|
||||
this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
|
||||
ImmutableMap<String, Set<?>> namedStopWords = MapBuilder.<String, Set<?>>newMapBuilder()
|
||||
.put("_japanese_", JapaneseAnalyzer.getDefaultStopSet())
|
||||
.immutableMap();
|
||||
this.stopWords = Analysis.parseWords(env, settings, "stopwords", JapaneseAnalyzer.getDefaultStopSet(), namedStopWords, ignoreCase);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
if (removeTrailing) {
|
||||
return new StopFilter(tokenStream, stopWords);
|
||||
} else {
|
||||
return new SuggestStopFilter(tokenStream, stopWords);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<?> stopWords() {
|
||||
return stopWords;
|
||||
}
|
||||
|
||||
public boolean ignoreCase() {
|
||||
return ignoreCase;
|
||||
}
|
||||
|
||||
}
|
@ -55,5 +55,6 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {
|
||||
module.addTokenFilter("kuromoji_part_of_speech", KuromojiPartOfSpeechFilterFactory.class);
|
||||
module.addTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory.class);
|
||||
module.addTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory.class);
|
||||
module.addTokenFilter("ja_stop", JapaneseStopTokenFilterFactory.class);
|
||||
}
|
||||
}
|
||||
|
@ -71,6 +71,9 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
||||
filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
|
||||
assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));
|
||||
|
||||
filterFactory = analysisService.tokenFilter("ja_stop");
|
||||
assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
|
||||
|
||||
NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
|
||||
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
|
||||
|
||||
@ -80,6 +83,7 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
||||
|
||||
CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
|
||||
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -172,10 +176,21 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
|
||||
expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";
|
||||
|
||||
assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJapaneseStopFilterFactory() throws IOException {
|
||||
AnalysisService analysisService = createAnalysisService();
|
||||
TokenFilterFactory tokenFilter = analysisService.tokenFilter("ja_stop");
|
||||
assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
|
||||
String source = "私は制限スピードを超える。";
|
||||
String[] expected = new String[]{"私", "制限", "超える"};
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
|
||||
|
||||
public AnalysisService createAnalysisService() {
|
||||
Settings settings = ImmutableSettings.settingsBuilder()
|
||||
.loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json")
|
||||
|
@ -13,9 +13,12 @@
|
||||
"kuromoji_ks" : {
|
||||
"type": "kuromoji_stemmer",
|
||||
"minimum_length" : 6
|
||||
},
|
||||
"ja_stop" : {
|
||||
"type": "ja_stop",
|
||||
"stopwords": ["_japanese_", "スピード"]
|
||||
}
|
||||
|
||||
|
||||
},
|
||||
|
||||
"char_filter":{
|
||||
|
Loading…
x
Reference in New Issue
Block a user