mirror of https://github.com/apache/lucene.git
LUCENE-3751: align default Japanese configurations for Lucene/Solr
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242543 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
343fead2bc
commit
509f4c557d
|
@ -63,7 +63,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#");
|
DEFAULT_STOP_SET = loadStopwordSet(true, KuromojiAnalyzer.class, "stopwords.txt", "#"); // ignore case
|
||||||
final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
|
final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
|
||||||
DEFAULT_STOP_TAGS = new HashSet<String>();
|
DEFAULT_STOP_TAGS = new HashSet<String>();
|
||||||
for (Object element : tagset) {
|
for (Object element : tagset) {
|
||||||
|
@ -71,9 +71,8 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
||||||
DEFAULT_STOP_TAGS.add(new String(chars));
|
DEFAULT_STOP_TAGS.add(new String(chars));
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
// default set should always be present as it is part of the
|
// default set should always be present as it is part of the distribution (JAR)
|
||||||
// distribution (JAR)
|
throw new RuntimeException("Unable to load default stopword or stoptag set");
|
||||||
throw new RuntimeException("Unable to load default stopword set");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -81,11 +80,11 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
|
Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
|
||||||
TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
|
TokenStream stream = new KuromojiBaseFormFilter(tokenizer);
|
||||||
stream = new CJKWidthFilter(stream);
|
|
||||||
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
|
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
|
||||||
|
stream = new CJKWidthFilter(stream);
|
||||||
stream = new StopFilter(matchVersion, stream, stopwords);
|
stream = new StopFilter(matchVersion, stream, stopwords);
|
||||||
stream = new KuromojiBaseFormFilter(stream);
|
stream = new LowerCaseFilter(matchVersion, stream);
|
||||||
return new TokenStreamComponents(tokenizer, stream);
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes tokens that match a set of POS tags.
|
* Removes tokens that match a set of part-of-speech tags.
|
||||||
*/
|
*/
|
||||||
public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
|
public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
|
||||||
private final Set<String> stopTags;
|
private final Set<String> stopTags;
|
||||||
|
|
Loading…
Reference in New Issue