From 509f4c557d6177b0a84a5f221927c385e3afe376 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 9 Feb 2012 21:45:41 +0000 Subject: [PATCH] LUCENE-3751: align default japanese configurations for lucene/solr git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242543 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/kuromoji/KuromojiAnalyzer.java | 13 ++++++------- .../kuromoji/KuromojiPartOfSpeechStopFilter.java | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java index 324136d04f1..41763be5dc3 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java @@ -63,7 +63,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#"); + DEFAULT_STOP_SET = loadStopwordSet(true, KuromojiAnalyzer.class, "stopwords.txt", "#"); // ignore case final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#"); DEFAULT_STOP_TAGS = new HashSet(); for (Object element : tagset) { @@ -71,9 +71,8 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase { DEFAULT_STOP_TAGS.add(new String(chars)); } } catch (IOException ex) { - // default set should always be present as it is part of the - // distribution (JAR) - throw new RuntimeException("Unable to load default stopword set"); + // default set should always be present as it is part of the distribution (JAR) + throw new RuntimeException("Unable to load default stopword or stoptag set"); } } } @@ -81,11 +80,11 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader); - TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer); - stream = new CJKWidthFilter(stream); + TokenStream stream = new KuromojiBaseFormFilter(tokenizer); stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags); + stream = new CJKWidthFilter(stream); stream = new StopFilter(matchVersion, stream, stopwords); - stream = new KuromojiBaseFormFilter(stream); + stream = new LowerCaseFilter(matchVersion, stream); return new TokenStreamComponents(tokenizer, stream); } } diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java index fe30891f751..fdd8c163565 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java @@ -25,7 +25,7 @@ import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; /** - * Removes tokens that match a set of POS tags. + * Removes tokens that match a set of part-of-speech tags. */ public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter { private final Set stopTags;