LUCENE-3751: align default Japanese configurations for Lucene/Solr

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242543 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-09 21:45:41 +00:00
parent 343fead2bc
commit 509f4c557d
2 changed files with 7 additions and 8 deletions

View File

@ -63,7 +63,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
static { static {
try { try {
DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#"); DEFAULT_STOP_SET = loadStopwordSet(true, KuromojiAnalyzer.class, "stopwords.txt", "#"); // ignore case
final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#"); final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
DEFAULT_STOP_TAGS = new HashSet<String>(); DEFAULT_STOP_TAGS = new HashSet<String>();
for (Object element : tagset) { for (Object element : tagset) {
@ -71,9 +71,8 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
DEFAULT_STOP_TAGS.add(new String(chars)); DEFAULT_STOP_TAGS.add(new String(chars));
} }
} catch (IOException ex) { } catch (IOException ex) {
// default set should always be present as it is part of the // default set should always be present as it is part of the distribution (JAR)
// distribution (JAR) throw new RuntimeException("Unable to load default stopword or stoptag set");
throw new RuntimeException("Unable to load default stopword set");
} }
} }
} }
@ -81,11 +80,11 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
@Override @Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader); Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer); TokenStream stream = new KuromojiBaseFormFilter(tokenizer);
stream = new CJKWidthFilter(stream);
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags); stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
stream = new CJKWidthFilter(stream);
stream = new StopFilter(matchVersion, stream, stopwords); stream = new StopFilter(matchVersion, stream, stopwords);
stream = new KuromojiBaseFormFilter(stream); stream = new LowerCaseFilter(matchVersion, stream);
return new TokenStreamComponents(tokenizer, stream); return new TokenStreamComponents(tokenizer, stream);
} }
} }

View File

@ -25,7 +25,7 @@ import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
/** /**
* Removes tokens that match a set of POS tags. * Removes tokens that match a set of part-of-speech tags.
*/ */
public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter { public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
private final Set<String> stopTags; private final Set<String> stopTags;