LUCENE-2564: Cut over WordlistLoader to CharArrayMap/CharArraySet and use CharsetDecoder to detect encoding problems early

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2011-11-10 01:52:48 +00:00
parent dc6b4b6533
commit c0a7abbec0
1 changed file with 6 additions and 10 deletions

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@ -89,16 +90,11 @@ public final class SmartChineseAnalyzer extends Analyzer {
}
/**
 * Loads the default stop word set from the {@code DEFAULT_STOPWORD_FILE}
 * resource, resolved relative to {@link SmartChineseAnalyzer}.
 *
 * <p>The resource is read through {@code IOUtils.getDecodingReader} with
 * {@code IOUtils.CHARSET_UTF_8} instead of a hand-built
 * {@code InputStreamReader}. Per the change description, this is intended
 * to surface encoding problems early rather than silently substituting
 * characters.
 *
 * <p>NOTE(review): this method no longer closes the underlying stream
 * itself; it presumably relies on {@code WordlistLoader.getWordSet} to
 * close the reader it is handed. Confirm against WordlistLoader.
 *
 * @return an unmodifiable set of the stop words read from the resource
 * @throws IOException if the resource cannot be read
 */
static CharArraySet loadDefaultStopWordSet() throws IOException {
  // make sure it is unmodifiable as we expose it in the outer class
  return CharArraySet.unmodifiableSet(
      WordlistLoader.getWordSet(
          IOUtils.getDecodingReader(
              SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8),
          STOPWORD_FILE_COMMENT,
          Version.LUCENE_CURRENT));
}
}