LUCENE-2564: Cut over WordlistLoader to CharArrayMap/Set and use CharsetDecoder to detect encoding problems early

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2011-11-10 01:52:48 +00:00
parent dc6b4b6533
commit c0a7abbec0
1 changed file with 6 additions and 10 deletions

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter; import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
/** /**
@ -89,16 +90,11 @@ public final class SmartChineseAnalyzer extends Analyzer {
} }
static CharArraySet loadDefaultStopWordSet() throws IOException { static CharArraySet loadDefaultStopWordSet() throws IOException {
InputStream stream = SmartChineseAnalyzer.class // make sure it is unmodifiable as we expose it in the outer class
.getResourceAsStream(DEFAULT_STOPWORD_FILE); return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils
try { .getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE,
InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); IOUtils.CHARSET_UTF_8), STOPWORD_FILE_COMMENT,
// make sure it is unmodifiable as we expose it in the outer class Version.LUCENE_CURRENT));
return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader,
STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT));
} finally {
stream.close();
}
} }
} }