mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 05:19:17 +00:00
LUCENE-2564: Cut over WordlistLoader to CharArrayMap/Set and use CharsetDecoder to detect encoding problems early
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200091 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dc6b4b6533
commit
c0a7abbec0
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
|
||||
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
@@ -89,16 +90,11 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
||||
}
|
||||
|
||||
static CharArraySet loadDefaultStopWordSet() throws IOException {
|
||||
InputStream stream = SmartChineseAnalyzer.class
|
||||
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||
try {
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader,
|
||||
STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT));
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(IOUtils
|
||||
.getDecodingReader(SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE,
|
||||
IOUtils.CHARSET_UTF_8), STOPWORD_FILE_COMMENT,
|
||||
Version.LUCENE_CURRENT));
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user