mirror of https://github.com/apache/lucene.git
LUCENE-1965: Lazy Atomic Loading Stopwords in SmartCN
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823285 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
84b2c6ecaa
commit
05b7822170
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -59,6 +60,48 @@ import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
|
|||
public class SmartChineseAnalyzer extends Analyzer {
|
||||
|
||||
private final Set stopWords;
|
||||
|
||||
private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
private static final String STOPWORD_FILE_COMMENT = "//";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<String> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<String> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadDefaultStopWordSet();
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
|
||||
static Set<String> loadDefaultStopWordSet() throws IOException {
|
||||
InputStream stream = SmartChineseAnalyzer.class
|
||||
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||
try {
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new SmartChineseAnalyzer, using the default stopword list.
|
||||
|
@ -79,18 +122,8 @@ public class SmartChineseAnalyzer extends Analyzer {
|
|||
* @param useDefaultStopWords true to use the default stopword list.
|
||||
*/
|
||||
public SmartChineseAnalyzer(boolean useDefaultStopWords) {
|
||||
if (useDefaultStopWords) {
|
||||
try {
|
||||
InputStream stream = this.getClass().getResourceAsStream("stopwords.txt");
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
stopWords = WordlistLoader.getWordSet(reader, "//");
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}else{
|
||||
stopWords = null;
|
||||
}
|
||||
stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
|
||||
: Collections.EMPTY_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -103,7 +136,7 @@ public class SmartChineseAnalyzer extends Analyzer {
|
|||
* @param stopWords {@link Set} of stopwords to use.
|
||||
*/
|
||||
public SmartChineseAnalyzer(Set stopWords) {
|
||||
this.stopWords = stopWords;
|
||||
this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords;
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
|
@ -113,8 +146,8 @@ public class SmartChineseAnalyzer extends Analyzer {
|
|||
// LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
|
||||
// The porter stemming is too strict, this is not a bug, this is a feature:)
|
||||
result = new PorterStemFilter(result);
|
||||
if (stopWords != null) {
|
||||
result = new StopFilter(result, stopWords, false);
|
||||
if (!stopWords.isEmpty()) {
|
||||
result = new StopFilter(false,result, stopWords, false);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -133,8 +166,8 @@ public class SmartChineseAnalyzer extends Analyzer {
|
|||
streams.tokenStream = new SentenceTokenizer(reader);
|
||||
streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
|
||||
streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
|
||||
if (stopWords != null) {
|
||||
streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false);
|
||||
if (!stopWords.isEmpty()) {
|
||||
streams.filteredTokenStream = new StopFilter(false, streams.filteredTokenStream, stopWords, false);
|
||||
}
|
||||
} else {
|
||||
streams.tokenStream.reset(reader);
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.util.Date;
|
|||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -33,6 +34,9 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
String sentence = "我购买了道具和服装。";
|
||||
String result[] = { "我", "购买", "了", "道具", "和", "服装" };
|
||||
assertAnalyzesTo(ca, sentence, result);
|
||||
// set stop-words from the outer world - must yield same behavior
|
||||
ca = new SmartChineseAnalyzer(SmartChineseAnalyzer.getDefaultStopSet());
|
||||
assertAnalyzesTo(ca, sentence, result);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -63,11 +67,16 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
* if you don't supply (true) to the constructor, or use a different stopwords list,
|
||||
* then punctuation is indexed.
|
||||
*/
|
||||
public void testChineseStopWordsOff() throws Exception {
|
||||
Analyzer ca = new SmartChineseAnalyzer(false); /* doesnt load stopwords */
|
||||
public void testChineseStopWordsOff() throws Exception {
|
||||
Analyzer[] analyzers = new Analyzer[] {
|
||||
new SmartChineseAnalyzer(false),/* doesn't load stopwords */
|
||||
new SmartChineseAnalyzer(null) /* sets stopwords to empty set */};
|
||||
String sentence = "我购买了道具和服装。";
|
||||
String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
|
||||
assertAnalyzesTo(ca, sentence, result);
|
||||
for (Analyzer analyzer : analyzers) {
|
||||
assertAnalyzesTo(analyzer, sentence, result);
|
||||
assertAnalyzesToReuse(analyzer, sentence, result);
|
||||
}
|
||||
}
|
||||
|
||||
public void testChineseAnalyzer() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue