mirror of https://github.com/apache/lucene.git

LUCENE-1965: Lazy Atomic Loading Stopwords in SmartCN

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823285 13f79535-47bb-0310-9956-ffa450edef68

parent 84b2c6ecaa
commit 05b7822170
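This commit replaces per-instance stopword loading with the initialization-on-demand holder idiom: the JVM initializes the nested DefaultSetHolder class only when it is first referenced and runs its static initializer exactly once, so the default stop set is loaded lazily and atomically with no explicit synchronization. A minimal, generic sketch of the idiom (class and member names here are illustrative, not taken from the commit):

```java
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

public class LazyDefaults {

  // The nested class is not initialized until first reference, and the JVM
  // guarantees its static initializer runs exactly once, so the expensive
  // load is both lazy and thread-safe without any explicit locking.
  private static class Holder {
    static final Set<String> DEFAULT_SET = load();

    private static Set<String> load() {
      // Stand-in for reading stopwords.txt from the classpath.
      Set<String> words = new HashSet<String>();
      words.add("the");
      words.add("of");
      return Collections.unmodifiableSet(words);
    }
  }

  // The first call triggers Holder's initialization; later calls reuse the set.
  public static Set<String> getDefaultSet() {
    return Holder.DEFAULT_SET;
  }
}
```

The diff below applies exactly this shape inside SmartChineseAnalyzer.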
SmartChineseAnalyzer.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -59,6 +60,48 @@ import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
 public class SmartChineseAnalyzer extends Analyzer {
 
   private final Set stopWords;
 
+  private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  private static final String STOPWORD_FILE_COMMENT = "//";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
+   * class accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = SmartChineseAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+
   /**
    * Create a new SmartChineseAnalyzer, using the default stopword list.
@@ -79,18 +122,8 @@ public class SmartChineseAnalyzer extends Analyzer {
    * @param useDefaultStopWords true to use the default stopword list.
    */
   public SmartChineseAnalyzer(boolean useDefaultStopWords) {
-    if (useDefaultStopWords) {
-      try {
-        InputStream stream = this.getClass().getResourceAsStream("stopwords.txt");
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        stopWords = WordlistLoader.getWordSet(reader, "//");
-      } catch (IOException e) {
-        // TODO: throw IOException
-        throw new RuntimeException(e);
-      }
-    }else{
-      stopWords = null;
-    }
+    stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
+        : Collections.EMPTY_SET;
   }
 
   /**
@@ -103,7 +136,7 @@ public class SmartChineseAnalyzer extends Analyzer {
    * @param stopWords {@link Set} of stopwords to use.
    */
   public SmartChineseAnalyzer(Set stopWords) {
-    this.stopWords = stopWords;
+    this.stopWords = stopWords == null ? Collections.EMPTY_SET : stopWords;
   }
 
   public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -113,8 +146,8 @@ public class SmartChineseAnalyzer extends Analyzer {
     // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
     // The porter stemming is too strict, this is not a bug, this is a feature:)
     result = new PorterStemFilter(result);
-    if (stopWords != null) {
-      result = new StopFilter(result, stopWords, false);
+    if (!stopWords.isEmpty()) {
+      result = new StopFilter(false, result, stopWords, false);
     }
     return result;
   }
@@ -133,8 +166,8 @@ public class SmartChineseAnalyzer extends Analyzer {
       streams.tokenStream = new SentenceTokenizer(reader);
       streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
       streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
-      if (stopWords != null) {
-        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false);
+      if (!stopWords.isEmpty()) {
+        streams.filteredTokenStream = new StopFilter(false, streams.filteredTokenStream, stopWords, false);
       }
     } else {
       streams.tokenStream.reset(reader);
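With these changes no constructor path leaves stopWords null: the boolean constructor picks either the shared default set or Collections.EMPTY_SET, the Set constructor maps null to the empty set, and tokenStream() can therefore test isEmpty() instead of null-checking. The new leading boolean handed to StopFilter selects the Lucene 2.9 constructor whose first argument is the enablePositionIncrements flag, passed as false here to keep the previous behavior. A usage sketch (the class name is illustrative, and the analyzer's package at this revision is assumed to be org.apache.lucene.analysis.cn, as its import of org.apache.lucene.analysis.cn.smart.WordTokenFilter suggests):

```java
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.SmartChineseAnalyzer; // assumed package for this revision

public class StopwordModes {
  public static void main(String[] args) {
    // Shares the lazily loaded, unmodifiable default stop set.
    Analyzer withDefaults = new SmartChineseAnalyzer(true);

    // Both analyzers below now hold an empty stop set rather than null,
    // so the StopFilter stage is simply skipped.
    Analyzer noStopwords = new SmartChineseAnalyzer(false);
    Analyzer nullStopwords = new SmartChineseAnalyzer((Set) null);

    // The default set is also exposed directly to callers.
    Set<String> defaults = SmartChineseAnalyzer.getDefaultStopSet();
    System.out.println(defaults.size() + " default stopwords");
  }
}
```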
TestSmartChineseAnalyzer.java

@@ -25,6 +25,7 @@ import java.util.Date;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 
 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
 
@@ -33,6 +34,9 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
+    // set stop-words from the outer world - must yield same behavior
+    ca = new SmartChineseAnalyzer(SmartChineseAnalyzer.getDefaultStopSet());
+    assertAnalyzesTo(ca, sentence, result);
   }
 
   /*
@@ -63,11 +67,16 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
    * if you don't supply (true) to the constructor, or use a different stopwords list,
    * then punctuation is indexed.
    */
   public void testChineseStopWordsOff() throws Exception {
-    Analyzer ca = new SmartChineseAnalyzer(false); /* doesnt load stopwords */
+    Analyzer[] analyzers = new Analyzer[] {
+        new SmartChineseAnalyzer(false), /* doesn't load stopwords */
+        new SmartChineseAnalyzer(null)   /* sets stopwords to empty set */};
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
-    assertAnalyzesTo(ca, sentence, result);
+    for (Analyzer analyzer : analyzers) {
+      assertAnalyzesTo(analyzer, sentence, result);
+      assertAnalyzesToReuse(analyzer, sentence, result);
+    }
   }
 
   public void testChineseAnalyzer() throws Exception {
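The updated test feeds the exposed default set back in from "the outer world" and expects identical behavior. Since getDefaultStopSet() returns an unmodifiable set, a caller who wants to extend the defaults has to copy them first; a hypothetical sketch under the same package assumption as above (the extra stopword and class name are illustrative):

```java
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.SmartChineseAnalyzer; // assumed package for this revision

public class CustomStopwords {
  public static void main(String[] args) {
    // Copy the shared, unmodifiable defaults before adding custom entries;
    // adding to the returned set directly would throw UnsupportedOperationException.
    Set<String> custom = new HashSet<String>(SmartChineseAnalyzer.getDefaultStopSet());
    custom.add("哈哈"); // hypothetical extra stopword
    Analyzer analyzer = new SmartChineseAnalyzer(custom);
  }
}
```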