mirror of https://github.com/apache/lucene.git

LUCENE-1965: Lazy Atomic Loading Stopwords in SmartCN

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823285 13f79535-47bb-0310-9956-ffa450edef68

parent 84b2c6ecaa
commit 05b7822170
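This commit replaces per-instance stopword loading with the initialization-on-demand holder idiom: the JVM initializes the nested DefaultSetHolder class only when it is first referenced and runs its static initializer exactly once, so the default stop set is loaded lazily and atomically with no explicit synchronization. A minimal, generic sketch of the idiom (class and member names here are illustrative, not taken from the commit):

```java
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

public class LazyDefaults {

  // The nested class is not initialized until first reference, and the JVM
  // guarantees its static initializer runs exactly once, so the expensive
  // load is both lazy and thread-safe without any explicit locking.
  private static class Holder {
    static final Set<String> DEFAULT_SET = load();

    private static Set<String> load() {
      // Stand-in for reading stopwords.txt from the classpath.
      Set<String> words = new HashSet<String>();
      words.add("the");
      words.add("of");
      return Collections.unmodifiableSet(words);
    }
  }

  // The first call triggers Holder's initialization; later calls reuse the set.
  public static Set<String> getDefaultSet() {
    return Holder.DEFAULT_SET;
  }
}
```

The diff below applies exactly this shape inside SmartChineseAnalyzer.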
SmartChineseAnalyzer.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -59,6 +60,48 @@ import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
 public class SmartChineseAnalyzer extends Analyzer {
 
   private final Set stopWords;
 
+  private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  private static final String STOPWORD_FILE_COMMENT = "//";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
+   * class accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = SmartChineseAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+
   /**
    * Create a new SmartChineseAnalyzer, using the default stopword list.
@@ -79,18 +122,8 @@ public class SmartChineseAnalyzer extends Analyzer {
    * @param useDefaultStopWords true to use the default stopword list.
    */
   public SmartChineseAnalyzer(boolean useDefaultStopWords) {
-    if (useDefaultStopWords) {
-      try {
-        InputStream stream = this.getClass().getResourceAsStream("stopwords.txt");
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        stopWords = WordlistLoader.getWordSet(reader, "//");
-      } catch (IOException e) {
-        // TODO: throw IOException
-        throw new RuntimeException(e);
-      }
-    }else{
-      stopWords = null;
-    }
+    stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
+        : Collections.EMPTY_SET;
   }
 
   /**
@@ -103,7 +136,7 @@ public class SmartChineseAnalyzer extends Analyzer {
    * @param stopWords {@link Set} of stopwords to use.
    */
   public SmartChineseAnalyzer(Set stopWords) {
-    this.stopWords = stopWords;
+    this.stopWords = stopWords == null ? Collections.EMPTY_SET : stopWords;
   }
 
   public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -113,8 +146,8 @@ public class SmartChineseAnalyzer extends Analyzer {
     // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
     // The porter stemming is too strict, this is not a bug, this is a feature:)
     result = new PorterStemFilter(result);
-    if (stopWords != null) {
-      result = new StopFilter(result, stopWords, false);
+    if (!stopWords.isEmpty()) {
+      result = new StopFilter(false, result, stopWords, false);
     }
     return result;
   }
@@ -133,8 +166,8 @@ public class SmartChineseAnalyzer extends Analyzer {
       streams.tokenStream = new SentenceTokenizer(reader);
       streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
       streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
-      if (stopWords != null) {
-        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false);
+      if (!stopWords.isEmpty()) {
+        streams.filteredTokenStream = new StopFilter(false, streams.filteredTokenStream, stopWords, false);
       }
     } else {
       streams.tokenStream.reset(reader);
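With these changes no constructor path leaves stopWords null: the boolean constructor picks either the shared default set or Collections.EMPTY_SET, the Set constructor maps null to the empty set, and tokenStream() can therefore test isEmpty() instead of null-checking. The new leading boolean handed to StopFilter selects the Lucene 2.9 constructor whose first argument is the enablePositionIncrements flag, passed as false here to keep the previous behavior. A usage sketch (the class name is illustrative, and the analyzer's package at this revision is assumed to be org.apache.lucene.analysis.cn, as its import of org.apache.lucene.analysis.cn.smart.WordTokenFilter suggests):

```java
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.SmartChineseAnalyzer; // assumed package for this revision

public class StopwordModes {
  public static void main(String[] args) {
    // Shares the lazily loaded, unmodifiable default stop set.
    Analyzer withDefaults = new SmartChineseAnalyzer(true);

    // Both analyzers below now hold an empty stop set rather than null,
    // so the StopFilter stage is simply skipped.
    Analyzer noStopwords = new SmartChineseAnalyzer(false);
    Analyzer nullStopwords = new SmartChineseAnalyzer((Set) null);

    // The default set is also exposed directly to callers.
    Set<String> defaults = SmartChineseAnalyzer.getDefaultStopSet();
    System.out.println(defaults.size() + " default stopwords");
  }
}
```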
TestSmartChineseAnalyzer.java

@@ -25,6 +25,7 @@ import java.util.Date;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 
 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
 
@@ -33,6 +34,9 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装" };
     assertAnalyzesTo(ca, sentence, result);
+    // set stop-words from the outer world - must yield same behavior
+    ca = new SmartChineseAnalyzer(SmartChineseAnalyzer.getDefaultStopSet());
+    assertAnalyzesTo(ca, sentence, result);
   }
 
   /*
@@ -63,11 +67,16 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
    * if you don't supply (true) to the constructor, or use a different stopwords list,
    * then punctuation is indexed.
    */
   public void testChineseStopWordsOff() throws Exception {
-    Analyzer ca = new SmartChineseAnalyzer(false); /* doesnt load stopwords */
+    Analyzer[] analyzers = new Analyzer[] {
+        new SmartChineseAnalyzer(false), /* doesn't load stopwords */
+        new SmartChineseAnalyzer(null)   /* sets stopwords to empty set */};
     String sentence = "我购买了道具和服装。";
     String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
-    assertAnalyzesTo(ca, sentence, result);
+    for (Analyzer analyzer : analyzers) {
+      assertAnalyzesTo(analyzer, sentence, result);
+      assertAnalyzesToReuse(analyzer, sentence, result);
+    }
   }
 
   public void testChineseAnalyzer() throws Exception {
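The updated test feeds the exposed default set back in from "the outer world" and expects identical behavior. Since getDefaultStopSet() returns an unmodifiable set, a caller who wants to extend the defaults has to copy them first; a hypothetical sketch under the same package assumption as above (the extra stopword and class name are illustrative):

```java
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.SmartChineseAnalyzer; // assumed package for this revision

public class CustomStopwords {
  public static void main(String[] args) {
    // Copy the shared, unmodifiable defaults before adding custom entries;
    // adding to the returned set directly would throw UnsupportedOperationException.
    Set<String> custom = new HashSet<String>(SmartChineseAnalyzer.getDefaultStopSet());
    custom.add("哈哈"); // hypothetical extra stopword
    Analyzer analyzer = new SmartChineseAnalyzer(custom);
  }
}
```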