LUCENE-1965: Lazy Atomic Loading Stopwords in SmartCN

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823285 13f79535-47bb-0310-9956-ffa450edef68
2009-10-08 19:21:36 +00:00 · 2009-10-08 19:21:36 +00:00 · 05b7822170
parent 84b2c6ecaa
commit 05b7822170
2 changed files with 62 additions and 20 deletions
--- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
+++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
@ -59,6 +60,48 @@ import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
 public class SmartChineseAnalyzer extends Analyzer {

  private final Set stopWords;
+  
+  private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  
+  private static final String STOPWORD_FILE_COMMENT = "//";
+  
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+   * accesses the static final set the first time.;
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = SmartChineseAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }

  /**
   * Create a new SmartChineseAnalyzer, using the default stopword list.
@ -79,18 +122,8 @@ public class SmartChineseAnalyzer extends Analyzer {
   * @param useDefaultStopWords true to use the default stopword list.
   */
  public SmartChineseAnalyzer(boolean useDefaultStopWords) {
-    if (useDefaultStopWords) {
-      try {
-      InputStream stream = this.getClass().getResourceAsStream("stopwords.txt");
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stopWords = WordlistLoader.getWordSet(reader, "//");
-      } catch (IOException e) {
-        // TODO: throw IOException
-        throw new RuntimeException(e);
-      }
-    }else{
-      stopWords = null;
-    }
+    stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
+        : Collections.EMPTY_SET;
  }

  /**
@ -103,7 +136,7 @@ public class SmartChineseAnalyzer extends Analyzer {
   * @param stopWords {@link Set} of stopwords to use.
   */
  public SmartChineseAnalyzer(Set stopWords) {
-    this.stopWords = stopWords;
+    this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords;
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
@ -113,8 +146,8 @@ public class SmartChineseAnalyzer extends Analyzer {
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    result = new PorterStemFilter(result);
-    if (stopWords != null) {
-      result = new StopFilter(result, stopWords, false);
+    if (!stopWords.isEmpty()) {
+      result = new StopFilter(false,result, stopWords, false);
    }
    return result;
  }
@ -133,8 +166,8 @@ public class SmartChineseAnalyzer extends Analyzer {
      streams.tokenStream = new SentenceTokenizer(reader);
      streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
      streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
-      if (stopWords != null) {
-        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false);
+      if (!stopWords.isEmpty()) {
+        streams.filteredTokenStream = new StopFilter(false, streams.filteredTokenStream, stopWords, false);
      }
    } else {
      streams.tokenStream.reset(reader);
--- a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
+++ b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
@ -25,6 +25,7 @@ import java.util.Date;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;

 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
  
@ -33,6 +34,9 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
    String sentence = "我购买了道具和服装。";
    String result[] = { "我", "购买", "了", "道具", "和", "服装" };
    assertAnalyzesTo(ca, sentence, result);
+    // set stop-words from the outer world - must yield same behavior
+    ca = new SmartChineseAnalyzer(SmartChineseAnalyzer.getDefaultStopSet());
+    assertAnalyzesTo(ca, sentence, result);
  }
  
  /*
@ -63,11 +67,16 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
   * if you don't supply (true) to the constructor, or use a different stopwords list,
   * then punctuation is indexed.
   */
-  public void testChineseStopWordsOff() throws Exception {  
-    Analyzer ca = new SmartChineseAnalyzer(false); /* doesnt load stopwords */
+  public void testChineseStopWordsOff() throws Exception {
+    Analyzer[] analyzers = new Analyzer[] {
+        new SmartChineseAnalyzer(false),/* doesn't load stopwords */
+        new SmartChineseAnalyzer(null) /* sets stopwords to empty set */};
    String sentence = "我购买了道具和服装。";
    String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
-    assertAnalyzesTo(ca, sentence, result);
+    for (Analyzer analyzer : analyzers) {
+      assertAnalyzesTo(analyzer, sentence, result);
+      assertAnalyzesToReuse(analyzer, sentence, result);
+    }
  }
  
  public void testChineseAnalyzer() throws Exception {