Mirror of https://github.com/apache/lucene.git
LUCENE-1962: Clean up the Persian and Arabic analyzers. Prevent the default stopword list from being loaded more than once.

- replace chains of if blocks with a single switch
- mark private members final where needed
- change protected members to private final in final classes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823180 13f79535-47bb-0310-9956-ffa450edef68
parent 90fc7e18c7
commit 286cb1f9d2
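The "loaded once" guarantee comes from the initialization-on-demand holder idiom: the JVM defers initializing the nested DefaultSetHolder class until its static field is first touched, and class initialization runs exactly once, under the JVM's own lock, so no explicit synchronization is needed in the analyzer. A minimal, self-contained sketch of the idiom (class name and stopword values here are illustrative, not from the patch):

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

public final class HolderIdiomDemo {

  /** Not initialized until DEFAULT_STOP_SET is first accessed. */
  private static class DefaultSetHolder {
    static final Set<String> DEFAULT_STOP_SET;
    static {
      // Runs exactly once, under the class-initialization lock, so the
      // (potentially expensive) load cannot happen twice.
      DEFAULT_STOP_SET = Collections.unmodifiableSet(
          new HashSet<String>(Arrays.asList("a", "an", "the")));
    }
  }

  public static Set<String> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  public static void main(String[] args) {
    System.out.println(getDefaultStopSet()); // [a, an, the] (order may vary)
  }
}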
ArabicAnalyzer.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Set;
@@ -68,21 +69,51 @@ public final class ArabicAnalyzer extends Analyzer {
    * The comment character in the stopwords file. All lines prefixed with this will be ignored
    */
   public static final String STOPWORDS_COMMENT = "#";
 
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = ArabicAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
   public ArabicAnalyzer() {
-    try {
-      InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
-      reader.close();
-      stream.close();
-    } catch (IOException e) {
-      // TODO: throw IOException
-      throw new RuntimeException(e);
-    }
+    stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
  }
 
   /**
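With the holder in place, new ArabicAnalyzer() no longer re-reads the stopword file from the JAR on every construction; all instances share one unmodifiable set. A hedged usage sketch against the patched contrib API (the demo class itself is hypothetical):

import java.util.Set;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;

public class SharedStopSetDemo {
  public static void main(String[] args) {
    // Both constructions reuse the already-loaded set; no file I/O here.
    ArabicAnalyzer a = new ArabicAnalyzer();
    ArabicAnalyzer b = new ArabicAnalyzer();
    Set<String> stops = ArabicAnalyzer.getDefaultStopSet();
    System.out.println(stops.size());
    try {
      stops.add("extra"); // the exposed set is unmodifiable
    } catch (UnsupportedOperationException expected) {
      System.out.println("default stop set cannot be mutated by callers");
    }
  }
}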
ArabicNormalizationFilter.java

@@ -44,8 +44,7 @@ public final class ArabicNormalizationFilter extends TokenFilter {
       int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
       termAtt.setTermLength(newlen);
       return true;
-    } else {
-      return false;
     }
+    return false;
   }
 }
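This filter change is purely structural: the redundant else around the failure path becomes an unconditional return false after the if, the guard-clause shape. The same rewrite on a plain, Lucene-free class (illustrative only):

import java.util.Iterator;

final class GuardClauseDemo {
  private final Iterator<String> in;
  private String term;

  GuardClauseDemo(Iterator<String> in) { this.in = in; }

  // Early return replaces the old if/else: the success path exits inside
  // the if, so the failure path needs no else branch.
  boolean incrementToken() {
    if (in.hasNext()) {
      term = in.next();
      return true;
    }
    return false;
  }

  String term() { return term; }
}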
ArabicNormalizer.java

@@ -63,21 +63,34 @@ public class ArabicNormalizer {
    * @return length of input buffer after normalization
    */
   public int normalize(char s[], int len) {
 
     for (int i = 0; i < len; i++) {
-      if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
+      switch (s[i]) {
+      case ALEF_MADDA:
+      case ALEF_HAMZA_ABOVE:
+      case ALEF_HAMZA_BELOW:
         s[i] = ALEF;
-
-      if (s[i] == DOTLESS_YEH)
+        break;
+      case DOTLESS_YEH:
         s[i] = YEH;
-
-      if (s[i] == TEH_MARBUTA)
+        break;
+      case TEH_MARBUTA:
         s[i] = HEH;
-
-      if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
-          s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
+        break;
+      case TATWEEL:
+      case KASRATAN:
+      case DAMMATAN:
+      case FATHATAN:
+      case FATHA:
+      case DAMMA:
+      case KASRA:
+      case SHADDA:
+      case SUKUN:
         len = delete(s, i, len);
         i--;
+        break;
+      default:
+        break;
       }
     }
 
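Stacked case labels with fall-through express the old multi-character || tests directly, and let the compiler emit a jump table instead of a chain of comparisons. A runnable sketch of the same shape (the constant values below match the standard Unicode code points; the real definitions live in ArabicNormalizer):

public class SwitchNormalizeDemo {
  static final char ALEF = '\u0627', ALEF_MADDA = '\u0622',
      ALEF_HAMZA_ABOVE = '\u0623', ALEF_HAMZA_BELOW = '\u0625';

  static int normalize(char[] s, int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        case ALEF_MADDA:        // stacked labels replace the old
        case ALEF_HAMZA_ABOVE:  // s[i] == A || s[i] == B || ... test
        case ALEF_HAMZA_BELOW:
          s[i] = ALEF;
          break;
        default:
          break;
      }
    }
    return len;
  }

  public static void main(String[] args) {
    char[] buf = "\u0622\u0628".toCharArray();
    normalize(buf, buf.length);
    System.out.println(buf[0] == ALEF); // true
  }
}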
ArabicStemFilter.java

@@ -30,8 +30,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 public final class ArabicStemFilter extends TokenFilter {
 
-  protected ArabicStemmer stemmer = null;
-  private TermAttribute termAtt;
+  private final ArabicStemmer stemmer;
+  private final TermAttribute termAtt;
 
   public ArabicStemFilter(TokenStream input) {
     super(input);
PersianAnalyzer.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Set;
@@ -58,30 +59,61 @@ public final class PersianAnalyzer extends Analyzer {
   /**
    * Contains the stopwords used with the StopFilter.
    */
-  private Set stoptable = new HashSet();
+  private final Set stoptable;
 
   /**
    * The comment character in the stopwords file. All lines prefixed with this
    * will be ignored
    */
   public static final String STOPWORDS_COMMENT = "#";
 
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = PersianAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
   public PersianAnalyzer() {
-    try {
-      InputStream stream = PersianAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
-      reader.close();
-      stream.close();
-    } catch (IOException e) {
-      // TODO: throw IOException
-      throw new RuntimeException(e);
-    }
+    stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
   /**
@@ -125,7 +157,7 @@ public final class PersianAnalyzer extends Analyzer {
    * the order here is important: the stopword list is normalized with the
    * above!
    */
-    result = new StopFilter(result, stoptable);
+    result = new StopFilter(false, result, stoptable);
 
     return result;
   }
@@ -158,7 +190,7 @@ public final class PersianAnalyzer extends Analyzer {
    * the order here is important: the stopword list is normalized with the
    * above!
    */
-      streams.result = new StopFilter(streams.result, stoptable);
+      streams.result = new StopFilter(false, streams.result, stoptable);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);
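In both tokenStream and reusableTokenStream, the new leading false targets the StopFilter(boolean enablePositionIncrements, TokenStream in, Set stopWords) constructor introduced in Lucene 2.9; false keeps the pre-2.9 behavior of not recording position holes where stopwords were removed. A sketch of the patched Persian chain (filter order assumed from the surrounding context, since this diff does not show the whole method):

import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;

final class PersianChainSketch {
  static TokenStream chain(Reader reader, Set stoptable) {
    TokenStream result = new ArabicLetterTokenizer(reader);
    result = new LowerCaseFilter(result);
    result = new ArabicNormalizationFilter(result);
    result = new PersianNormalizationFilter(result);
    // New in 2.9: the leading boolean is enablePositionIncrements;
    // 'false' preserves the old no-holes behavior this code relied on.
    result = new StopFilter(false, result, stoptable);
    return result;
  }
}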
PersianNormalizationFilter.java

@@ -32,7 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 public final class PersianNormalizationFilter extends TokenFilter {
 
   private final PersianNormalizer normalizer;
-  private TermAttribute termAtt;
+  private final TermAttribute termAtt;
 
   public PersianNormalizationFilter(TokenStream input) {
     super(input);
@@ -42,12 +42,11 @@ public final class PersianNormalizationFilter extends TokenFilter {
 
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
+      final int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
           .termLength());
       termAtt.setTermLength(newlen);
       return true;
-    } else {
-      return false;
-    }
+    }
+    return false;
   }
 }
PersianNormalizer.java

@@ -59,18 +59,24 @@ public class PersianNormalizer {
   public int normalize(char s[], int len) {
 
     for (int i = 0; i < len; i++) {
-      if (s[i] == FARSI_YEH || s[i] == YEH_BARREE)
+      switch (s[i]) {
+      case FARSI_YEH:
+      case YEH_BARREE:
         s[i] = YEH;
-
-      if (s[i] == KEHEH)
+        break;
+      case KEHEH:
         s[i] = KAF;
-
-      if (s[i] == HEH_YEH || s[i] == HEH_GOAL)
+        break;
+      case HEH_YEH:
+      case HEH_GOAL:
         s[i] = HEH;
-
-      if (s[i] == HAMZA_ABOVE) { // necessary for HEH + HAMZA
+        break;
+      case HAMZA_ABOVE: // necessary for HEH + HAMZA
         len = delete(s, i, len);
         i--;
+        break;
+      default:
+        break;
       }
     }
 
@@ -88,7 +94,7 @@ public class PersianNormalizer {
   protected int delete(char s[], int pos, int len) {
     if (pos < len)
       System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
 
     return len - 1;
   }
 
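For reference, delete(s, pos, len) removes the character at pos by shifting the tail left with System.arraycopy and returning the shrunken logical length; this is why the delete cases in the switch above are followed by i--, so the shifted-in character gets examined on the next pass. A quick worked example (the demo class is hypothetical; the method body is copied from the diff):

public class DeleteDemo {
  static int delete(char s[], int pos, int len) {
    if (pos < len)
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
    return len - 1;
  }

  public static void main(String[] args) {
    char[] buf = "abXcd".toCharArray();
    int len = delete(buf, 2, buf.length);        // drop 'X' at index 2
    System.out.println(new String(buf, 0, len)); // prints "abcd"
  }
}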