convert StopFilter to use Set, with supporting changes to avoid calling deprecated methods. never compromise on your ideals!

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150229 13f79535-47bb-0310-9956-ffa450edef68
2004-03-12 09:43:48 +00:00 · 2004-03-12 09:43:48 +00:00 · 87d1685b0f
parent 7ceae266c5
commit 87d1685b0f
7 changed files with 70 additions and 34 deletions
--- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
@ -56,11 +56,12 @@ package org.apache.lucene.analysis;

 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;

 /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */

 public final class StopAnalyzer extends Analyzer {
-  private Hashtable stopTable;
+  private Set stopWords;

  /** An array containing some common English words that are not usually useful
    for searching. */
@ -74,17 +75,17 @@ public final class StopAnalyzer extends Analyzer {

  /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
  public StopAnalyzer() {
-    stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
+    stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
  }

  /** Builds an analyzer which removes words in the provided array. */
  public StopAnalyzer(String[] stopWords) {
-    stopTable = StopFilter.makeStopTable(stopWords);
+    this.stopWords = StopFilter.makeStopSet(stopWords);
  }

  /** Filters LowerCaseTokenizer with StopFilter. */
  public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
+    return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
  }
 }

--- a/src/java/org/apache/lucene/analysis/StopFilter.java
+++ b/src/java/org/apache/lucene/analysis/StopFilter.java
@ -57,6 +57,7 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.Set;

 /**
 * Removes stop words from a token stream.
@ -64,7 +65,7 @@ import java.util.Hashtable;

 public final class StopFilter extends TokenFilter {

-  private HashSet stopWords;
+  private Set stopWords;

  /**
   * Constructs a filter which removes words from the input
@ -79,7 +80,7 @@ public final class StopFilter extends TokenFilter {
   * Constructs a filter which removes words from the input
   * TokenStream that are named in the Hashtable.
   *
-   * @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead
+   * @deprecated Use {@link #StopFilter(TokenStream, Set)} StopFilter(TokenStream,Map)} instead
   */
  public StopFilter(TokenStream in, Hashtable stopTable) {
    super(in);
@ -89,8 +90,12 @@ public final class StopFilter extends TokenFilter {
  /**
   * Constructs a filter which removes words from the input
   * TokenStream that are named in the Set.
+   * It is crucial that an efficient Set implementation is used
+   * for maximum performance.
+   *
+   * @see #makeStopSet(java.lang.String[])
   */
-  public StopFilter(TokenStream in, HashSet stopWords) {
+  public StopFilter(TokenStream in, Set stopWords) {
    super(in);
    this.stopWords = stopWords;
  }
@ -116,7 +121,7 @@ public final class StopFilter extends TokenFilter {
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   */
-  public static final HashSet makeStopSet(String[] stopWords) {
+  public static final Set makeStopSet(String[] stopWords) {
    HashSet stopTable = new HashSet(stopWords.length);
    for (int i = 0; i < stopWords.length; i++)
      stopTable.add(stopWords[i]);
--- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@ -62,6 +62,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import java.io.File;
 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;

 /**
 * Analyzer for German language. Supports an external list of stopwords (words that
@ -96,19 +98,19 @@ public class GermanAnalyzer extends Analyzer
    /**
     * Contains the stopwords used with the StopFilter.
     */
-    private Hashtable stoptable = new Hashtable();
+    private Set stopSet = new HashSet();

    /**
     * Contains words that should be indexed but not stemmed.
     */
-    private Hashtable excltable = new Hashtable();
+    private Set exclusionSet = new HashSet();

    /**
     * Builds an analyzer.
     */
    public GermanAnalyzer()
    {
-	stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
+	stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS );
    }

    /**
@ -116,7 +118,7 @@ public class GermanAnalyzer extends Analyzer
     */
    public GermanAnalyzer( String[] stopwords )
    {
-	stoptable = StopFilter.makeStopTable( stopwords );
+	stopSet = StopFilter.makeStopSet( stopwords );
    }

    /**
@ -124,7 +126,7 @@ public class GermanAnalyzer extends Analyzer
     */
    public GermanAnalyzer( Hashtable stopwords )
    {
-	stoptable = stopwords;
+	stopSet = new HashSet(stopwords.keySet());
    }

    /**
@ -132,7 +134,7 @@ public class GermanAnalyzer extends Analyzer
     */
    public GermanAnalyzer( File stopwords )
    {
-	stoptable = WordlistLoader.getWordtable( stopwords );
+	stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
    }

    /**
@ -140,7 +142,7 @@ public class GermanAnalyzer extends Analyzer
     */
    public void setStemExclusionTable( String[] exclusionlist )
    {
-	excltable = StopFilter.makeStopTable( exclusionlist );
+	exclusionSet = StopFilter.makeStopSet( exclusionlist );
    }

    /**
@ -148,7 +150,7 @@ public class GermanAnalyzer extends Analyzer
     */
    public void setStemExclusionTable( Hashtable exclusionlist )
    {
-	excltable = exclusionlist;
+	exclusionSet = new HashSet(exclusionlist.keySet());
    }

    /**
@ -156,7 +158,7 @@ public class GermanAnalyzer extends Analyzer
     */
    public void setStemExclusionTable( File exclusionlist )
    {
-	excltable = WordlistLoader.getWordtable( exclusionlist );
+	exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
    }

    /**
@ -170,8 +172,8 @@ public class GermanAnalyzer extends Analyzer
 	TokenStream result = new StandardTokenizer( reader );
 	result = new StandardFilter( result );
  // shouldn't there be a lowercaser before stop word filtering?
-  result = new StopFilter( result, stoptable );
-	result = new GermanStemFilter( result, excltable );
+  result = new StopFilter( result, stopSet );
+	result = new GermanStemFilter( result, exclusionSet );
 	return result;
    }
 }
--- a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
@ -59,6 +59,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import java.io.IOException;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;

 /**
 * A filter that stems German words. It supports a table of words that should
@ -75,7 +77,7 @@ public final class GermanStemFilter extends TokenFilter
     */
    private Token token = null;
    private GermanStemmer stemmer = null;
-    private Hashtable exclusions = null;
+    private Set exclusionSet = null;
    
    public GermanStemFilter( TokenStream in )
    {
@ -85,13 +87,24 @@ public final class GermanStemFilter extends TokenFilter
    
    /**
     * Builds a GermanStemFilter that uses an exclusiontable.
+     * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
     */
    public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
    {
 	this( in );
-	exclusions = exclusiontable;
+	exclusionSet = new HashSet(exclusiontable.keySet());
+
    }
-    
+
+    /**
+     * Builds a GermanStemFilter that uses an exclusiontable.
+     */
+    public GermanStemFilter( TokenStream in, Set exclusionSet )
+    {
+	this( in );
+	this.exclusionSet = exclusionSet;
+    }
+
    /**
     * @return  Returns the next token in the stream, or null at EOS
     */
@ -102,7 +115,7 @@ public final class GermanStemFilter extends TokenFilter
 	    return null;
 	}
 	// Check the exclusiontable
-	else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
+	else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
 	    return token;
 	}
 	else {
@ -128,9 +141,18 @@ public final class GermanStemFilter extends TokenFilter

    /**
     * Set an alternative exclusion list for this filter.
+     * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
     */
    public void setExclusionTable( Hashtable exclusiontable )
    {
-	exclusions = exclusiontable;
+	exclusionSet = new HashSet(exclusiontable.keySet());
+    }
+
+    /**
+     * Set an alternative exclusion list for this filter.
+     */
+    public void setExclusionSet( Set exclusionSet )
+    {
+	this.exclusionSet = exclusionSet;
    }
 }
--- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
+++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
@ -67,6 +67,8 @@ import java.util.Hashtable;
 *
 * @author    Gerhard Schwarz
 * @version   $Id$
+ *
+ * @todo refactor to convert to Sets instead of Hashtable
 */
 public class WordlistLoader {
  /**
@ -92,6 +94,7 @@ public class WordlistLoader {

  /**
   * @param wordfile  File containing the wordlist
+   * @todo Create a Set version of this method
   */
  public static Hashtable getWordtable(File wordfile) {
    if (wordfile == null) {
--- a/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@ -60,6 +60,8 @@ import org.apache.lucene.analysis.TokenStream;

 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;

 /**
 * Analyzer for Russian language. Supports an external list of stopwords (words that
@ -215,7 +217,7 @@ public final class RussianAnalyzer extends Analyzer
    /**
     * Contains the stopwords used with the StopFilter.
     */
-    private Hashtable stoptable = new Hashtable();
+    private Set stopSet = new HashSet();

    /**
     * Charset for Russian letters.
@ -227,7 +229,7 @@ public final class RussianAnalyzer extends Analyzer

    public RussianAnalyzer() {
        charset = RussianCharsets.UnicodeRussian;
-        stoptable = StopFilter.makeStopTable(
+        stopSet = StopFilter.makeStopSet(
                    makeStopWords(RussianCharsets.UnicodeRussian));
    }

@ -237,7 +239,7 @@ public final class RussianAnalyzer extends Analyzer
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
-        stoptable = StopFilter.makeStopTable(makeStopWords(charset));
+        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    }

    /**
@ -246,7 +248,7 @@ public final class RussianAnalyzer extends Analyzer
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
-        stoptable = StopFilter.makeStopTable(stopwords);
+        stopSet = StopFilter.makeStopSet(stopwords);
    }

    // Takes russian stop words and translates them to a String array, using
@ -270,11 +272,12 @@ public final class RussianAnalyzer extends Analyzer

    /**
     * Builds an analyzer with the given stop words.
+     * @todo create a Set version of this ctor
     */
    public RussianAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
-        stoptable = stopwords;
+        stopSet = new HashSet(stopwords.keySet());
    }

    /**
@ -287,7 +290,7 @@ public final class RussianAnalyzer extends Analyzer
    {
        TokenStream result = new RussianLetterTokenizer(reader, charset);
        result = new RussianLowerCaseFilter(result, charset);
-        result = new StopFilter(result, stoptable);
+        result = new StopFilter(result, stopSet);
        result = new RussianStemFilter(result, charset);
        return result;
    }
--- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@ -56,7 +56,7 @@ package org.apache.lucene.analysis.standard;

 import org.apache.lucene.analysis.*;
 import java.io.Reader;
-import java.util.Hashtable;
+import java.util.Set;

 /**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
@ -65,7 +65,7 @@ import java.util.Hashtable;
 * @version $Id$
 */
 public class StandardAnalyzer extends Analyzer {
-  private Hashtable stopTable;
+  private Set stopSet;

  /** An array containing some common English words that are usually not
  useful for searching. */
@ -78,7 +78,7 @@ public class StandardAnalyzer extends Analyzer {

  /** Builds an analyzer with the given stop words. */
  public StandardAnalyzer(String[] stopWords) {
-    stopTable = StopFilter.makeStopTable(stopWords);
+    stopSet = StopFilter.makeStopSet(stopWords);
  }

  /** Constructs a {@link StandardTokenizer} filtered by a {@link
@ -87,7 +87,7 @@ public class StandardAnalyzer extends Analyzer {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopTable);
+    result = new StopFilter(result, stopSet);
    return result;
  }
 }