From 87d1685b0f1f9286a64a4d4dd8c1286ef121feb7 Mon Sep 17 00:00:00 2001
From: Erik Hatcher
Date: Fri, 12 Mar 2004 09:43:48 +0000
Subject: [PATCH] convert StopFilter to use Set, with supporting changes to
 avoid calling deprecated methods.  never compromise on your ideals!

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150229 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/analysis/StopAnalyzer.java  |  9 +++---
 .../apache/lucene/analysis/StopFilter.java    | 13 +++++---
 .../lucene/analysis/de/GermanAnalyzer.java    | 24 +++++++-------
 .../lucene/analysis/de/GermanStemFilter.java  | 32 ++++++++++++++++---
 .../lucene/analysis/de/WordlistLoader.java    |  3 ++
 .../lucene/analysis/ru/RussianAnalyzer.java   | 15 +++++----
 .../analysis/standard/StandardAnalyzer.java   |  8 ++---
 7 files changed, 70 insertions(+), 34 deletions(-)

diff --git a/src/java/org/apache/lucene/analysis/StopAnalyzer.java b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
index e9573da981f..3c25c944d8a 100644
--- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
@@ -56,11 +56,12 @@ package org.apache.lucene.analysis;
 
 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
 
 /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
 
 public final class StopAnalyzer extends Analyzer {
-  private Hashtable stopTable;
+  private Set stopWords;
 
   /** An array containing some common English words that are not usually useful
   for searching. */
@@ -74,17 +75,17 @@ public final class StopAnalyzer extends Analyzer {
 
   /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
   public StopAnalyzer() {
-    stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
+    stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
   }
 
   /** Builds an analyzer which removes words in the provided array. */
   public StopAnalyzer(String[] stopWords) {
-    stopTable = StopFilter.makeStopTable(stopWords);
+    this.stopWords = StopFilter.makeStopSet(stopWords);
   }
 
   /** Filters LowerCaseTokenizer with StopFilter. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
+    return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
   }
 }
 
diff --git a/src/java/org/apache/lucene/analysis/StopFilter.java b/src/java/org/apache/lucene/analysis/StopFilter.java
index 729b37a2b02..e33345d849f 100644
--- a/src/java/org/apache/lucene/analysis/StopFilter.java
+++ b/src/java/org/apache/lucene/analysis/StopFilter.java
@@ -57,6 +57,7 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.Set;
 
 /**
  * Removes stop words from a token stream.
@@ -64,7 +65,7 @@ import java.util.Hashtable;
 
 public final class StopFilter extends TokenFilter {
 
-  private HashSet stopWords;
+  private Set stopWords;
 
   /**
    * Constructs a filter which removes words from the input
@@ -79,7 +80,7 @@ public final class StopFilter extends TokenFilter {
    * Constructs a filter which removes words from the input
    * TokenStream that are named in the Hashtable.
    *
-   * @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead
+   * @deprecated Use {@link #StopFilter(TokenStream, Set)} instead
   */
   public StopFilter(TokenStream in, Hashtable stopTable) {
     super(in);
@@ -89,8 +90,12 @@ public final class StopFilter extends TokenFilter {
   /**
    * Constructs a filter which removes words from the input
    * TokenStream that are named in the Set.
+   * It is crucial that an efficient Set implementation is used
+   * for maximum performance.
+   *
+   * @see #makeStopSet(java.lang.String[])
   */
-  public StopFilter(TokenStream in, HashSet stopWords) {
+  public StopFilter(TokenStream in, Set stopWords) {
     super(in);
     this.stopWords = stopWords;
   }
@@ -116,7 +121,7 @@ public final class StopFilter extends TokenFilter {
    * This permits this stopWords construction to be cached once when
    * an Analyzer is constructed.
   */
-  public static final HashSet makeStopSet(String[] stopWords) {
+  public static final Set makeStopSet(String[] stopWords) {
     HashSet stopTable = new HashSet(stopWords.length);
     for (int i = 0; i < stopWords.length; i++)
       stopTable.add(stopWords[i]);
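For illustration, here is a rough usage sketch of the Set-based API introduced above. The class name, stop word list, and the caller-supplied Reader are invented for the example; makeStopSet, the StopFilter(TokenStream, Set) constructor, and LowerCaseTokenizer are the ones shown in the diff.

    import java.io.Reader;
    import java.util.Set;
    import org.apache.lucene.analysis.*;

    class StopFilterSketch {
      // Build the stop set once and reuse it, as the new javadoc recommends.
      static final Set STOP_WORDS =
          StopFilter.makeStopSet(new String[] { "the", "and", "of" });

      TokenStream tokenStream(Reader reader) {
        // Lower-case first so tokens match the entries in the stop set.
        return new StopFilter(new LowerCaseTokenizer(reader), STOP_WORDS);
      }
    }

This is essentially what the patched StopAnalyzer now does internally; callers with their own analyzers can follow the same pattern.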
diff --git a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 6dd4d0cd56f..9fa13fe068f 100644
--- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -62,6 +62,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import java.io.File;
 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;
 
 /**
  * Analyzer for German language. Supports an external list of stopwords (words that
@@ -96,19 +98,19 @@ public class GermanAnalyzer extends Analyzer
     /**
      * Contains the stopwords used with the StopFilter.
      */
-    private Hashtable stoptable = new Hashtable();
+    private Set stopSet = new HashSet();
 
     /**
      * Contains words that should be indexed but not stemmed.
      */
-    private Hashtable excltable = new Hashtable();
+    private Set exclusionSet = new HashSet();
 
     /**
      * Builds an analyzer.
      */
     public GermanAnalyzer()
     {
-        stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
+        stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS );
     }
 
     /**
@@ -116,7 +118,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public GermanAnalyzer( String[] stopwords )
     {
-        stoptable = StopFilter.makeStopTable( stopwords );
+        stopSet = StopFilter.makeStopSet( stopwords );
     }
 
     /**
@@ -124,7 +126,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public GermanAnalyzer( Hashtable stopwords )
     {
-        stoptable = stopwords;
+        stopSet = new HashSet(stopwords.keySet());
     }
 
     /**
@@ -132,7 +134,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public GermanAnalyzer( File stopwords )
     {
-        stoptable = WordlistLoader.getWordtable( stopwords );
+        stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
     }
 
     /**
@@ -140,7 +142,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public void setStemExclusionTable( String[] exclusionlist )
     {
-        excltable = StopFilter.makeStopTable( exclusionlist );
+        exclusionSet = StopFilter.makeStopSet( exclusionlist );
     }
 
     /**
@@ -148,7 +150,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public void setStemExclusionTable( Hashtable exclusionlist )
     {
-        excltable = exclusionlist;
+        exclusionSet = new HashSet(exclusionlist.keySet());
     }
 
     /**
@@ -156,7 +158,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public void setStemExclusionTable( File exclusionlist )
     {
-        excltable = WordlistLoader.getWordtable( exclusionlist );
+        exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
     }
 
     /**
@@ -170,8 +172,8 @@
         TokenStream result = new StandardTokenizer( reader );
         result = new StandardFilter( result );
         // shouldn't there be a lowercaser before stop word filtering?
-        result = new StopFilter( result, stoptable );
-        result = new GermanStemFilter( result, excltable );
+        result = new StopFilter( result, stopSet );
+        result = new GermanStemFilter( result, exclusionSet );
         return result;
     }
 }
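A small sketch of how callers might pick up the converted constructors; the stop word array is illustrative and only the GermanAnalyzer constructors shown in the diff are used. Existing Hashtable-based callers keep working, but their tables are now copied into a Set via keySet().

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.de.GermanAnalyzer;

    class GermanAnalyzerSketch {
      // Default constructor: GERMAN_STOP_WORDS now goes through makeStopSet.
      Analyzer defaults = new GermanAnalyzer();

      // Caller-supplied list: the String[] constructor builds a Set directly.
      Analyzer custom = new GermanAnalyzer(new String[] { "und", "oder", "eine" });
    }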
diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
index e1e678cd24d..377e382a228 100644
--- a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
@@ -59,6 +59,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import java.io.IOException;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;
 
 /**
  * A filter that stems German words. It supports a table of words that should
@@ -75,7 +77,7 @@ public final class GermanStemFilter extends TokenFilter
      */
     private Token token = null;
     private GermanStemmer stemmer = null;
-    private Hashtable exclusions = null;
+    private Set exclusionSet = null;
 
     public GermanStemFilter( TokenStream in )
     {
@@ -85,13 +87,24 @@ public final class GermanStemFilter extends TokenFilter
 
     /**
      * Builds a GermanStemFilter that uses an exclusiontable.
+     * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
      */
     public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
     {
       this( in );
-      exclusions = exclusiontable;
+      exclusionSet = new HashSet(exclusiontable.keySet());
+    }
-    }
+
+    /**
+     * Builds a GermanStemFilter that uses an exclusiontable.
+     */
+    public GermanStemFilter( TokenStream in, Set exclusionSet )
+    {
+      this( in );
+      this.exclusionSet = exclusionSet;
+    }
 
     /**
      * @return  Returns the next token in the stream, or null at EOS
      */
@@ -102,7 +115,7 @@ public final class GermanStemFilter extends TokenFilter
       return null;
     }
     // Check the exclusiontable
-    else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
+    else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
       return token;
     }
     else {
@@ -128,9 +141,18 @@ public final class GermanStemFilter extends TokenFilter
 
     /**
      * Set an alternative exclusion list for this filter.
+     * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
      */
     public void setExclusionTable( Hashtable exclusiontable )
     {
-      exclusions = exclusiontable;
+      exclusionSet = new HashSet(exclusiontable.keySet());
+    }
+
+    /**
+     * Set an alternative exclusion list for this filter.
+     */
+    public void setExclusionSet( Set exclusionSet )
+    {
+      this.exclusionSet = exclusionSet;
     }
 }
diff --git a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
index 6339c1c0ec7..916f604742f 100644
--- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
+++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
@@ -67,6 +67,8 @@ import java.util.Hashtable;
  *
  * @author Gerhard Schwarz
  * @version $Id$
+ *
+ * @todo refactor to convert to Sets instead of Hashtable
  */
 public class WordlistLoader {
   /**
@@ -92,6 +94,7 @@ public class WordlistLoader {
 
   /**
    * @param wordfile File containing the wordlist
+   * @todo Create a Set version of this method
    */
  public static Hashtable getWordtable(File wordfile) {
    if (wordfile == null) {
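A hedged sketch of the new Set-based exclusion list on the GermanStemFilter changes above. The token chain is a trimmed-down version of what GermanAnalyzer.tokenStream builds, and the excluded word is invented for illustration.

    import java.io.Reader;
    import java.util.Set;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.de.GermanStemFilter;
    import org.apache.lucene.analysis.standard.StandardFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;

    class GermanStemSketch {
      TokenStream tokenStream(Reader reader) {
        // Words in the exclusion set are indexed as-is instead of being stemmed.
        Set exclusions = StopFilter.makeStopSet(new String[] { "Hamburg" });
        TokenStream result = new StandardFilter(new StandardTokenizer(reader));
        return new GermanStemFilter(result, exclusions);
      }
    }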
diff --git a/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
index 9fa249a4bce..1852ec4ddde 100644
--- a/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@@ -60,6 +60,8 @@ import org.apache.lucene.analysis.TokenStream;
 
 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;
 
 /**
  * Analyzer for Russian language. Supports an external list of stopwords (words that
@@ -215,7 +217,7 @@ public final class RussianAnalyzer extends Analyzer
     /**
      * Contains the stopwords used with the StopFilter.
      */
-    private Hashtable stoptable = new Hashtable();
+    private Set stopSet = new HashSet();
 
     /**
      * Charset for Russian letters.
@@ -227,7 +229,7 @@
     public RussianAnalyzer()
     {
         charset = RussianCharsets.UnicodeRussian;
-        stoptable = StopFilter.makeStopTable(
+        stopSet = StopFilter.makeStopSet(
                             makeStopWords(RussianCharsets.UnicodeRussian));
     }
 
@@ -237,7 +239,7 @@
     public RussianAnalyzer(char[] charset)
     {
         this.charset = charset;
-        stoptable = StopFilter.makeStopTable(makeStopWords(charset));
+        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
     }
 
     /**
@@ -246,7 +248,7 @@
     public RussianAnalyzer(char[] charset, String[] stopwords)
     {
         this.charset = charset;
-        stoptable = StopFilter.makeStopTable(stopwords);
+        stopSet = StopFilter.makeStopSet(stopwords);
     }
 
     // Takes russian stop words and translates them to a String array, using
@@ -270,11 +272,12 @@
 
     /**
      * Builds an analyzer with the given stop words.
+     * @todo create a Set version of this ctor
      */
     public RussianAnalyzer(char[] charset, Hashtable stopwords)
     {
         this.charset = charset;
-        stoptable = stopwords;
+        stopSet = new HashSet(stopwords.keySet());
     }
 
     /**
@@ -287,7 +290,7 @@
     {
         TokenStream result = new RussianLetterTokenizer(reader, charset);
         result = new RussianLowerCaseFilter(result, charset);
-        result = new StopFilter(result, stoptable);
+        result = new StopFilter(result, stopSet);
         result = new RussianStemFilter(result, charset);
         return result;
     }
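A brief sketch of the charset-aware constructor with a caller-supplied stop list, under the assumption that the stop words are written in the same charset handed to the analyzer; the three words are illustrative, and RussianCharsets.UnicodeRussian is the constant referenced in the diff above.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.ru.RussianAnalyzer;
    import org.apache.lucene.analysis.ru.RussianCharsets;

    class RussianAnalyzerSketch {
      // The String[] constructor now feeds StopFilter.makeStopSet directly.
      Analyzer analyzer = new RussianAnalyzer(RussianCharsets.UnicodeRussian,
                                              new String[] { "и", "в", "не" });
    }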
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index 15f9b3a14ec..2250917cc6a 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -56,7 +56,7 @@ package org.apache.lucene.analysis.standard;
 import org.apache.lucene.analysis.*;
 
 import java.io.Reader;
-import java.util.Hashtable;
+import java.util.Set;
 
 /**
  * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
@@ -65,7 +65,7 @@ import java.util.Hashtable;
  * @version $Id$
  */
 public class StandardAnalyzer extends Analyzer {
-  private Hashtable stopTable;
+  private Set stopSet;
 
   /** An array containing some common English words that are usually not useful
   for searching. */
@@ -78,7 +78,7 @@ public class StandardAnalyzer extends Analyzer {
 
   /** Builds an analyzer with the given stop words. */
   public StandardAnalyzer(String[] stopWords) {
-    stopTable = StopFilter.makeStopTable(stopWords);
+    stopSet = StopFilter.makeStopSet(stopWords);
   }
 
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
@@ -87,7 +87,7 @@ public class StandardAnalyzer extends Analyzer {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopTable);
+    result = new StopFilter(result, stopSet);
     return result;
   }
 }
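To close, a hedged end-to-end sketch of the Set-based StandardAnalyzer. The field name, sample text, and stop words are made up; the Token iteration follows the TokenStream.next()/termText() API used elsewhere in this code base.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    class StandardAnalyzerSketch {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(new String[] { "the", "a", "an" });
        TokenStream tokens =
            analyzer.tokenStream("body", new StringReader("The quick brown fox"));
        // LowerCaseFilter runs before StopFilter, so "The" is lower-cased and dropped.
        for (Token t = tokens.next(); t != null; t = tokens.next()) {
          System.out.println(t.termText());   // quick, brown, fox
        }
      }
    }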