diff --git a/src/java/org/apache/lucene/analysis/StopAnalyzer.java b/src/java/org/apache/lucene/analysis/StopAnalyzer.java index e9573da981f..3c25c944d8a 100644 --- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java @@ -56,11 +56,12 @@ package org.apache.lucene.analysis; import java.io.Reader; import java.util.Hashtable; +import java.util.Set; /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */ public final class StopAnalyzer extends Analyzer { - private Hashtable stopTable; + private Set stopWords; /** An array containing some common English words that are not usually useful for searching. */ @@ -74,17 +75,17 @@ public final class StopAnalyzer extends Analyzer { /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */ public StopAnalyzer() { - stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS); + stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS); } /** Builds an analyzer which removes words in the provided array. */ public StopAnalyzer(String[] stopWords) { - stopTable = StopFilter.makeStopTable(stopWords); + this.stopWords = StopFilter.makeStopSet(stopWords); } /** Filters LowerCaseTokenizer with StopFilter. */ public TokenStream tokenStream(String fieldName, Reader reader) { - return new StopFilter(new LowerCaseTokenizer(reader), stopTable); + return new StopFilter(new LowerCaseTokenizer(reader), stopWords); } } diff --git a/src/java/org/apache/lucene/analysis/StopFilter.java b/src/java/org/apache/lucene/analysis/StopFilter.java index 729b37a2b02..e33345d849f 100644 --- a/src/java/org/apache/lucene/analysis/StopFilter.java +++ b/src/java/org/apache/lucene/analysis/StopFilter.java @@ -57,6 +57,7 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.util.HashSet; import java.util.Hashtable; +import java.util.Set; /** * Removes stop words from a token stream. @@ -64,7 +65,7 @@ import java.util.Hashtable; public final class StopFilter extends TokenFilter { - private HashSet stopWords; + private Set stopWords; /** * Constructs a filter which removes words from the input @@ -79,7 +80,7 @@ public final class StopFilter extends TokenFilter { * Constructs a filter which removes words from the input * TokenStream that are named in the Hashtable. * - * @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead + * @deprecated Use {@link #StopFilter(TokenStream, Set)} StopFilter(TokenStream,Map)} instead */ public StopFilter(TokenStream in, Hashtable stopTable) { super(in); @@ -89,8 +90,12 @@ public final class StopFilter extends TokenFilter { /** * Constructs a filter which removes words from the input * TokenStream that are named in the Set. + * It is crucial that an efficient Set implementation is used + * for maximum performance. + * + * @see #makeStopSet(java.lang.String[]) */ - public StopFilter(TokenStream in, HashSet stopWords) { + public StopFilter(TokenStream in, Set stopWords) { super(in); this.stopWords = stopWords; } @@ -116,7 +121,7 @@ public final class StopFilter extends TokenFilter { * This permits this stopWords construction to be cached once when * an Analyzer is constructed. */ - public static final HashSet makeStopSet(String[] stopWords) { + public static final Set makeStopSet(String[] stopWords) { HashSet stopTable = new HashSet(stopWords.length); for (int i = 0; i < stopWords.length; i++) stopTable.add(stopWords[i]); diff --git a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 6dd4d0cd56f..9fa13fe068f 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -62,6 +62,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.File; import java.io.Reader; import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; /** * Analyzer for German language. Supports an external list of stopwords (words that @@ -96,19 +98,19 @@ public class GermanAnalyzer extends Analyzer /** * Contains the stopwords used with the StopFilter. */ - private Hashtable stoptable = new Hashtable(); + private Set stopSet = new HashSet(); /** * Contains words that should be indexed but not stemmed. */ - private Hashtable excltable = new Hashtable(); + private Set exclusionSet = new HashSet(); /** * Builds an analyzer. */ public GermanAnalyzer() { - stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS ); + stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS ); } /** @@ -116,7 +118,7 @@ public class GermanAnalyzer extends Analyzer */ public GermanAnalyzer( String[] stopwords ) { - stoptable = StopFilter.makeStopTable( stopwords ); + stopSet = StopFilter.makeStopSet( stopwords ); } /** @@ -124,7 +126,7 @@ public class GermanAnalyzer extends Analyzer */ public GermanAnalyzer( Hashtable stopwords ) { - stoptable = stopwords; + stopSet = new HashSet(stopwords.keySet()); } /** @@ -132,7 +134,7 @@ public class GermanAnalyzer extends Analyzer */ public GermanAnalyzer( File stopwords ) { - stoptable = WordlistLoader.getWordtable( stopwords ); + stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet()); } /** @@ -140,7 +142,7 @@ public class GermanAnalyzer extends Analyzer */ public void setStemExclusionTable( String[] exclusionlist ) { - excltable = StopFilter.makeStopTable( exclusionlist ); + exclusionSet = StopFilter.makeStopSet( exclusionlist ); } /** @@ -148,7 +150,7 @@ public class GermanAnalyzer extends Analyzer */ public void setStemExclusionTable( Hashtable exclusionlist ) { - excltable = exclusionlist; + exclusionSet = new HashSet(exclusionlist.keySet()); } /** @@ -156,7 +158,7 @@ public class GermanAnalyzer extends Analyzer */ public void setStemExclusionTable( File exclusionlist ) { - excltable = WordlistLoader.getWordtable( exclusionlist ); + exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet()); } /** @@ -170,8 +172,8 @@ public class GermanAnalyzer extends Analyzer TokenStream result = new StandardTokenizer( reader ); result = new StandardFilter( result ); // shouldn't there be a lowercaser before stop word filtering? - result = new StopFilter( result, stoptable ); - result = new GermanStemFilter( result, excltable ); + result = new StopFilter( result, stopSet ); + result = new GermanStemFilter( result, exclusionSet ); return result; } } diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java index e1e678cd24d..377e382a228 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java +++ b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java @@ -59,6 +59,8 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import java.io.IOException; import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; /** * A filter that stems German words. It supports a table of words that should @@ -75,7 +77,7 @@ public final class GermanStemFilter extends TokenFilter */ private Token token = null; private GermanStemmer stemmer = null; - private Hashtable exclusions = null; + private Set exclusionSet = null; public GermanStemFilter( TokenStream in ) { @@ -85,13 +87,24 @@ public final class GermanStemFilter extends TokenFilter /** * Builds a GermanStemFilter that uses an exclusiontable. + * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead. */ public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) { this( in ); - exclusions = exclusiontable; + exclusionSet = new HashSet(exclusiontable.keySet()); + } - + + /** + * Builds a GermanStemFilter that uses an exclusiontable. + */ + public GermanStemFilter( TokenStream in, Set exclusionSet ) + { + this( in ); + this.exclusionSet = exclusionSet; + } + /** * @return Returns the next token in the stream, or null at EOS */ @@ -102,7 +115,7 @@ public final class GermanStemFilter extends TokenFilter return null; } // Check the exclusiontable - else if ( exclusions != null && exclusions.contains( token.termText() ) ) { + else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) { return token; } else { @@ -128,9 +141,18 @@ public final class GermanStemFilter extends TokenFilter /** * Set an alternative exclusion list for this filter. + * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead. */ public void setExclusionTable( Hashtable exclusiontable ) { - exclusions = exclusiontable; + exclusionSet = new HashSet(exclusiontable.keySet()); + } + + /** + * Set an alternative exclusion list for this filter. + */ + public void setExclusionSet( Set exclusionSet ) + { + this.exclusionSet = exclusionSet; } } diff --git a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java index 6339c1c0ec7..916f604742f 100644 --- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java +++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java @@ -67,6 +67,8 @@ import java.util.Hashtable; * * @author Gerhard Schwarz * @version $Id$ + * + * @todo refactor to convert to Sets instead of Hashtable */ public class WordlistLoader { /** @@ -92,6 +94,7 @@ public class WordlistLoader { /** * @param wordfile File containing the wordlist + * @todo Create a Set version of this method */ public static Hashtable getWordtable(File wordfile) { if (wordfile == null) { diff --git a/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java index 9fa249a4bce..1852ec4ddde 100644 --- a/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java @@ -60,6 +60,8 @@ import org.apache.lucene.analysis.TokenStream; import java.io.Reader; import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; /** * Analyzer for Russian language. Supports an external list of stopwords (words that @@ -215,7 +217,7 @@ public final class RussianAnalyzer extends Analyzer /** * Contains the stopwords used with the StopFilter. */ - private Hashtable stoptable = new Hashtable(); + private Set stopSet = new HashSet(); /** * Charset for Russian letters. @@ -227,7 +229,7 @@ public final class RussianAnalyzer extends Analyzer public RussianAnalyzer() { charset = RussianCharsets.UnicodeRussian; - stoptable = StopFilter.makeStopTable( + stopSet = StopFilter.makeStopSet( makeStopWords(RussianCharsets.UnicodeRussian)); } @@ -237,7 +239,7 @@ public final class RussianAnalyzer extends Analyzer public RussianAnalyzer(char[] charset) { this.charset = charset; - stoptable = StopFilter.makeStopTable(makeStopWords(charset)); + stopSet = StopFilter.makeStopSet(makeStopWords(charset)); } /** @@ -246,7 +248,7 @@ public final class RussianAnalyzer extends Analyzer public RussianAnalyzer(char[] charset, String[] stopwords) { this.charset = charset; - stoptable = StopFilter.makeStopTable(stopwords); + stopSet = StopFilter.makeStopSet(stopwords); } // Takes russian stop words and translates them to a String array, using @@ -270,11 +272,12 @@ public final class RussianAnalyzer extends Analyzer /** * Builds an analyzer with the given stop words. + * @todo create a Set version of this ctor */ public RussianAnalyzer(char[] charset, Hashtable stopwords) { this.charset = charset; - stoptable = stopwords; + stopSet = new HashSet(stopwords.keySet()); } /** @@ -287,7 +290,7 @@ public final class RussianAnalyzer extends Analyzer { TokenStream result = new RussianLetterTokenizer(reader, charset); result = new RussianLowerCaseFilter(result, charset); - result = new StopFilter(result, stoptable); + result = new StopFilter(result, stopSet); result = new RussianStemFilter(result, charset); return result; } diff --git a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index 15f9b3a14ec..2250917cc6a 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -56,7 +56,7 @@ package org.apache.lucene.analysis.standard; import org.apache.lucene.analysis.*; import java.io.Reader; -import java.util.Hashtable; +import java.util.Set; /** * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link @@ -65,7 +65,7 @@ import java.util.Hashtable; * @version $Id$ */ public class StandardAnalyzer extends Analyzer { - private Hashtable stopTable; + private Set stopSet; /** An array containing some common English words that are usually not useful for searching. */ @@ -78,7 +78,7 @@ public class StandardAnalyzer extends Analyzer { /** Builds an analyzer with the given stop words. */ public StandardAnalyzer(String[] stopWords) { - stopTable = StopFilter.makeStopTable(stopWords); + stopSet = StopFilter.makeStopSet(stopWords); } /** Constructs a {@link StandardTokenizer} filtered by a {@link @@ -87,7 +87,7 @@ public class StandardAnalyzer extends Analyzer { TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); - result = new StopFilter(result, stopTable); + result = new StopFilter(result, stopSet); return result; } }