convert StopFilter to use Set, with supporting changes to avoid calling deprecated methods. never compromise on your ideals!

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150229 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2004-03-12 09:43:48 +00:00
parent 7ceae266c5
commit 87d1685b0f
7 changed files with 70 additions and 34 deletions

View File

@ -56,11 +56,12 @@ package org.apache.lucene.analysis;
import java.io.Reader; import java.io.Reader;
import java.util.Hashtable; import java.util.Hashtable;
import java.util.Set;
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */ /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
public final class StopAnalyzer extends Analyzer { public final class StopAnalyzer extends Analyzer {
private Hashtable stopTable; private Set stopWords;
/** An array containing some common English words that are not usually useful /** An array containing some common English words that are not usually useful
for searching. */ for searching. */
@ -74,17 +75,17 @@ public final class StopAnalyzer extends Analyzer {
/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */ /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
public StopAnalyzer() { public StopAnalyzer() {
stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS); stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
} }
/** Builds an analyzer which removes words in the provided array. */ /** Builds an analyzer which removes words in the provided array. */
public StopAnalyzer(String[] stopWords) { public StopAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopTable(stopWords); this.stopWords = StopFilter.makeStopSet(stopWords);
} }
/** Filters LowerCaseTokenizer with StopFilter. */ /** Filters LowerCaseTokenizer with StopFilter. */
public TokenStream tokenStream(String fieldName, Reader reader) { public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(new LowerCaseTokenizer(reader), stopTable); return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
} }
} }

View File

@ -57,6 +57,7 @@ package org.apache.lucene.analysis;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.HashSet;
import java.util.Hashtable; import java.util.Hashtable;
import java.util.Set;
/** /**
* Removes stop words from a token stream. * Removes stop words from a token stream.
@ -64,7 +65,7 @@ import java.util.Hashtable;
public final class StopFilter extends TokenFilter { public final class StopFilter extends TokenFilter {
private HashSet stopWords; private Set stopWords;
/** /**
* Constructs a filter which removes words from the input * Constructs a filter which removes words from the input
@ -79,7 +80,7 @@ public final class StopFilter extends TokenFilter {
* Constructs a filter which removes words from the input * Constructs a filter which removes words from the input
* TokenStream that are named in the Hashtable. * TokenStream that are named in the Hashtable.
* *
* @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead * @deprecated Use {@link #StopFilter(TokenStream, Set)} StopFilter(TokenStream,Map)} instead
*/ */
public StopFilter(TokenStream in, Hashtable stopTable) { public StopFilter(TokenStream in, Hashtable stopTable) {
super(in); super(in);
@ -89,8 +90,12 @@ public final class StopFilter extends TokenFilter {
/** /**
* Constructs a filter which removes words from the input * Constructs a filter which removes words from the input
* TokenStream that are named in the Set. * TokenStream that are named in the Set.
* It is crucial that an efficient Set implementation is used
* for maximum performance.
*
* @see #makeStopSet(java.lang.String[])
*/ */
public StopFilter(TokenStream in, HashSet stopWords) { public StopFilter(TokenStream in, Set stopWords) {
super(in); super(in);
this.stopWords = stopWords; this.stopWords = stopWords;
} }
@ -116,7 +121,7 @@ public final class StopFilter extends TokenFilter {
* This permits this stopWords construction to be cached once when * This permits this stopWords construction to be cached once when
* an Analyzer is constructed. * an Analyzer is constructed.
*/ */
public static final HashSet makeStopSet(String[] stopWords) { public static final Set makeStopSet(String[] stopWords) {
HashSet stopTable = new HashSet(stopWords.length); HashSet stopTable = new HashSet(stopWords.length);
for (int i = 0; i < stopWords.length; i++) for (int i = 0; i < stopWords.length; i++)
stopTable.add(stopWords[i]); stopTable.add(stopWords[i]);

View File

@ -62,6 +62,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File; import java.io.File;
import java.io.Reader; import java.io.Reader;
import java.util.Hashtable; import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/** /**
* Analyzer for German language. Supports an external list of stopwords (words that * Analyzer for German language. Supports an external list of stopwords (words that
@ -96,19 +98,19 @@ public class GermanAnalyzer extends Analyzer
/** /**
* Contains the stopwords used with the StopFilter. * Contains the stopwords used with the StopFilter.
*/ */
private Hashtable stoptable = new Hashtable(); private Set stopSet = new HashSet();
/** /**
* Contains words that should be indexed but not stemmed. * Contains words that should be indexed but not stemmed.
*/ */
private Hashtable excltable = new Hashtable(); private Set exclusionSet = new HashSet();
/** /**
* Builds an analyzer. * Builds an analyzer.
*/ */
public GermanAnalyzer() public GermanAnalyzer()
{ {
stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS ); stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS );
} }
/** /**
@ -116,7 +118,7 @@ public class GermanAnalyzer extends Analyzer
*/ */
public GermanAnalyzer( String[] stopwords ) public GermanAnalyzer( String[] stopwords )
{ {
stoptable = StopFilter.makeStopTable( stopwords ); stopSet = StopFilter.makeStopSet( stopwords );
} }
/** /**
@ -124,7 +126,7 @@ public class GermanAnalyzer extends Analyzer
*/ */
public GermanAnalyzer( Hashtable stopwords ) public GermanAnalyzer( Hashtable stopwords )
{ {
stoptable = stopwords; stopSet = new HashSet(stopwords.keySet());
} }
/** /**
@ -132,7 +134,7 @@ public class GermanAnalyzer extends Analyzer
*/ */
public GermanAnalyzer( File stopwords ) public GermanAnalyzer( File stopwords )
{ {
stoptable = WordlistLoader.getWordtable( stopwords ); stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
} }
/** /**
@ -140,7 +142,7 @@ public class GermanAnalyzer extends Analyzer
*/ */
public void setStemExclusionTable( String[] exclusionlist ) public void setStemExclusionTable( String[] exclusionlist )
{ {
excltable = StopFilter.makeStopTable( exclusionlist ); exclusionSet = StopFilter.makeStopSet( exclusionlist );
} }
/** /**
@ -148,7 +150,7 @@ public class GermanAnalyzer extends Analyzer
*/ */
public void setStemExclusionTable( Hashtable exclusionlist ) public void setStemExclusionTable( Hashtable exclusionlist )
{ {
excltable = exclusionlist; exclusionSet = new HashSet(exclusionlist.keySet());
} }
/** /**
@ -156,7 +158,7 @@ public class GermanAnalyzer extends Analyzer
*/ */
public void setStemExclusionTable( File exclusionlist ) public void setStemExclusionTable( File exclusionlist )
{ {
excltable = WordlistLoader.getWordtable( exclusionlist ); exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
} }
/** /**
@ -170,8 +172,8 @@ public class GermanAnalyzer extends Analyzer
TokenStream result = new StandardTokenizer( reader ); TokenStream result = new StandardTokenizer( reader );
result = new StandardFilter( result ); result = new StandardFilter( result );
// shouldn't there be a lowercaser before stop word filtering? // shouldn't there be a lowercaser before stop word filtering?
result = new StopFilter( result, stoptable ); result = new StopFilter( result, stopSet );
result = new GermanStemFilter( result, excltable ); result = new GermanStemFilter( result, exclusionSet );
return result; return result;
} }
} }

View File

@ -59,6 +59,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import java.io.IOException; import java.io.IOException;
import java.util.Hashtable; import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/** /**
* A filter that stems German words. It supports a table of words that should * A filter that stems German words. It supports a table of words that should
@ -75,7 +77,7 @@ public final class GermanStemFilter extends TokenFilter
*/ */
private Token token = null; private Token token = null;
private GermanStemmer stemmer = null; private GermanStemmer stemmer = null;
private Hashtable exclusions = null; private Set exclusionSet = null;
public GermanStemFilter( TokenStream in ) public GermanStemFilter( TokenStream in )
{ {
@ -85,13 +87,24 @@ public final class GermanStemFilter extends TokenFilter
/** /**
* Builds a GermanStemFilter that uses an exclusiontable. * Builds a GermanStemFilter that uses an exclusiontable.
* @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
*/ */
public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
{ {
this( in ); this( in );
exclusions = exclusiontable; exclusionSet = new HashSet(exclusiontable.keySet());
} }
/**
* Builds a GermanStemFilter that uses an exclusiontable.
*/
public GermanStemFilter( TokenStream in, Set exclusionSet )
{
this( in );
this.exclusionSet = exclusionSet;
}
/** /**
* @return Returns the next token in the stream, or null at EOS * @return Returns the next token in the stream, or null at EOS
*/ */
@ -102,7 +115,7 @@ public final class GermanStemFilter extends TokenFilter
return null; return null;
} }
// Check the exclusiontable // Check the exclusiontable
else if ( exclusions != null && exclusions.contains( token.termText() ) ) { else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
return token; return token;
} }
else { else {
@ -128,9 +141,18 @@ public final class GermanStemFilter extends TokenFilter
/** /**
* Set an alternative exclusion list for this filter. * Set an alternative exclusion list for this filter.
* @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
*/ */
public void setExclusionTable( Hashtable exclusiontable ) public void setExclusionTable( Hashtable exclusiontable )
{ {
exclusions = exclusiontable; exclusionSet = new HashSet(exclusiontable.keySet());
}
/**
* Set an alternative exclusion list for this filter.
*/
public void setExclusionSet( Set exclusionSet )
{
this.exclusionSet = exclusionSet;
} }
} }

View File

@ -67,6 +67,8 @@ import java.util.Hashtable;
* *
* @author Gerhard Schwarz * @author Gerhard Schwarz
* @version $Id$ * @version $Id$
*
* @todo refactor to convert to Sets instead of Hashtable
*/ */
public class WordlistLoader { public class WordlistLoader {
/** /**
@ -92,6 +94,7 @@ public class WordlistLoader {
/** /**
* @param wordfile File containing the wordlist * @param wordfile File containing the wordlist
* @todo Create a Set version of this method
*/ */
public static Hashtable getWordtable(File wordfile) { public static Hashtable getWordtable(File wordfile) {
if (wordfile == null) { if (wordfile == null) {

View File

@ -60,6 +60,8 @@ import org.apache.lucene.analysis.TokenStream;
import java.io.Reader; import java.io.Reader;
import java.util.Hashtable; import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/** /**
* Analyzer for Russian language. Supports an external list of stopwords (words that * Analyzer for Russian language. Supports an external list of stopwords (words that
@ -215,7 +217,7 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Contains the stopwords used with the StopFilter. * Contains the stopwords used with the StopFilter.
*/ */
private Hashtable stoptable = new Hashtable(); private Set stopSet = new HashSet();
/** /**
* Charset for Russian letters. * Charset for Russian letters.
@ -227,7 +229,7 @@ public final class RussianAnalyzer extends Analyzer
public RussianAnalyzer() { public RussianAnalyzer() {
charset = RussianCharsets.UnicodeRussian; charset = RussianCharsets.UnicodeRussian;
stoptable = StopFilter.makeStopTable( stopSet = StopFilter.makeStopSet(
makeStopWords(RussianCharsets.UnicodeRussian)); makeStopWords(RussianCharsets.UnicodeRussian));
} }
@ -237,7 +239,7 @@ public final class RussianAnalyzer extends Analyzer
public RussianAnalyzer(char[] charset) public RussianAnalyzer(char[] charset)
{ {
this.charset = charset; this.charset = charset;
stoptable = StopFilter.makeStopTable(makeStopWords(charset)); stopSet = StopFilter.makeStopSet(makeStopWords(charset));
} }
/** /**
@ -246,7 +248,7 @@ public final class RussianAnalyzer extends Analyzer
public RussianAnalyzer(char[] charset, String[] stopwords) public RussianAnalyzer(char[] charset, String[] stopwords)
{ {
this.charset = charset; this.charset = charset;
stoptable = StopFilter.makeStopTable(stopwords); stopSet = StopFilter.makeStopSet(stopwords);
} }
// Takes russian stop words and translates them to a String array, using // Takes russian stop words and translates them to a String array, using
@ -270,11 +272,12 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @todo create a Set version of this ctor
*/ */
public RussianAnalyzer(char[] charset, Hashtable stopwords) public RussianAnalyzer(char[] charset, Hashtable stopwords)
{ {
this.charset = charset; this.charset = charset;
stoptable = stopwords; stopSet = new HashSet(stopwords.keySet());
} }
/** /**
@ -287,7 +290,7 @@ public final class RussianAnalyzer extends Analyzer
{ {
TokenStream result = new RussianLetterTokenizer(reader, charset); TokenStream result = new RussianLetterTokenizer(reader, charset);
result = new RussianLowerCaseFilter(result, charset); result = new RussianLowerCaseFilter(result, charset);
result = new StopFilter(result, stoptable); result = new StopFilter(result, stopSet);
result = new RussianStemFilter(result, charset); result = new RussianStemFilter(result, charset);
return result; return result;
} }

View File

@ -56,7 +56,7 @@ package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.*;
import java.io.Reader; import java.io.Reader;
import java.util.Hashtable; import java.util.Set;
/** /**
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
@ -65,7 +65,7 @@ import java.util.Hashtable;
* @version $Id$ * @version $Id$
*/ */
public class StandardAnalyzer extends Analyzer { public class StandardAnalyzer extends Analyzer {
private Hashtable stopTable; private Set stopSet;
/** An array containing some common English words that are usually not /** An array containing some common English words that are usually not
useful for searching. */ useful for searching. */
@ -78,7 +78,7 @@ public class StandardAnalyzer extends Analyzer {
/** Builds an analyzer with the given stop words. */ /** Builds an analyzer with the given stop words. */
public StandardAnalyzer(String[] stopWords) { public StandardAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopTable(stopWords); stopSet = StopFilter.makeStopSet(stopWords);
} }
/** Constructs a {@link StandardTokenizer} filtered by a {@link /** Constructs a {@link StandardTokenizer} filtered by a {@link
@ -87,7 +87,7 @@ public class StandardAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer(reader); TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result); result = new StandardFilter(result);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(result);
result = new StopFilter(result, stopTable); result = new StopFilter(result, stopSet);
return result; return result;
} }
} }