convert StopFilter to use Set, with supporting changes to avoid calling deprecated methods. never compromise on your ideals!

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150229 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2004-03-12 09:43:48 +00:00
parent 7ceae266c5
commit 87d1685b0f
7 changed files with 70 additions and 34 deletions

View File

@ -56,11 +56,12 @@ package org.apache.lucene.analysis;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
public final class StopAnalyzer extends Analyzer {
private Hashtable stopTable;
private Set stopWords;
/** An array containing some common English words that are not usually useful
for searching. */
@ -74,17 +75,17 @@ public final class StopAnalyzer extends Analyzer {
/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
public StopAnalyzer() {
stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
}
/** Builds an analyzer which removes words in the provided array. */
public StopAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopTable(stopWords);
this.stopWords = StopFilter.makeStopSet(stopWords);
}
/** Filters LowerCaseTokenizer with StopFilter. */
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
}
}

View File

@ -57,6 +57,7 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
/**
* Removes stop words from a token stream.
@ -64,7 +65,7 @@ import java.util.Hashtable;
public final class StopFilter extends TokenFilter {
private HashSet stopWords;
private Set stopWords;
/**
* Constructs a filter which removes words from the input
@ -79,7 +80,7 @@ public final class StopFilter extends TokenFilter {
* Constructs a filter which removes words from the input
* TokenStream that are named in the Hashtable.
*
* @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead
* @deprecated Use {@link #StopFilter(TokenStream, Set)} StopFilter(TokenStream,Map)} instead
*/
public StopFilter(TokenStream in, Hashtable stopTable) {
super(in);
@ -89,8 +90,12 @@ public final class StopFilter extends TokenFilter {
/**
* Constructs a filter which removes words from the input
* TokenStream that are named in the Set.
* It is crucial that an efficient Set implementation is used
* for maximum performance.
*
* @see #makeStopSet(java.lang.String[])
*/
public StopFilter(TokenStream in, HashSet stopWords) {
public StopFilter(TokenStream in, Set stopWords) {
super(in);
this.stopWords = stopWords;
}
@ -116,7 +121,7 @@ public final class StopFilter extends TokenFilter {
* This permits this stopWords construction to be cached once when
* an Analyzer is constructed.
*/
public static final HashSet makeStopSet(String[] stopWords) {
public static final Set makeStopSet(String[] stopWords) {
HashSet stopTable = new HashSet(stopWords.length);
for (int i = 0; i < stopWords.length; i++)
stopTable.add(stopWords[i]);

View File

@ -62,6 +62,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
* Analyzer for German language. Supports an external list of stopwords (words that
@ -96,19 +98,19 @@ public class GermanAnalyzer extends Analyzer
/**
* Contains the stopwords used with the StopFilter.
*/
private Hashtable stoptable = new Hashtable();
private Set stopSet = new HashSet();
/**
* Contains words that should be indexed but not stemmed.
*/
private Hashtable excltable = new Hashtable();
private Set exclusionSet = new HashSet();
/**
* Builds an analyzer.
*/
public GermanAnalyzer()
{
stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS );
}
/**
@ -116,7 +118,7 @@ public class GermanAnalyzer extends Analyzer
*/
public GermanAnalyzer( String[] stopwords )
{
stoptable = StopFilter.makeStopTable( stopwords );
stopSet = StopFilter.makeStopSet( stopwords );
}
/**
@ -124,7 +126,7 @@ public class GermanAnalyzer extends Analyzer
*/
public GermanAnalyzer( Hashtable stopwords )
{
stoptable = stopwords;
stopSet = new HashSet(stopwords.keySet());
}
/**
@ -132,7 +134,7 @@ public class GermanAnalyzer extends Analyzer
*/
public GermanAnalyzer( File stopwords )
{
stoptable = WordlistLoader.getWordtable( stopwords );
stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
}
/**
@ -140,7 +142,7 @@ public class GermanAnalyzer extends Analyzer
*/
public void setStemExclusionTable( String[] exclusionlist )
{
excltable = StopFilter.makeStopTable( exclusionlist );
exclusionSet = StopFilter.makeStopSet( exclusionlist );
}
/**
@ -148,7 +150,7 @@ public class GermanAnalyzer extends Analyzer
*/
public void setStemExclusionTable( Hashtable exclusionlist )
{
excltable = exclusionlist;
exclusionSet = new HashSet(exclusionlist.keySet());
}
/**
@ -156,7 +158,7 @@ public class GermanAnalyzer extends Analyzer
*/
public void setStemExclusionTable( File exclusionlist )
{
excltable = WordlistLoader.getWordtable( exclusionlist );
exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
}
/**
@ -170,8 +172,8 @@ public class GermanAnalyzer extends Analyzer
TokenStream result = new StandardTokenizer( reader );
result = new StandardFilter( result );
// shouldn't there be a lowercaser before stop word filtering?
result = new StopFilter( result, stoptable );
result = new GermanStemFilter( result, excltable );
result = new StopFilter( result, stopSet );
result = new GermanStemFilter( result, exclusionSet );
return result;
}
}

View File

@ -59,6 +59,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
* A filter that stems German words. It supports a table of words that should
@ -75,7 +77,7 @@ public final class GermanStemFilter extends TokenFilter
*/
private Token token = null;
private GermanStemmer stemmer = null;
private Hashtable exclusions = null;
private Set exclusionSet = null;
public GermanStemFilter( TokenStream in )
{
@ -85,13 +87,24 @@ public final class GermanStemFilter extends TokenFilter
/**
* Builds a GermanStemFilter that uses an exclusiontable.
* @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
*/
public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
{
this( in );
exclusions = exclusiontable;
exclusionSet = new HashSet(exclusiontable.keySet());
}
/**
* Builds a GermanStemFilter that uses an exclusiontable.
*/
public GermanStemFilter( TokenStream in, Set exclusionSet )
{
this( in );
this.exclusionSet = exclusionSet;
}
/**
* @return Returns the next token in the stream, or null at EOS
*/
@ -102,7 +115,7 @@ public final class GermanStemFilter extends TokenFilter
return null;
}
// Check the exclusiontable
else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
return token;
}
else {
@ -128,9 +141,18 @@ public final class GermanStemFilter extends TokenFilter
/**
* Set an alternative exclusion list for this filter.
* @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
*/
public void setExclusionTable( Hashtable exclusiontable )
{
exclusions = exclusiontable;
exclusionSet = new HashSet(exclusiontable.keySet());
}
/**
* Set an alternative exclusion list for this filter.
*/
public void setExclusionSet( Set exclusionSet )
{
this.exclusionSet = exclusionSet;
}
}

View File

@ -67,6 +67,8 @@ import java.util.Hashtable;
*
* @author Gerhard Schwarz
* @version $Id$
*
* @todo refactor to convert to Sets instead of Hashtable
*/
public class WordlistLoader {
/**
@ -92,6 +94,7 @@ public class WordlistLoader {
/**
* @param wordfile File containing the wordlist
* @todo Create a Set version of this method
*/
public static Hashtable getWordtable(File wordfile) {
if (wordfile == null) {

View File

@ -60,6 +60,8 @@ import org.apache.lucene.analysis.TokenStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
* Analyzer for Russian language. Supports an external list of stopwords (words that
@ -215,7 +217,7 @@ public final class RussianAnalyzer extends Analyzer
/**
* Contains the stopwords used with the StopFilter.
*/
private Hashtable stoptable = new Hashtable();
private Set stopSet = new HashSet();
/**
* Charset for Russian letters.
@ -227,7 +229,7 @@ public final class RussianAnalyzer extends Analyzer
public RussianAnalyzer() {
charset = RussianCharsets.UnicodeRussian;
stoptable = StopFilter.makeStopTable(
stopSet = StopFilter.makeStopSet(
makeStopWords(RussianCharsets.UnicodeRussian));
}
@ -237,7 +239,7 @@ public final class RussianAnalyzer extends Analyzer
public RussianAnalyzer(char[] charset)
{
this.charset = charset;
stoptable = StopFilter.makeStopTable(makeStopWords(charset));
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
}
/**
@ -246,7 +248,7 @@ public final class RussianAnalyzer extends Analyzer
public RussianAnalyzer(char[] charset, String[] stopwords)
{
this.charset = charset;
stoptable = StopFilter.makeStopTable(stopwords);
stopSet = StopFilter.makeStopSet(stopwords);
}
// Takes russian stop words and translates them to a String array, using
@ -270,11 +272,12 @@ public final class RussianAnalyzer extends Analyzer
/**
* Builds an analyzer with the given stop words.
* @todo create a Set version of this ctor
*/
public RussianAnalyzer(char[] charset, Hashtable stopwords)
{
this.charset = charset;
stoptable = stopwords;
stopSet = new HashSet(stopwords.keySet());
}
/**
@ -287,7 +290,7 @@ public final class RussianAnalyzer extends Analyzer
{
TokenStream result = new RussianLetterTokenizer(reader, charset);
result = new RussianLowerCaseFilter(result, charset);
result = new StopFilter(result, stoptable);
result = new StopFilter(result, stopSet);
result = new RussianStemFilter(result, charset);
return result;
}

View File

@ -56,7 +56,7 @@ package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.*;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
/**
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
@ -65,7 +65,7 @@ import java.util.Hashtable;
* @version $Id$
*/
public class StandardAnalyzer extends Analyzer {
private Hashtable stopTable;
private Set stopSet;
/** An array containing some common English words that are usually not
useful for searching. */
@ -78,7 +78,7 @@ public class StandardAnalyzer extends Analyzer {
/** Builds an analyzer with the given stop words. */
public StandardAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopTable(stopWords);
stopSet = StopFilter.makeStopSet(stopWords);
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
@ -87,7 +87,7 @@ public class StandardAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopTable);
result = new StopFilter(result, stopSet);
return result;
}
}