mirror of https://github.com/apache/lucene.git
convert StopFilter to use Set, with supporting changes to avoid calling deprecated methods.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150229 13f79535-47bb-0310-9956-ffa450edef68
parent 7ceae266c5
commit 87d1685b0f
StopAnalyzer.java
@@ -56,11 +56,12 @@ package org.apache.lucene.analysis;
 
 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
 
 /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
 
 public final class StopAnalyzer extends Analyzer {
-  private Hashtable stopTable;
+  private Set stopWords;
 
   /** An array containing some common English words that are not usually useful
     for searching. */
@@ -74,17 +75,17 @@ public final class StopAnalyzer extends Analyzer {
 
   /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
   public StopAnalyzer() {
-    stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
+    stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
   }
 
   /** Builds an analyzer which removes words in the provided array. */
   public StopAnalyzer(String[] stopWords) {
-    stopTable = StopFilter.makeStopTable(stopWords);
+    this.stopWords = StopFilter.makeStopSet(stopWords);
   }
 
   /** Filters LowerCaseTokenizer with StopFilter. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
+    return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
   }
 }
 
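Note (not part of the commit): a minimal sketch of how the converted StopAnalyzer behaves from a caller's perspective. The String[] constructor and the tokenStream signature come from the diff above; the demo class itself and the Token/termText() iteration assume the era's TokenStream API.

import java.io.StringReader;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class StopAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // Callers are unaffected by the Hashtable -> Set conversion:
    StopAnalyzer analyzer = new StopAnalyzer(new String[] { "the", "and" });
    TokenStream stream =
        analyzer.tokenStream("body", new StringReader("The quick and lazy fox"));
    for (Token t = stream.next(); t != null; t = stream.next())
      System.out.println(t.termText());   // quick, lazy, fox
  }
}
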
StopFilter.java
@@ -57,6 +57,7 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.Set;
 
 /**
  * Removes stop words from a token stream.
@@ -64,7 +65,7 @@ import java.util.Hashtable;
 
 public final class StopFilter extends TokenFilter {
 
-  private HashSet stopWords;
+  private Set stopWords;
 
   /**
   * Constructs a filter which removes words from the input
@@ -79,7 +80,7 @@ public final class StopFilter extends TokenFilter {
   * Constructs a filter which removes words from the input
   * TokenStream that are named in the Hashtable.
   *
-  * @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead
+  * @deprecated Use {@link #StopFilter(TokenStream, Set)} StopFilter(TokenStream,Map)} instead
   */
  public StopFilter(TokenStream in, Hashtable stopTable) {
    super(in);
@@ -89,8 +90,12 @@ public final class StopFilter extends TokenFilter {
  /**
   * Constructs a filter which removes words from the input
   * TokenStream that are named in the Set.
+  * It is crucial that an efficient Set implementation is used
+  * for maximum performance.
+  *
+  * @see #makeStopSet(java.lang.String[])
   */
- public StopFilter(TokenStream in, HashSet stopWords) {
+ public StopFilter(TokenStream in, Set stopWords) {
    super(in);
    this.stopWords = stopWords;
  }
@@ -116,7 +121,7 @@ public final class StopFilter extends TokenFilter {
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   */
- public static final HashSet makeStopSet(String[] stopWords) {
+ public static final Set makeStopSet(String[] stopWords) {
    HashSet stopTable = new HashSet(stopWords.length);
    for (int i = 0; i < stopWords.length; i++)
      stopTable.add(stopWords[i]);
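As the new javadoc notes, an efficient Set matters because StopFilter performs one membership test per token. A sketch (demo code, not from the commit; method and constructor names come from the diff) of building the stop set once and reusing it across streams:

import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

public class StopFilterDemo {
  // makeStopSet builds a HashSet sized to the word list; cache it once
  // per Analyzer rather than rebuilding it for every TokenStream.
  private static final Set STOP_WORDS =
      StopFilter.makeStopSet(new String[] { "a", "an", "the" });

  public static TokenStream tokens(Reader reader) {
    return new StopFilter(new LowerCaseTokenizer(reader), STOP_WORDS);
  }
}
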
GermanAnalyzer.java
@@ -62,6 +62,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import java.io.File;
 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;
 
 /**
  * Analyzer for German language. Supports an external list of stopwords (words that
@@ -96,19 +98,19 @@ public class GermanAnalyzer extends Analyzer
     /**
      * Contains the stopwords used with the StopFilter.
      */
-    private Hashtable stoptable = new Hashtable();
+    private Set stopSet = new HashSet();
 
     /**
      * Contains words that should be indexed but not stemmed.
      */
-    private Hashtable excltable = new Hashtable();
+    private Set exclusionSet = new HashSet();
 
     /**
      * Builds an analyzer.
      */
     public GermanAnalyzer()
     {
-        stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
+        stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS );
     }
 
     /**
@@ -116,7 +118,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public GermanAnalyzer( String[] stopwords )
     {
-        stoptable = StopFilter.makeStopTable( stopwords );
+        stopSet = StopFilter.makeStopSet( stopwords );
     }
 
     /**
@@ -124,7 +126,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public GermanAnalyzer( Hashtable stopwords )
     {
-        stoptable = stopwords;
+        stopSet = new HashSet(stopwords.keySet());
     }
 
     /**
@@ -132,7 +134,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public GermanAnalyzer( File stopwords )
     {
-        stoptable = WordlistLoader.getWordtable( stopwords );
+        stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
     }
 
     /**
@@ -140,7 +142,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public void setStemExclusionTable( String[] exclusionlist )
     {
-        excltable = StopFilter.makeStopTable( exclusionlist );
+        exclusionSet = StopFilter.makeStopSet( exclusionlist );
     }
 
     /**
@@ -148,7 +150,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public void setStemExclusionTable( Hashtable exclusionlist )
    {
-        excltable = exclusionlist;
+        exclusionSet = new HashSet(exclusionlist.keySet());
     }
 
     /**
@@ -156,7 +158,7 @@ public class GermanAnalyzer extends Analyzer
      */
     public void setStemExclusionTable( File exclusionlist )
     {
-        excltable = WordlistLoader.getWordtable( exclusionlist );
+        exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
     }
 
     /**
@@ -170,8 +172,8 @@ public class GermanAnalyzer extends Analyzer
         TokenStream result = new StandardTokenizer( reader );
         result = new StandardFilter( result );
         // shouldn't there be a lowercaser before stop word filtering?
-        result = new StopFilter( result, stoptable );
-        result = new GermanStemFilter( result, excltable );
+        result = new StopFilter( result, stopSet );
+        result = new GermanStemFilter( result, exclusionSet );
         return result;
     }
 }
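A sketch of the migration path this leaves callers (demo code, not from the commit; the org.apache.lucene.analysis.de package location is assumed): the Hashtable constructor still compiles but now copies the keys into a HashSet, while the String[] constructor goes straight through StopFilter.makeStopSet.

import java.util.Hashtable;
import org.apache.lucene.analysis.de.GermanAnalyzer;   // package location assumed

public class GermanAnalyzerDemo {
  public static void main(String[] args) {
    // Legacy style: the stop words are the Hashtable's keys,
    // now copied into a HashSet inside the constructor.
    Hashtable legacy = new Hashtable();
    legacy.put("und", "und");
    GermanAnalyzer fromTable = new GermanAnalyzer(legacy);

    // Preferred after this commit: the array is converted via makeStopSet.
    GermanAnalyzer fromArray = new GermanAnalyzer(new String[] { "und", "oder" });
  }
}
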
GermanStemFilter.java
@@ -59,6 +59,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import java.io.IOException;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;
 
 /**
  * A filter that stems German words. It supports a table of words that should
@@ -75,7 +77,7 @@ public final class GermanStemFilter extends TokenFilter
      */
     private Token token = null;
     private GermanStemmer stemmer = null;
-    private Hashtable exclusions = null;
+    private Set exclusionSet = null;
 
     public GermanStemFilter( TokenStream in )
     {
@@ -85,11 +87,22 @@ public final class GermanStemFilter extends TokenFilter
 
     /**
      * Builds a GermanStemFilter that uses an exclusiontable.
+     * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
      */
     public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
     {
       this( in );
-      exclusions = exclusiontable;
+      exclusionSet = new HashSet(exclusiontable.keySet());
+
+    }
+
+    /**
+     * Builds a GermanStemFilter that uses an exclusiontable.
+     */
+    public GermanStemFilter( TokenStream in, Set exclusionSet )
+    {
+      this( in );
+      this.exclusionSet = exclusionSet;
     }
 
     /**
@@ -102,7 +115,7 @@ public final class GermanStemFilter extends TokenFilter
         return null;
       }
       // Check the exclusiontable
-      else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
+      else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
         return token;
       }
       else {
@@ -128,9 +141,18 @@ public final class GermanStemFilter extends TokenFilter
 
     /**
      * Set an alternative exclusion list for this filter.
+     * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
      */
     public void setExclusionTable( Hashtable exclusiontable )
     {
-      exclusions = exclusiontable;
+      exclusionSet = new HashSet(exclusiontable.keySet());
+    }
+
+    /**
+     * Set an alternative exclusion list for this filter.
+     */
+    public void setExclusionSet( Set exclusionSet )
+    {
+      this.exclusionSet = exclusionSet;
    }
 }
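The new Set-based constructor and setExclusionSet replace the deprecated Hashtable pair; as the hunk above shows, the filter simply checks exclusionSet.contains(token.termText()) before stemming. A sketch (demo code; the de package location is assumed, StandardTokenizer is the tokenizer GermanAnalyzer itself uses):

import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanStemFilter;          // package assumed
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class GermanStemDemo {
  public static TokenStream stemmed(Reader reader) {
    Set exclusions = new HashSet();
    exclusions.add("Lucene");   // indexed, but never stemmed
    // New-style constructor takes the Set directly; no Hashtable bridge needed.
    return new GermanStemFilter(new StandardTokenizer(reader), exclusions);
  }
}
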
WordlistLoader.java
@@ -67,6 +67,8 @@ import java.util.Hashtable;
  *
  * @author Gerhard Schwarz
  * @version $Id$
+ *
+ * @todo refactor to convert to Sets instead of Hashtable
  */
 public class WordlistLoader {
   /**
@@ -92,6 +94,7 @@ public class WordlistLoader {
 
   /**
    * @param wordfile File containing the wordlist
+   * @todo Create a Set version of this method
    */
   public static Hashtable getWordtable(File wordfile) {
     if (wordfile == null) {
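Until the @todo lands and WordlistLoader grows a Set-returning method, the bridging idiom used throughout this diff is to wrap the Hashtable's key view. A sketch (demo class is assumed; the getWordtable call and keySet wrapping appear verbatim in the hunks above):

import java.io.File;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.de.WordlistLoader;   // package assumed

public class WordlistBridge {
  // getWordtable still returns a Hashtable whose keys are the words;
  // copying the keySet into a HashSet yields the Set the new APIs want.
  public static Set loadStopSet(File wordfile) {
    return new HashSet(WordlistLoader.getWordtable(wordfile).keySet());
  }
}
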
RussianAnalyzer.java
@@ -60,6 +60,8 @@ import org.apache.lucene.analysis.TokenStream;
 
 import java.io.Reader;
 import java.util.Hashtable;
+import java.util.Set;
+import java.util.HashSet;
 
 /**
  * Analyzer for Russian language. Supports an external list of stopwords (words that
@@ -215,7 +217,7 @@ public final class RussianAnalyzer extends Analyzer
     /**
      * Contains the stopwords used with the StopFilter.
      */
-    private Hashtable stoptable = new Hashtable();
+    private Set stopSet = new HashSet();
 
     /**
      * Charset for Russian letters.
@@ -227,7 +229,7 @@ public final class RussianAnalyzer extends Analyzer
 
     public RussianAnalyzer() {
         charset = RussianCharsets.UnicodeRussian;
-        stoptable = StopFilter.makeStopTable(
+        stopSet = StopFilter.makeStopSet(
             makeStopWords(RussianCharsets.UnicodeRussian));
     }
 
@@ -237,7 +239,7 @@ public final class RussianAnalyzer extends Analyzer
     public RussianAnalyzer(char[] charset)
     {
         this.charset = charset;
-        stoptable = StopFilter.makeStopTable(makeStopWords(charset));
+        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
     }
 
     /**
@@ -246,7 +248,7 @@ public final class RussianAnalyzer extends Analyzer
     public RussianAnalyzer(char[] charset, String[] stopwords)
     {
         this.charset = charset;
-        stoptable = StopFilter.makeStopTable(stopwords);
+        stopSet = StopFilter.makeStopSet(stopwords);
     }
 
     // Takes russian stop words and translates them to a String array, using
@@ -270,11 +272,12 @@ public final class RussianAnalyzer extends Analyzer
 
     /**
      * Builds an analyzer with the given stop words.
+     * @todo create a Set version of this ctor
      */
     public RussianAnalyzer(char[] charset, Hashtable stopwords)
     {
         this.charset = charset;
-        stoptable = stopwords;
+        stopSet = new HashSet(stopwords.keySet());
     }
 
     /**
@@ -287,7 +290,7 @@ public final class RussianAnalyzer extends Analyzer
     {
         TokenStream result = new RussianLetterTokenizer(reader, charset);
         result = new RussianLowerCaseFilter(result, charset);
-        result = new StopFilter(result, stoptable);
+        result = new StopFilter(result, stopSet);
         result = new RussianStemFilter(result, charset);
         return result;
     }
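A sketch of the constructors after conversion (demo code, not from the commit; the ru package location is assumed and the stop words below are placeholder values):

import org.apache.lucene.analysis.ru.RussianAnalyzer;    // package assumed
import org.apache.lucene.analysis.ru.RussianCharsets;

public class RussianAnalyzerDemo {
  public static void main(String[] args) {
    // Default: UnicodeRussian charset, built-in stop words via makeStopSet.
    RussianAnalyzer standard = new RussianAnalyzer();

    // Explicit charset plus caller-supplied stop words (placeholder list).
    RussianAnalyzer custom = new RussianAnalyzer(
        RussianCharsets.UnicodeRussian, new String[] { "и", "в" });
  }
}
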
StandardAnalyzer.java
@@ -56,7 +56,7 @@ package org.apache.lucene.analysis.standard;
 
 import org.apache.lucene.analysis.*;
 import java.io.Reader;
-import java.util.Hashtable;
+import java.util.Set;
 
 /**
  * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
@@ -65,7 +65,7 @@ import java.util.Hashtable;
  * @version $Id$
  */
 public class StandardAnalyzer extends Analyzer {
-  private Hashtable stopTable;
+  private Set stopSet;
 
   /** An array containing some common English words that are usually not
     useful for searching. */
@@ -78,7 +78,7 @@ public class StandardAnalyzer extends Analyzer {
 
   /** Builds an analyzer with the given stop words. */
   public StandardAnalyzer(String[] stopWords) {
-    stopTable = StopFilter.makeStopTable(stopWords);
+    stopSet = StopFilter.makeStopSet(stopWords);
   }
 
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
@@ -87,7 +87,7 @@ public class StandardAnalyzer extends Analyzer {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopTable);
+    result = new StopFilter(result, stopSet);
     return result;
   }
 }
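End to end, the StandardAnalyzer pipeline is unchanged (StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter); only the container handed to StopFilter is now a Set. A sketch (demo code, assuming the era's Token/termText() API):

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class StandardAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer(new String[] { "the" });
    TokenStream ts =
        analyzer.tokenStream("f", new StringReader("The Lucene library"));
    for (Token t = ts.next(); t != null; t = ts.next())
      System.out.println(t.termText());   // lucene, library
  }
}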