LUCENE-2034: Refactor analyzer reuse and stopword handling

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@895339 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-01-03 08:48:17 +00:00
parent 5e77ec9845
commit a949836869
25 changed files with 584 additions and 743 deletions


@ -105,6 +105,12 @@ New features
backwards compatibility. If Version < 3.1 is passed to the constructor,
LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)
* LUCENE-2034: Added ReusableAnalyzerBase, an abstract subclass of Analyzer
that makes it easier to reuse TokenStreams correctly. This issue also added
StopwordAnalyzerBase, which improves consistency of all Analyzers that use
stopwords; many analyzers in contrib were reimplemented with it.
(Simon Willnauer via Robert Muir)
Optimizations
* LUCENE-2086: When resolving deleted terms, do so in term sort order
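A minimal sketch of the new API this entry describes, using only the ReusableAnalyzerBase and TokenStreamComponents added later in this commit; MyAnalyzer and the choice of WhitespaceTokenizer are illustrative, not part of the change:

import java.io.Reader;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// Hypothetical analyzer: implement createComponents() once; the base class
// then provides both tokenStream() and reusableTokenStream() correctly.
public final class MyAnalyzer extends ReusableAnalyzerBase {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    return new TokenStreamComponents(new WhitespaceTokenizer(reader));
  }
}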


@ -19,17 +19,15 @@ package org.apache.lucene.analysis.ar;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@ -52,7 +50,7 @@ import org.apache.lucene.util.Version;
* </ul>
*
*/
public final class ArabicAnalyzer extends Analyzer {
public final class ArabicAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Arabic stopwords.
@ -62,21 +60,18 @@ public final class ArabicAnalyzer extends Analyzer {
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Contains the stopwords used with the StopFilter.
*/
private final Set<?> stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this will be ignored
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
*/
// TODO make this private
public static final String STOPWORDS_COMMENT = "#";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<String> getDefaultStopSet(){
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@ -85,34 +80,19 @@ public final class ArabicAnalyzer extends Analyzer {
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final Set<String> DEFAULT_STOP_SET;
static final Set<?> DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = loadDefaultStopWordSet();
DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
static Set<String> loadDefaultStopWordSet() throws IOException {
InputStream stream = ArabicAnalyzer.class
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
try {
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
// make sure it is unmodifiable as we expose it in the outer class
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
STOPWORDS_COMMENT));
} finally {
stream.close();
}
}
}
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
@ -129,8 +109,7 @@ public final class ArabicAnalyzer extends Analyzer {
* a stopword set
*/
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
super(matchVersion, stopwords);
}
/**
@ -159,54 +138,21 @@ public final class ArabicAnalyzer extends Analyzer {
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
* @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
* and {@link ArabicStemFilter}.
*/
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ArabicLetterTokenizer( reader );
result = new LowerCaseFilter(matchVersion, result);
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new ArabicLetterTokenizer(reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stoptable );
result = new ArabicNormalizationFilter( result );
result = new ArabicStemFilter( result );
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
* in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
* and {@link ArabicStemFilter}.
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new ArabicLetterTokenizer(reader);
streams.result = new LowerCaseFilter(matchVersion, streams.source);
// the order here is important: the stopword list is not normalized!
streams.result = new StopFilter( matchVersion, streams.result, stoptable);
streams.result = new ArabicNormalizationFilter(streams.result);
streams.result = new ArabicStemFilter(streams.result);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
result = new StopFilter( matchVersion, result, stopwords);
result = new ArabicNormalizationFilter(result);
return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
}
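For callers, the reuse contract now enforced by the base class looks like this; a usage sketch with hypothetical field name and input text (the caching is per thread, via Analyzer's getPreviousTokenStream()/setPreviousTokenStream()):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.util.Version;

class ReuseDemo {
  // Sketch: on the same thread the second call resets and returns the cached
  // chain instead of constructing new Tokenizer/filter instances.
  static void demo() throws IOException {
    ArabicAnalyzer analyzer = new ArabicAnalyzer(Version.LUCENE_CURRENT);
    TokenStream first = analyzer.reusableTokenStream("body", new StringReader("one"));
    TokenStream second = analyzer.reusableTokenStream("body", new StringReader("two"));
    assert first == second; // same sink instance, reset to read the new input
  }
}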


@ -17,17 +17,16 @@ package org.apache.lucene.analysis.bg;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@ -43,7 +42,7 @@ import org.apache.lucene.util.Version;
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
* <p>
*/
public final class BulgarianAnalyzer extends Analyzer {
public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Bulgarian stopwords.
@ -54,14 +53,12 @@ public final class BulgarianAnalyzer extends Analyzer {
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Contains the stopwords used with the StopFilter.
*/
private final Set<?> stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
*/
//TODO make this private
public static final String STOPWORDS_COMMENT = "#";
/**
@ -69,7 +66,7 @@ public final class BulgarianAnalyzer extends Analyzer {
*
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<String> getDefaultStopSet() {
public static Set<?> getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@ -78,35 +75,19 @@ public final class BulgarianAnalyzer extends Analyzer {
* class accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final Set<String> DEFAULT_STOP_SET;
static final Set<?> DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = loadDefaultStopWordSet();
DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (Exception ex) {
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
static Set<String> loadDefaultStopWordSet() throws IOException {
final InputStream stream = BulgarianAnalyzer.class
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
try {
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
// make sure it is unmodifiable as we expose it in the outer class
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
STOPWORDS_COMMENT));
} finally {
if(stream != null)
stream.close();
}
}
}
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
@ -119,58 +100,24 @@ public final class BulgarianAnalyzer extends Analyzer {
* Builds an analyzer with the given stop words.
*/
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
super();
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
stopwords));
this.matchVersion = matchVersion;
super(matchVersion, stopwords);
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
* {@link Reader}.
*
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link BulgarianStemFilter}.
*/
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stoptable);
result = new StopFilter(matchVersion, result, stopwords);
result = new BulgarianStemFilter(result);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
* text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link BulgarianStemFilter}.
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
streams.result = new BulgarianStemFilter(streams.result);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
return new TokenStreamComponents(source, result);
}
}
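The analyzers in this commit all share the same recipe; a hedged sketch of that pattern, assuming only what this diff shows of StopwordAnalyzerBase (a protected matchVersion field, a protected stopwords set, and the (Version, Set) constructor) — MyStopwordAnalyzer is hypothetical:

import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical analyzer showing the common shape: build the chain once in
// createComponents() and hand the source and the sink to TokenStreamComponents.
public final class MyStopwordAnalyzer extends StopwordAnalyzerBase {
  public MyStopwordAnalyzer(Version matchVersion, Set<?> stopwords) {
    super(matchVersion, stopwords);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
  }
}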


@ -21,19 +21,21 @@ import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
@ -49,7 +51,7 @@ import org.apache.lucene.util.Version;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class BrazilianAnalyzer extends Analyzer {
public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
/**
* List of typical Brazilian Portuguese stopwords.
@ -91,19 +93,13 @@ public final class BrazilianAnalyzer extends Analyzer {
Arrays.asList(BRAZILIAN_STOP_WORDS), false));
}
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private final Set<?> stoptable;
/**
* Contains words that should be indexed but not stemmed.
*/
// TODO make this private in 3.1
private Set<?> excltable = Collections.emptySet();
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
*/
@ -120,8 +116,7 @@ public final class BrazilianAnalyzer extends Analyzer {
* a stopword set
*/
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
super(matchVersion, stopwords);
}
/**
@ -188,53 +183,22 @@ public final class BrazilianAnalyzer extends Analyzer {
excltable = WordlistLoader.getWordSet( exclusionlist );
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
* {@link BrazilianStemFilter}.
*/
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer( matchVersion, reader );
result = new LowerCaseFilter( matchVersion, result );
result = new StandardFilter( result );
result = new StopFilter( matchVersion, result, stoptable );
result = new BrazilianStemFilter( result, excltable );
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
* in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
* {@link BrazilianStemFilter}.
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new StandardFilter(streams.result);
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
streams.result = new BrazilianStemFilter(streams.result, excltable);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
}
/**
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
* {@link BrazilianStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new StandardFilter(result);
result = new StopFilter(matchVersion, result, stopwords);
return new TokenStreamComponents(source, new BrazilianStemFilter(result,
excltable));
}
}


@ -19,12 +19,12 @@ package org.apache.lucene.analysis.cjk;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
@ -35,7 +35,7 @@ import java.util.Set;
* filters with {@link StopFilter}
*
*/
public final class CJKAnalyzer extends Analyzer {
public final class CJKAnalyzer extends StopwordAnalyzerBase {
//~ Static fields/initializers ---------------------------------------------
/**
@ -71,11 +71,6 @@ public final class CJKAnalyzer extends Analyzer {
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
false));
}
/**
* stop word list
*/
private final Set<?> stopTable;
private final Version matchVersion;
//~ Constructors -----------------------------------------------------------
@ -95,8 +90,7 @@ public final class CJKAnalyzer extends Analyzer {
* a stopword set
*/
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
super(matchVersion, stopwords);
}
/**
@ -106,51 +100,15 @@ public final class CJKAnalyzer extends Analyzer {
* @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
*/
public CJKAnalyzer(Version matchVersion, String... stopWords) {
stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
this.matchVersion = matchVersion;
super(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
}
//~ Methods ----------------------------------------------------------------
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
* @param fieldName lucene field name
* @param reader input {@link Reader}
* @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
* {@link StopFilter}
*/
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
* in the provided {@link Reader}.
*
* @param fieldName lucene field name
* @param reader Input {@link Reader}
* @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
* {@link StopFilter}
*/
@Override
public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
/* tokenStream() is final, no back compat issue */
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new CJKTokenizer(reader);
streams.result = new StopFilter(matchVersion, streams.source, stopTable);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new CJKTokenizer(reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
}
}


@ -25,8 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/**
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages.


@ -17,10 +17,11 @@ package org.apache.lucene.analysis.cn;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
@ -29,49 +30,19 @@ import org.apache.lucene.analysis.Tokenizer;
*
*/
public final class ChineseAnalyzer extends Analyzer {
public final class ChineseAnalyzer extends ReusableAnalyzerBase {
public ChineseAnalyzer() {
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link ChineseTokenizer}
* filtered with {@link ChineseFilter}.
*/
/**
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
* provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a
* {@link ChineseTokenizer} filtered with {@link ChineseFilter}
*/
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ChineseTokenizer(reader);
result = new ChineseFilter(result);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the
* provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link ChineseTokenizer}
* filtered with {@link ChineseFilter}.
*/
@Override
public final TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
/* tokenStream() is final, no back compat issue */
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new ChineseTokenizer(reader);
streams.result = new ChineseFilter(streams.source);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new ChineseTokenizer(reader);
return new TokenStreamComponents(source, new ChineseFilter(source));
}
}


@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz;
* limitations under the License.
*/
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@ -30,9 +32,9 @@ import org.apache.lucene.util.Version;
import java.io.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.Collections;
/**
* {@link Analyzer} for Czech language.
@ -53,7 +55,7 @@ import java.util.Collections;
* <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
* </ul>
*/
public final class CzechAnalyzer extends Analyzer {
public final class CzechAnalyzer extends ReusableAnalyzerBase {
/**
* List of typical stopwords.
@ -95,10 +97,11 @@ public final class CzechAnalyzer extends Analyzer {
Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
}
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
// TODO make this final in 3.1
// TODO once loadStopWords is gone those members should be removed too in favor of StopwordAnalyzerBase
private Set<?> stoptable;
private final Version matchVersion;
@ -168,6 +171,7 @@ public final class CzechAnalyzer extends Analyzer {
* @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
* and {@link #CzechAnalyzer(Version, Set)} instead
*/
// TODO extend StopwordAnalyzerBase once this method is gone!
public void loadStopWords( InputStream wordfile, String encoding ) {
setPreviousTokenStream(null); // force a new stopfilter to be created
if ( wordfile == null ) {
@ -191,58 +195,25 @@ public final class CzechAnalyzer extends Analyzer {
stoptable = Collections.emptySet();
}
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link CzechStemFilter} (only if version is
* >= LUCENE_31)
*/
@Override
public final TokenStream tokenStream( String fieldName, Reader reader ) {
TokenStream result = new StandardTokenizer( matchVersion, reader );
result = new StandardFilter( result );
result = new LowerCaseFilter( matchVersion, result );
result = new StopFilter( matchVersion, result, stoptable );
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new CzechStemFilter(result);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
* text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link CzechStemFilter} (only if version is
* >= LUCENE_31)
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter( matchVersion, streams.result, stoptable);
if (matchVersion.onOrAfter(Version.LUCENE_31))
streams.result = new CzechStemFilter(streams.result);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
}
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stoptable);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new CzechStemFilter(result);
return new TokenStreamComponents(source, result);
}
}


@ -29,13 +29,15 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
/**
@ -51,7 +53,7 @@ import org.apache.lucene.util.Version;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class GermanAnalyzer extends Analyzer {
public final class GermanAnalyzer extends StopwordAnalyzerBase {
/**
* List of typical german stopwords.
@ -89,17 +91,13 @@ public final class GermanAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
//TODO make this final in 3.1
private Set<?> stopSet;
/**
* Contains words that should be indexed but not stemmed.
*/
// TODO make this final in 3.1
private Set<?> exclusionSet;
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words:
* {@link #getDefaultStopSet()}.
@ -131,9 +129,8 @@ public final class GermanAnalyzer extends Analyzer {
* a stemming exclusion set
*/
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
super(matchVersion, stopwords);
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
this.matchVersion = matchVersion;
}
/**
@ -187,51 +184,23 @@ public final class GermanAnalyzer extends Analyzer {
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
* provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a
* {@link StandardTokenizer} filtered with {@link StandardFilter},
* {@link LowerCaseFilter}, {@link StopFilter}, and
* {@link GermanStemFilter}
*/
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopSet);
result = new GermanStemFilter(result, exclusionSet);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
* in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
* {@link GermanStemFilter}
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter( matchVersion, streams.result, stopSet);
streams.result = new GermanStemFilter(streams.result, exclusionSet);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
result = new StopFilter( matchVersion, result, stopwords);
return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
}
}


@ -19,14 +19,15 @@ package org.apache.lucene.analysis.el;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
@ -43,7 +44,7 @@ import java.util.Set;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class GreekAnalyzer extends Analyzer
public final class GreekAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Greek stopwords.
@ -73,13 +74,6 @@ public final class GreekAnalyzer extends Analyzer
Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
}
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private final Set<?> stopSet;
private final Version matchVersion;
public GreekAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
@ -93,8 +87,7 @@ public final class GreekAnalyzer extends Analyzer
* a stopword set
*/
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
super(matchVersion, stopwords);
}
/**
@ -115,47 +108,20 @@ public final class GreekAnalyzer extends Analyzer
{
this(matchVersion, stopwords.keySet());
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
* {@link GreekLowerCaseFilter} and {@link StopFilter}
*/
/**
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
* provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a
* {@link StandardTokenizer} filtered with
* {@link GreekLowerCaseFilter} and {@link StopFilter}
*/
@Override
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new GreekLowerCaseFilter(result);
result = new StopFilter(matchVersion, result, stopSet);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
* in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
* {@link GreekLowerCaseFilter} and {@link StopFilter}
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new GreekLowerCaseFilter(streams.source);
streams.result = new StopFilter(matchVersion, streams.result, stopSet);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
final TokenStream result = new GreekLowerCaseFilter(source);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}


@ -19,17 +19,15 @@ package org.apache.lucene.analysis.fa;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@ -45,7 +43,7 @@ import org.apache.lucene.util.Version;
* yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
* </p>
*/
public final class PersianAnalyzer extends Analyzer {
public final class PersianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Persian stopwords.
@ -57,11 +55,6 @@ public final class PersianAnalyzer extends Analyzer {
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Contains the stopwords used with the StopFilter.
*/
private final Set<?> stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
@ -85,30 +78,15 @@ public final class PersianAnalyzer extends Analyzer {
static {
try {
DEFAULT_STOP_SET = loadDefaultStopWordSet();
DEFAULT_STOP_SET = loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
static Set<String> loadDefaultStopWordSet() throws IOException {
InputStream stream = PersianAnalyzer.class
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
try {
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
// make sure it is unmodifiable as we expose it in the outer class
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
STOPWORDS_COMMENT));
} finally {
stream.close();
}
}
}
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
@ -126,8 +104,7 @@ public final class PersianAnalyzer extends Analyzer {
* a stopword set
*/
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
super(matchVersion, stopwords);
}
/**
@ -156,18 +133,19 @@ public final class PersianAnalyzer extends Analyzer {
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
* @return {@link TokenStreamComponents} built from a {@link ArabicLetterTokenizer}
* filtered with {@link LowerCaseFilter},
* {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ArabicLetterTokenizer(reader);
result = new LowerCaseFilter(matchVersion, result);
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new ArabicLetterTokenizer(reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
@ -175,44 +153,6 @@ public final class PersianAnalyzer extends Analyzer {
* the order here is important: the stopword list is normalized with the
* above!
*/
result = new StopFilter(matchVersion, result, stoptable);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
}
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
* in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
* filtered with {@link LowerCaseFilter},
* {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new ArabicLetterTokenizer(reader);
streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new ArabicNormalizationFilter(streams.result);
/* additional persian-specific normalization */
streams.result = new PersianNormalizationFilter(streams.result);
/*
* the order here is important: the stopword list is normalized with the
* above!
*/
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}


@ -20,7 +20,9 @@ package org.apache.lucene.analysis.fr;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@ -59,7 +61,7 @@ import java.util.Set;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class FrenchAnalyzer extends Analyzer {
public final class FrenchAnalyzer extends StopwordAnalyzerBase {
/**
* Extended list of typical French stopwords.
@ -91,18 +93,12 @@ public final class FrenchAnalyzer extends Analyzer {
"été", "être", "ô"
};
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private final Set<?> stoptable;
/**
* Contains words that should be indexed but not stemmed.
*/
//TODO make this final in 3.0
private Set<?> excltable = Collections.<Object>emptySet();
private final Version matchVersion;
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
@ -148,9 +144,7 @@ public final class FrenchAnalyzer extends Analyzer {
*/
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
Set<?> stemExclutionSet) {
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
.copy(matchVersion, stopwords));
super(matchVersion, stopwords);
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(matchVersion, stemExclutionSet));
}
@ -202,54 +196,22 @@ public final class FrenchAnalyzer extends Analyzer {
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link StopFilter},
* {@link FrenchStemFilter} and {@link LowerCaseFilter}
*/
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
result = new StopFilter(matchVersion, result, stoptable);
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
result = new StopFilter(matchVersion, result, stopwords);
result = new FrenchStemFilter(result, excltable);
// Convert to lowercase after stemming!
result = new LowerCaseFilter(matchVersion, result);
return result;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
* text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link StopFilter},
* {@link FrenchStemFilter} and {@link LowerCaseFilter}
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
streams.result = new FrenchStemFilter(streams.result, excltable);
// Convert to lowercase after stemming!
streams.result = new LowerCaseFilter(matchVersion, streams.result);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
}
}


@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ru;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
@ -26,7 +25,9 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
@ -39,7 +40,7 @@ import org.apache.lucene.util.Version;
* A default set of stopwords is used unless an alternative list is specified.
* </p>
*/
public final class RussianAnalyzer extends Analyzer
public final class RussianAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Russian stopwords.
@ -63,13 +64,6 @@ public final class RussianAnalyzer extends Analyzer
Arrays.asList(RUSSIAN_STOP_WORDS), false));
}
/**
* Contains the stopwords used with the StopFilter.
*/
private final Set<?> stopSet;
private final Version matchVersion;
public RussianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
@ -91,8 +85,7 @@ public final class RussianAnalyzer extends Analyzer
* a stopword set
*/
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
super(matchVersion, stopwords);
}
/**
@ -106,52 +99,21 @@ public final class RussianAnalyzer extends Analyzer
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
* provided {@link Reader}.
*
* @return A {@link TokenStream} built from a
* @return {@link TokenStreamComponents} built from a
* {@link RussianLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* and {@link RussianStemFilter}
*/
@Override
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream result = new RussianLetterTokenizer(reader);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopSet);
result = new RussianStemFilter(result);
return result;
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new RussianLetterTokenizer(reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new StopFilter(matchVersion, result, stopwords);
return new TokenStreamComponents(source, new RussianStemFilter(result));
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
/**
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
* in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a
* {@link RussianLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* and {@link RussianStemFilter}
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new RussianLetterTokenizer(reader);
streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new StopFilter(matchVersion, streams.result, stopSet);
streams.result = new RussianStemFilter(streams.result);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
}
return streams.result;
}
}


@ -16,16 +16,18 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
/**
@ -35,41 +37,28 @@ import org.apache.lucene.util.Version;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class ThaiAnalyzer extends Analyzer {
public final class ThaiAnalyzer extends ReusableAnalyzerBase {
private final Version matchVersion;
public ThaiAnalyzer(Version matchVersion) {
this.matchVersion = matchVersion;
}
/**
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
* provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a
* {@link StandardTokenizer} filtered with {@link StandardFilter},
* {@link ThaiWordFilter}, and {@link StopFilter}
*/
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream ts = new StandardTokenizer(matchVersion, reader);
ts = new StandardFilter(ts);
ts = new ThaiWordFilter(ts);
ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return ts;
}
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new ThaiWordFilter(streams.result);
streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
streams.result.reset(); // reset the ThaiWordFilter's state
}
return streams.result;
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(source);
result = new ThaiWordFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion,
result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
}
}


@ -17,10 +17,10 @@ package org.apache.lucene.analysis.ar;
* limitations under the License.
*/
import java.io.StringReader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
@ -78,7 +78,9 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
* Test that custom stopwords work, and are not case-sensitive.
*/
public void testCustomStopwords() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
Set<String> set = new HashSet<String>();
Collections.addAll(set, "the", "and", "a");
ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" });
}


@ -17,10 +17,12 @@ package org.apache.lucene.analysis.br;
* limitations under the License.
*/
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**


@ -17,11 +17,8 @@ package org.apache.lucene.analysis.fa;
* limitations under the License.
*/
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**


@ -0,0 +1,163 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* A convenience subclass of Analyzer that makes it easy to implement
* {@link TokenStream} reuse.
* <p>
* ReusableAnalyzerBase is a simplification of Analyzer that supports easy reuse
* for the most common use-cases. Analyzers such as
* {@link PerFieldAnalyzerWrapper} that behave differently depending upon the
* field name need to subclass Analyzer directly instead.
* </p>
* <p>
* To prevent consistency problems, this class does not allow subclasses to
* override {@link #reusableTokenStream(String, Reader)} or
* {@link #tokenStream(String, Reader)} directly. Instead, subclasses must
* implement {@link #createComponents(String, Reader)}.
* </p>
*/
public abstract class ReusableAnalyzerBase extends Analyzer {
/**
* Creates a new {@link TokenStreamComponents} instance for this analyzer.
*
* @param fieldName
the name of the field whose content is passed to the
{@link TokenStreamComponents} source as a reader
* @param aReader
* the reader passed to the {@link Tokenizer} constructor
* @return the {@link TokenStreamComponents} for this analyzer.
*/
protected abstract TokenStreamComponents createComponents(String fieldName,
Reader aReader);
/**
* This method uses {@link #createComponents(String, Reader)} to obtain an
* instance of {@link TokenStreamComponents}. It returns the sink of the
* components and stores the components internally. Subsequent calls to this
* method will reuse the previously stored components if and only if the
* {@link TokenStreamComponents#reset(Reader)} method returned
* <code>true</code>. Otherwise a new instance of
* {@link TokenStreamComponents} is created.
*
* @param fieldName the name of the field the created TokenStream is used for
* @param reader the reader the streams source reads from
*/
@Override
public final TokenStream reusableTokenStream(final String fieldName,
final Reader reader) throws IOException {
TokenStreamComponents streamChain = (TokenStreamComponents)
getPreviousTokenStream();
if (streamChain == null || !streamChain.reset(reader)) {
streamChain = createComponents(fieldName, reader);
setPreviousTokenStream(streamChain);
}
return streamChain.getTokenStream();
}
/**
* This method uses {@link #createComponents(String, Reader)} to obtain an
* instance of {@link TokenStreamComponents} and returns the sink of the
* components. Each call to this method will create a new instance of
* {@link TokenStreamComponents}. Created {@link TokenStream} instances are
* never reused.
*
* @param fieldName the name of the field the created TokenStream is used for
* @param reader the reader the stream's source reads from
*/
@Override
public final TokenStream tokenStream(final String fieldName,
final Reader reader) {
return createComponents(fieldName, reader).getTokenStream();
}
/**
* This class encapsulates the outer components of a token stream. It provides
* access to the source ({@link Tokenizer}) and to the outer end (sink), a
* {@link TokenStream} (typically a {@link TokenFilter}) that also serves as
* the {@link TokenStream} returned by
* {@link Analyzer#tokenStream(String, Reader)} and
* {@link Analyzer#reusableTokenStream(String, Reader)}.
*/
public static class TokenStreamComponents {
final Tokenizer source;
final TokenStream sink;
/**
* Creates a new {@link TokenStreamComponents} instance.
*
* @param source
* the analyzer's tokenizer
* @param result
* the analyzer's resulting token stream
*/
public TokenStreamComponents(final Tokenizer source,
final TokenStream result) {
this.source = source;
this.sink = result;
}
/**
* Creates a new {@link TokenStreamComponents} instance.
*
* @param source
* the analyzer's tokenizer
*/
public TokenStreamComponents(final Tokenizer source) {
this.source = source;
this.sink = source;
}
/**
* Resets the encapsulated components with the given reader. By default this
* method returns <code>true</code>, indicating that the components have
* been reset successfully. Subclasses of {@link ReusableAnalyzerBase} might
* use their own {@link TokenStreamComponents} subclass and return
* <code>false</code> if the components cannot be reset.
*
* @param reader
* a reader to reset the source component
* @return <code>true</code> if the components were reset, otherwise
* <code>false</code>
* @throws IOException
* if the component's reset method throws an {@link IOException}
*/
protected boolean reset(final Reader reader) throws IOException {
source.reset(reader);
if (sink != source)
sink.reset(); // only reset if the sink reference is different from source
return true;
}
/**
* Returns the sink {@link TokenStream}
*
* @return the sink {@link TokenStream}
*/
protected TokenStream getTokenStream() {
return sink;
}
}
}
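
Usage sketch (illustrative only, not part of this patch; the analyzer name, tokenizer and filter choices, and length bounds are assumptions): a subclass only implements createComponents(), and the final reusableTokenStream() above then caches the returned components per thread and resets them on subsequent calls.

import java.io.Reader;

import org.apache.lucene.analysis.LengthFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

/** Hypothetical analyzer: whitespace tokens between 2 and 20 chars long. */
public final class LengthLimitAnalyzer extends ReusableAnalyzerBase {
  @Override
  protected TokenStreamComponents createComponents(final String fieldName,
      final Reader reader) {
    // source: the tokenizer that consumes the incoming reader
    final Tokenizer source = new WhitespaceTokenizer(reader);
    // sink: the outer end of the chain, handed back to consumers
    return new TokenStreamComponents(source, new LengthFilter(source, 2, 20));
  }
}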


@@ -18,25 +18,15 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
import java.io.IOException;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter} */
public final class SimpleAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new LowerCaseTokenizer(reader);
}
public final class SimpleAnalyzer extends ReusableAnalyzerBase {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
if (tokenizer == null) {
tokenizer = new LowerCaseTokenizer(reader);
setPreviousTokenStream(tokenizer);
} else
tokenizer.reset(reader);
return tokenizer;
protected TokenStreamComponents createComponents(final String fieldName,
final Reader reader) {
return new TokenStreamComponents(new LowerCaseTokenizer(reader));
}
}
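
A consumer-side sketch (assumed demo code, not in the commit), showing the reuse entry point together with the attribute-based iteration idiom:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class SimpleAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    SimpleAnalyzer analyzer = new SimpleAnalyzer();
    // later calls on this thread reuse the cached LowerCaseTokenizer
    TokenStream stream = analyzer.reusableTokenStream("body",
        new StringReader("Hello, World!"));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.term()); // prints "hello", then "world"
    }
    stream.end();
    stream.close();
  }
}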


@@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.Set;
import java.util.List;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.util.Version;
/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
@@ -38,9 +39,7 @@ import org.apache.lucene.util.Version;
* </ul>
*/
public final class StopAnalyzer extends Analyzer {
private final Set<?> stopWords;
private final Version matchVersion;
public final class StopAnalyzer extends StopwordAnalyzerBase {
/** An unmodifiable set containing some common English words that are not usually useful
for searching.*/
@@ -65,16 +64,14 @@ public final class StopAnalyzer extends Analyzer {
* @param matchVersion See <a href="#version">above</a>
*/
public StopAnalyzer(Version matchVersion) {
stopWords = ENGLISH_STOP_WORDS_SET;
this.matchVersion = matchVersion;
this(matchVersion, ENGLISH_STOP_WORDS_SET);
}
/** Builds an analyzer with the stop words from the given set.
* @param matchVersion See <a href="#version">above</a>
* @param stopWords Set of stop words */
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
this.stopWords = stopWords;
this.matchVersion = matchVersion;
super(matchVersion, stopWords);
}
/** Builds an analyzer with the stop words from the given file.
@@ -82,8 +79,7 @@ public final class StopAnalyzer extends Analyzer {
* @param matchVersion See <a href="#version">above</a>
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwordsFile);
this.matchVersion = matchVersion;
this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -91,34 +87,21 @@ public final class StopAnalyzer extends Analyzer {
* @param matchVersion See <a href="#version">above</a>
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwords);
this.matchVersion = matchVersion;
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/** Filters LowerCaseTokenizer with StopFilter. */
/**
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a {@link LowerCaseTokenizer} filtered with
* {@link StopFilter}
*/
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(matchVersion,
new LowerCaseTokenizer(reader), stopWords);
}
/** Filters LowerCaseTokenizer with StopFilter. */
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new LowerCaseTokenizer(reader);
streams.result = new StopFilter(matchVersion,
streams.source, stopWords);
setPreviousTokenStream(streams);
} else
streams.source.reset(reader);
return streams.result;
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new LowerCaseTokenizer(reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion,
source, stopwords));
}
}
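
A construction sketch for the simplified class (stopword contents and version are assumptions): the set passed in is copied into an unmodifiable CharArraySet by the new superclass, so later changes to the caller's set do not leak into the analyzer.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.util.Version;

public class StopAnalyzerDemo {
  public static void main(String[] args) {
    Set<String> stop = new HashSet<String>(Arrays.asList("the", "and", "a"));
    StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_31, stop);
    stop.add("of"); // has no effect on the analyzer's copied set
    System.out.println(analyzer.getStopwordSet().size()); // 3
  }
}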


@@ -0,0 +1,110 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.Version;
/**
* Base class for Analyzers that need to make use of stopword sets.
*
*/
public abstract class StopwordAnalyzerBase extends ReusableAnalyzerBase {
/**
* An immutable stopword set
*/
protected final CharArraySet stopwords;
protected final Version matchVersion;
/**
* Returns the analyzer's stopword set or an empty set if the analyzer has no
* stopwords
*
* @return the analyzer's stopword set or an empty set if the analyzer has no
* stopwords
*/
public Set<?> getStopwordSet() {
return stopwords;
}
/**
* Creates a new instance initialized with the given stopword set
*
* @param version
* the Lucene version for cross version compatibility
* @param stopwords
* the analyzer's stopword set
*/
protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
/*
* no need to call
* setOverridesTokenStreamMethod(StopwordAnalyzerBase.class); here, both
* tokenStream methods are final in this class.
*/
matchVersion = version;
// analyzers should use char array set for stopwords!
this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
.unmodifiableSet(CharArraySet.copy(version, stopwords));
}
/**
* Creates a new Analyzer with an empty stopword set
*
* @param version
* the Lucene version for cross version compatibility
*/
protected StopwordAnalyzerBase(final Version version) {
this(version, null);
}
/**
* Creates a CharArraySet from a file resource associated with a class. (See
* {@link Class#getResourceAsStream(String)}).
*
* @param ignoreCase
* <code>true</code> if the set should ignore the case of the
* stopwords, otherwise <code>false</code>
* @param aClass
* a class that is associated with the given stopwordResource
* @param resource
* name of the resource file associated with the given class
* @param comment
* comment string to ignore in the stopword file
* @return a CharArraySet containing the distinct stopwords from the given
* file
* @throws IOException
* if loading the stopwords throws an {@link IOException}
*/
protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
final Class<? extends ReusableAnalyzerBase> aClass, final String resource,
final String comment) throws IOException {
final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
comment);
final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
set.addAll(wordSet);
return set;
}
}
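
A hypothetical subclass sketch (the class and resource names are assumptions): the default set is loaded once via loadStopwordSet(), and the protected stopwords and matchVersion fields are wired into the component chain.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;

public final class DemoStopAnalyzer extends StopwordAnalyzerBase {
  private static final CharArraySet DEFAULT_SET;
  static {
    try {
      // bundled next to DemoStopAnalyzer.class; '#' marks comment lines
      DEFAULT_SET = loadStopwordSet(true, DemoStopAnalyzer.class,
          "demo-stopwords.txt", "#");
    } catch (IOException ex) {
      throw new RuntimeException("unable to load default stopword set", ex);
    }
  }

  public DemoStopAnalyzer(Version matchVersion) {
    super(matchVersion, DEFAULT_SET);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new LowerCaseTokenizer(reader);
    return new TokenStreamComponents(source, new StopFilter(matchVersion,
        source, stopwords));
  }
}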


@@ -18,24 +18,14 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
import java.io.IOException;
/** An Analyzer that uses {@link WhitespaceTokenizer}. */
public final class WhitespaceAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
public final class WhitespaceAnalyzer extends ReusableAnalyzerBase {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
if (tokenizer == null) {
tokenizer = new WhitespaceTokenizer(reader);
setPreviousTokenStream(tokenizer);
} else
tokenizer.reset(reader);
return tokenizer;
protected TokenStreamComponents createComponents(final String fieldName,
final Reader reader) {
return new TokenStreamComponents(new WhitespaceTokenizer(reader));
}
}
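
For contrast with SimpleAnalyzer above, an assumed usage sketch: whitespace tokenization preserves case and punctuation.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class WhitespaceAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    TokenStream stream = new WhitespaceAnalyzer().reusableTokenStream("f",
        new StringReader("Hello, World!"));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.term()); // prints "Hello," then "World!"
    }
    stream.end();
    stream.close();
  }
}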


@@ -21,15 +21,69 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
/**
* Loader for text files that represent a list of stopwords.
*/
public class WordlistLoader {
/**
* Loads a text file associated with a given class (See
* {@link Class#getResourceAsStream(String)}) and adds every line as an entry
* to a {@link Set} (omitting leading and trailing whitespace). Every line of
* the file should contain only one word. The words must be lower-cased if you
* use an Analyzer that applies LowerCaseFilter (such as StandardAnalyzer).
*
* @param aClass
* a class that is associated with the given stopwordResource
* @param stopwordResource
* name of the resource file associated with the given class
* @return a {@link Set} with the file's words
*/
public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
throws IOException {
final Reader reader = new BufferedReader(new InputStreamReader(aClass
.getResourceAsStream(stopwordResource), "UTF-8"));
try {
return getWordSet(reader);
} finally {
reader.close();
}
}
/**
* Loads a text file associated with a given class (See
* {@link Class#getResourceAsStream(String)}) and adds every line as an entry
* to a {@link Set} (omitting leading and trailing whitespace). Every line of
* the file should contain only one word. The words must be lower-cased if you
* use an Analyzer that applies LowerCaseFilter (such as StandardAnalyzer).
*
* @param aClass
* a class that is associated with the given stopwordResource
* @param stopwordResource
* name of the resource file associated with the given class
* @param comment
* the comment string to ignore
* @return a {@link Set} with the file's words
*/
public static Set<String> getWordSet(Class<?> aClass,
String stopwordResource, String comment) throws IOException {
final Reader reader = new BufferedReader(new InputStreamReader(aClass
.getResourceAsStream(stopwordResource), "UTF-8"));
try {
return getWordSet(reader, comment);
} finally {
reader.close();
}
}
/**
* Loads a text file and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the file should contain only
@@ -40,17 +94,15 @@ public class WordlistLoader {
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile) throws IOException {
HashSet<String> result = new HashSet<String>();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
result = getWordSet(reader);
return getWordSet(reader);
}
finally {
if (reader != null)
reader.close();
}
return result;
}
/**
@@ -64,17 +116,15 @@
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
HashSet<String> result = new HashSet<String>();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
result = getWordSet(reader, comment);
return getWordSet(reader, comment);
}
finally {
if (reader != null)
reader.close();
}
return result;
}
@@ -88,7 +138,7 @@
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader) throws IOException {
HashSet<String> result = new HashSet<String>();
final HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
@@ -119,7 +169,7 @@
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
HashSet<String> result = new HashSet<String>();
final HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
@@ -154,21 +204,18 @@
public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
if (wordstemfile == null)
throw new NullPointerException("wordstemfile may not be null");
HashMap<String, String> result = new HashMap<String, String>();
final HashMap<String, String> result = new HashMap<String,String>();
BufferedReader br = null;
FileReader fr = null;
try {
fr = new FileReader(wordstemfile);
br = new BufferedReader(fr);
br = new BufferedReader(new FileReader(wordstemfile));
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
if (fr != null)
fr.close();
if (br != null)
if(br != null)
br.close();
}
return result;
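
An assumed usage sketch of the new classpath-based loader and the stem-dictionary loader (resource and file names are hypothetical); the classpath variant resolves the resource relative to the given class's package.

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Set;

import org.apache.lucene.analysis.WordlistLoader;

public class WordlistLoaderDemo {
  public static void main(String[] args) throws IOException {
    // classpath-based: loads stopwords.txt from WordlistLoaderDemo's package,
    // skipping lines that start with "#"
    Set<String> words = WordlistLoader.getWordSet(WordlistLoaderDemo.class,
        "stopwords.txt", "#");
    // file-based stem dictionary: each line is "word<TAB>stem"
    HashMap<String, String> stems =
        WordlistLoader.getStemDict(new File("stems.txt"));
    System.out.println(words.size() + " stopwords, "
        + stems.size() + " stem mappings");
  }
}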


@@ -0,0 +1,5 @@
#comment
ONE
two
#comment
three


@@ -0,0 +1,3 @@
ONE
two
three
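
These two test resources exercise comment handling: loaded with comment string "#", the file with comments should yield the same three words as the plain file. A sketch under assumed resource names:

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.analysis.WordlistLoader;

public class StopwordResourceDemo {
  public static void main(String[] args) throws IOException {
    Set<String> withComments = WordlistLoader.getWordSet(
        StopwordResourceDemo.class, "stopwords-comments.txt", "#");
    Set<String> plain = WordlistLoader.getWordSet(
        StopwordResourceDemo.class, "stopwords-plain.txt");
    System.out.println(withComments.equals(plain)); // true: ONE, two, three
  }
}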