mirror of https://github.com/apache/lucene.git
LUCENE-2034: Refactor analyzer reuse and stopword handling
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@895339 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5e77ec9845
commit
a949836869
|
@ -105,6 +105,12 @@ New features
|
|||
backwards compatibility. If Version < 3.1 is passed to the constructor,
|
||||
LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)
|
||||
|
||||
* LUCENE-2034: Added ReusableAnalyzerBase, an abstract subclass of Analyzer
|
||||
that makes it easier to reuse TokenStreams correctly. This issue also added
|
||||
StopwordAnalyzerBase, which improves consistency of all Analyzers that use
|
||||
stopwords, and reimplemented many analyzers in contrib on top of it.
|
||||
(Simon Willnauer via Robert Muir)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2086: When resolving deleted terms, do so in term sort order
|
||||
|
|
|
@ -19,17 +19,15 @@ package org.apache.lucene.analysis.ar;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
|
@ -52,7 +50,7 @@ import org.apache.lucene.util.Version;
|
|||
* </ul>
|
||||
*
|
||||
*/
|
||||
public final class ArabicAnalyzer extends Analyzer {
|
||||
public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/**
|
||||
* File containing default Arabic stopwords.
|
||||
|
@ -62,21 +60,18 @@ public final class ArabicAnalyzer extends Analyzer {
|
|||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
/**
|
||||
* The comment character in the stopwords file. All lines prefixed with this will be ignored
|
||||
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
|
||||
*/
|
||||
// TODO make this private
|
||||
public static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<String> getDefaultStopSet(){
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -85,34 +80,19 @@ public final class ArabicAnalyzer extends Analyzer {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<String> DEFAULT_STOP_SET;
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadDefaultStopWordSet();
|
||||
DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
|
||||
static Set<String> loadDefaultStopWordSet() throws IOException {
|
||||
InputStream stream = ArabicAnalyzer.class
|
||||
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||
try {
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
|
||||
STOPWORDS_COMMENT));
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
|
@ -129,8 +109,7 @@ public final class ArabicAnalyzer extends Analyzer {
|
|||
* a stopword set
|
||||
*/
|
||||
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -159,54 +138,21 @@ public final class ArabicAnalyzer extends Analyzer {
|
|||
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
|
||||
* @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
|
||||
* and {@link ArabicStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new ArabicLetterTokenizer( reader );
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new ArabicLetterTokenizer(reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
// the order here is important: the stopword list is not normalized!
|
||||
result = new StopFilter( matchVersion, result, stoptable );
|
||||
result = new ArabicNormalizationFilter( result );
|
||||
result = new ArabicStemFilter( result );
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||
* in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
|
||||
* and {@link ArabicStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new ArabicLetterTokenizer(reader);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.source);
|
||||
// the order here is important: the stopword list is not normalized!
|
||||
streams.result = new StopFilter( matchVersion, streams.result, stoptable);
|
||||
streams.result = new ArabicNormalizationFilter(streams.result);
|
||||
streams.result = new ArabicStemFilter(streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
result = new StopFilter( matchVersion, result, stopwords);
|
||||
result = new ArabicNormalizationFilter(result);
|
||||
return new TokenStreamComponents(source, new ArabicStemFilter(result));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,17 +17,16 @@ package org.apache.lucene.analysis.bg;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
|
@ -43,7 +42,7 @@ import org.apache.lucene.util.Version;
|
|||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
* <p>
|
||||
*/
|
||||
public final class BulgarianAnalyzer extends Analyzer {
|
||||
public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/**
|
||||
* File containing default Bulgarian stopwords.
|
||||
|
@ -54,14 +53,12 @@ public final class BulgarianAnalyzer extends Analyzer {
|
|||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
/**
|
||||
* The comment character in the stopwords file. All lines prefixed with this
|
||||
* will be ignored
|
||||
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
|
||||
*/
|
||||
//TODO make this private
|
||||
public static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
|
@ -69,7 +66,7 @@ public final class BulgarianAnalyzer extends Analyzer {
|
|||
*
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<String> getDefaultStopSet() {
|
||||
public static Set<?> getDefaultStopSet() {
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -78,35 +75,19 @@ public final class BulgarianAnalyzer extends Analyzer {
|
|||
* class accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<String> DEFAULT_STOP_SET;
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadDefaultStopWordSet();
|
||||
} catch (Exception ex) {
|
||||
DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set", ex);
|
||||
}
|
||||
}
|
||||
|
||||
static Set<String> loadDefaultStopWordSet() throws IOException {
|
||||
final InputStream stream = BulgarianAnalyzer.class
|
||||
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||
try {
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
|
||||
STOPWORDS_COMMENT));
|
||||
} finally {
|
||||
if(stream != null)
|
||||
stream.close();
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||
|
@ -119,58 +100,24 @@ public final class BulgarianAnalyzer extends Analyzer {
|
|||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
super();
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
||||
stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided
|
||||
* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
|
||||
* {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
|
||||
* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||
* {@link StopFilter}, and {@link BulgarianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||
result = new StandardFilter(result);
|
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stoptable);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
result = new BulgarianStemFilter(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
|
||||
* text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from an {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||
* {@link StopFilter}, and {@link BulgarianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
|
||||
streams.result = new BulgarianStemFilter(streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,19 +21,21 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -49,7 +51,7 @@ import org.apache.lucene.util.Version;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public final class BrazilianAnalyzer extends Analyzer {
|
||||
public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/**
|
||||
* List of typical Brazilian Portuguese stopwords.
|
||||
|
@ -91,10 +93,6 @@ public final class BrazilianAnalyzer extends Analyzer {
|
|||
Arrays.asList(BRAZILIAN_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
|
@ -102,8 +100,6 @@ public final class BrazilianAnalyzer extends Analyzer {
|
|||
// TODO make this private in 3.1
|
||||
private Set<?> excltable = Collections.emptySet();
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
|
||||
*/
|
||||
|
@ -120,8 +116,7 @@ public final class BrazilianAnalyzer extends Analyzer {
|
|||
* a stopword set
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -188,53 +183,22 @@ public final class BrazilianAnalyzer extends Analyzer {
|
|||
excltable = WordlistLoader.getWordSet( exclusionlist );
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
|
||||
* {@link BrazilianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer( matchVersion, reader );
|
||||
result = new LowerCaseFilter( matchVersion, result );
|
||||
result = new StandardFilter( result );
|
||||
result = new StopFilter( matchVersion, result, stoptable );
|
||||
result = new BrazilianStemFilter( result, excltable );
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||
* in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
|
||||
* {@link BrazilianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.source);
|
||||
streams.result = new StandardFilter(streams.result);
|
||||
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
|
||||
streams.result = new BrazilianStemFilter(streams.result, excltable);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
/**
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
|
||||
* {@link BrazilianStemFilter}.
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
result = new StandardFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
return new TokenStreamComponents(source, new BrazilianStemFilter(result,
|
||||
excltable));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,12 +19,12 @@ package org.apache.lucene.analysis.cjk;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
@ -35,7 +35,7 @@ import java.util.Set;
|
|||
* filters with {@link StopFilter}
|
||||
*
|
||||
*/
|
||||
public final class CJKAnalyzer extends Analyzer {
|
||||
public final class CJKAnalyzer extends StopwordAnalyzerBase {
|
||||
//~ Static fields/initializers ---------------------------------------------
|
||||
|
||||
/**
|
||||
|
@ -71,11 +71,6 @@ public final class CJKAnalyzer extends Analyzer {
|
|||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
|
||||
false));
|
||||
}
|
||||
/**
|
||||
* stop word list
|
||||
*/
|
||||
private final Set<?> stopTable;
|
||||
private final Version matchVersion;
|
||||
|
||||
//~ Constructors -----------------------------------------------------------
|
||||
|
||||
|
@ -95,8 +90,7 @@ public final class CJKAnalyzer extends Analyzer {
|
|||
* a stopword set
|
||||
*/
|
||||
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -106,51 +100,15 @@ public final class CJKAnalyzer extends Analyzer {
|
|||
* @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public CJKAnalyzer(Version matchVersion, String... stopWords) {
|
||||
stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
|
||||
}
|
||||
|
||||
//~ Methods ----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @param fieldName lucene field name
|
||||
* @param reader input {@link Reader}
|
||||
* @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
|
||||
* {@link StopFilter}
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||
* in the provided {@link Reader}.
|
||||
*
|
||||
* @param fieldName lucene field name
|
||||
* @param reader Input {@link Reader}
|
||||
* @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
|
||||
* {@link StopFilter}
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
/* tokenStream() is final, no back compat issue */
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new CJKTokenizer(reader);
|
||||
streams.result = new StopFilter(matchVersion, streams.source, stopTable);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new CJKTokenizer(reader);
|
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,8 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
|||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
||||
|
||||
|
||||
/**
|
||||
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
|
||||
|
|
|
@ -17,10 +17,11 @@ package org.apache.lucene.analysis.cn;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
|
@ -29,49 +30,19 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
*
|
||||
*/
|
||||
|
||||
public final class ChineseAnalyzer extends Analyzer {
|
||||
public final class ChineseAnalyzer extends ReusableAnalyzerBase {
|
||||
|
||||
public ChineseAnalyzer() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link ChineseTokenizer}
|
||||
* filtered with {@link ChineseFilter}.
|
||||
*/
|
||||
/**
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
|
||||
* provided {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents} built from a
|
||||
* {@link ChineseTokenizer} filtered with {@link ChineseFilter}
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new ChineseTokenizer(reader);
|
||||
result = new ChineseFilter(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the
|
||||
* provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link ChineseTokenizer}
|
||||
* filtered with {@link ChineseFilter}.
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
/* tokenStream() is final, no back compat issue */
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new ChineseTokenizer(reader);
|
||||
streams.result = new ChineseFilter(streams.source);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new ChineseTokenizer(reader);
|
||||
return new TokenStreamComponents(source, new ChineseFilter(source));
|
||||
}
|
||||
}
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
|
@ -30,9 +32,9 @@ import org.apache.lucene.util.Version;
|
|||
|
||||
import java.io.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.Collections;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Czech language.
|
||||
|
@ -53,7 +55,7 @@ import java.util.Collections;
|
|||
* <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
|
||||
* </ul>
|
||||
*/
|
||||
public final class CzechAnalyzer extends Analyzer {
|
||||
public final class CzechAnalyzer extends ReusableAnalyzerBase {
|
||||
|
||||
/**
|
||||
* List of typical stopwords.
|
||||
|
@ -95,10 +97,11 @@ public final class CzechAnalyzer extends Analyzer {
|
|||
Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
// TODO make this final in 3.1
|
||||
// TODO once loadStopWords is gone, those members should be removed too in favor of StopwordAnalyzerBase
|
||||
private Set<?> stoptable;
|
||||
private final Version matchVersion;
|
||||
|
||||
|
@ -168,6 +171,7 @@ public final class CzechAnalyzer extends Analyzer {
|
|||
* @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
|
||||
* and {@link #CzechAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
// TODO extend StopwordAnalyzerBase once this method is gone!
|
||||
public void loadStopWords( InputStream wordfile, String encoding ) {
|
||||
setPreviousTokenStream(null); // force a new stopfilter to be created
|
||||
if ( wordfile == null ) {
|
||||
|
@ -191,58 +195,25 @@ public final class CzechAnalyzer extends Analyzer {
|
|||
stoptable = Collections.emptySet();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
|
||||
* {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
||||
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||
* {@link StopFilter}, and {@link CzechStemFilter} (only if version is
|
||||
* >= LUCENE_31)
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream tokenStream( String fieldName, Reader reader ) {
|
||||
TokenStream result = new StandardTokenizer( matchVersion, reader );
|
||||
result = new StandardFilter( result );
|
||||
result = new LowerCaseFilter( matchVersion, result );
|
||||
result = new StopFilter( matchVersion, result, stoptable );
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
result = new CzechStemFilter(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
|
||||
* text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
|
||||
* {@link StopFilter}, and {@link CzechStemFilter} (only if version is
|
||||
* >= LUCENE_31)
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
streams.result = new StopFilter( matchVersion, streams.result, stoptable);
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
streams.result = new CzechStemFilter(streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter( matchVersion, result, stoptable);
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
result = new CzechStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -29,13 +29,15 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -51,7 +53,7 @@ import org.apache.lucene.util.Version;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public final class GermanAnalyzer extends Analyzer {
|
||||
public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/**
|
||||
* List of typical German stopwords.
|
||||
|
@ -89,8 +91,6 @@ public final class GermanAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
//TODO make this final in 3.1
|
||||
private Set<?> stopSet;
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
|
@ -98,8 +98,6 @@ public final class GermanAnalyzer extends Analyzer {
|
|||
// TODO make this final in 3.1
|
||||
private Set<?> exclusionSet;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #getDefaultStopSet()}.
|
||||
|
@ -131,9 +129,8 @@ public final class GermanAnalyzer extends Analyzer {
|
|||
* a stemming exclusion set
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
super(matchVersion, stopwords);
|
||||
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -189,49 +186,21 @@ public final class GermanAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
|
||||
* provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
|
||||
* @return {@link TokenStreamComponents} built from a
|
||||
* {@link StandardTokenizer} filtered with {@link StandardFilter},
|
||||
* {@link LowerCaseFilter}, {@link StopFilter}, and
|
||||
* {@link GermanStemFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||
result = new StandardFilter(result);
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter( matchVersion, result, stopSet);
|
||||
result = new GermanStemFilter(result, exclusionSet);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||
* in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
|
||||
* {@link GermanStemFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
streams.result = new StopFilter( matchVersion, streams.result, stopSet);
|
||||
streams.result = new GermanStemFilter(streams.result, exclusionSet);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
result = new StopFilter( matchVersion, result, stopwords);
|
||||
return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,14 +19,15 @@ package org.apache.lucene.analysis.el;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
@ -43,7 +44,7 @@ import java.util.Set;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public final class GreekAnalyzer extends Analyzer
|
||||
public final class GreekAnalyzer extends StopwordAnalyzerBase
|
||||
{
|
||||
/**
|
||||
* List of typical Greek stopwords.
|
||||
|
@ -73,13 +74,6 @@ public final class GreekAnalyzer extends Analyzer
|
|||
Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private final Set<?> stopSet;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
public GreekAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
|
||||
}
|
||||
|
@ -93,8 +87,7 @@ public final class GreekAnalyzer extends Analyzer
|
|||
* a stopword set
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -116,46 +109,19 @@ public final class GreekAnalyzer extends Analyzer
|
|||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
|
||||
* {@link GreekLowerCaseFilter} and {@link StopFilter}
|
||||
*/
|
||||
/**
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
|
||||
* provided {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents} built from a
|
||||
* {@link StandardTokenizer} filtered with
|
||||
* {@link GreekLowerCaseFilter} and {@link StopFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||
{
|
||||
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||
result = new GreekLowerCaseFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stopSet);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||
* in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
|
||||
* {@link GreekLowerCaseFilter} and {@link StopFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new GreekLowerCaseFilter(streams.source);
|
||||
streams.result = new StopFilter(matchVersion, streams.result, stopSet);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
final TokenStream result = new GreekLowerCaseFilter(source);
|
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,17 +19,15 @@ package org.apache.lucene.analysis.fa;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
|
@ -45,7 +43,7 @@ import org.apache.lucene.util.Version;
|
|||
* yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
|
||||
* </p>
|
||||
*/
|
||||
public final class PersianAnalyzer extends Analyzer {
|
||||
public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/**
|
||||
* File containing default Persian stopwords.
|
||||
|
@ -57,11 +55,6 @@ public final class PersianAnalyzer extends Analyzer {
|
|||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
|
||||
/**
|
||||
* The comment character in the stopwords file. All lines prefixed with this
|
||||
* will be ignored
|
||||
|
@ -85,30 +78,15 @@ public final class PersianAnalyzer extends Analyzer {
|
|||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadDefaultStopWordSet();
|
||||
DEFAULT_STOP_SET = loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
|
||||
static Set<String> loadDefaultStopWordSet() throws IOException {
|
||||
InputStream stream = PersianAnalyzer.class
|
||||
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||
try {
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
// make sure it is unmodifiable as we expose it in the outer class
|
||||
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
|
||||
STOPWORDS_COMMENT));
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||
|
@ -126,8 +104,7 @@ public final class PersianAnalyzer extends Analyzer {
|
|||
* a stopword set
|
||||
*/
|
||||
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -156,18 +133,19 @@ public final class PersianAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
|
||||
* {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
|
||||
* @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer}
|
||||
* filtered with {@link LowerCaseFilter},
|
||||
* {@link ArabicNormalizationFilter},
|
||||
* {@link PersianNormalizationFilter} and Persian Stop words
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new ArabicLetterTokenizer(reader);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new ArabicLetterTokenizer(reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
result = new ArabicNormalizationFilter(result);
|
||||
/* additional persian-specific normalization */
|
||||
result = new PersianNormalizationFilter(result);
|
||||
|
@ -175,44 +153,6 @@ public final class PersianAnalyzer extends Analyzer {
|
|||
* the order here is important: the stopword list is normalized with the
|
||||
* above!
|
||||
*/
|
||||
result = new StopFilter(matchVersion, result, stoptable);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||
* in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
|
||||
* filtered with {@link LowerCaseFilter},
|
||||
* {@link ArabicNormalizationFilter},
|
||||
* {@link PersianNormalizationFilter} and Persian Stop words
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new ArabicLetterTokenizer(reader);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.source);
|
||||
streams.result = new ArabicNormalizationFilter(streams.result);
|
||||
/* additional persian-specific normalization */
|
||||
streams.result = new PersianNormalizationFilter(streams.result);
|
||||
/*
|
||||
* the order here is important: the stopword list is normalized with the
|
||||
* above!
|
||||
*/
|
||||
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,7 +20,9 @@ package org.apache.lucene.analysis.fr;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
|
@ -59,7 +61,7 @@ import java.util.Set;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public final class FrenchAnalyzer extends Analyzer {
|
||||
public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/**
|
||||
* Extended list of typical French stopwords.
|
||||
|
@ -91,18 +93,12 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
"été", "être", "ô"
|
||||
};
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
//TODO make this final in 3.0
|
||||
private Set<?> excltable = Collections.<Object>emptySet();
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
|
@ -148,9 +144,7 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
*/
|
||||
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
|
||||
Set<?> stemExclutionSet) {
|
||||
this.matchVersion = matchVersion;
|
||||
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
|
||||
.copy(matchVersion, stopwords));
|
||||
super(matchVersion, stopwords);
|
||||
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
|
||||
.copy(matchVersion, stemExclutionSet));
|
||||
}
|
||||
|
@ -202,54 +196,22 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
|
||||
* {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
||||
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link StopFilter},
|
||||
* {@link FrenchStemFilter} and {@link LowerCaseFilter}
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(matchVersion, reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stoptable);
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
result = new FrenchStemFilter(result, excltable);
|
||||
// Convert to lowercase after stemming!
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the
|
||||
* text in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
|
||||
* filtered with {@link StandardFilter}, {@link StopFilter},
|
||||
* {@link FrenchStemFilter} and {@link LowerCaseFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
|
||||
streams.result = new FrenchStemFilter(streams.result, excltable);
|
||||
// Convert to lowercase after stemming!
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ru;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
@ -26,7 +25,9 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -39,7 +40,7 @@ import org.apache.lucene.util.Version;
|
|||
* A default set of stopwords is used unless an alternative list is specified.
|
||||
* </p>
|
||||
*/
|
||||
public final class RussianAnalyzer extends Analyzer
|
||||
public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||
{
|
||||
/**
|
||||
* List of typical Russian stopwords.
|
||||
|
@ -63,13 +64,6 @@ public final class RussianAnalyzer extends Analyzer
|
|||
Arrays.asList(RUSSIAN_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private final Set<?> stopSet;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
public RussianAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
@ -91,8 +85,7 @@ public final class RussianAnalyzer extends Analyzer
|
|||
* a stopword set
|
||||
*/
|
||||
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -106,52 +99,21 @@ public final class RussianAnalyzer extends Analyzer
|
|||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
|
||||
* provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a
|
||||
* @return {@link TokenStreamComponents} built from a
|
||||
* {@link RussianLetterTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StopFilter},
|
||||
* and {@link RussianStemFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||
{
|
||||
TokenStream result = new RussianLetterTokenizer(reader);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopSet);
|
||||
result = new RussianStemFilter(result);
|
||||
return result;
|
||||
}
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new RussianLetterTokenizer(reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
return new TokenStreamComponents(source, new RussianStemFilter(result));
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
|
||||
* in the provided {@link Reader}.
|
||||
*
|
||||
* @return A {@link TokenStream} built from a
|
||||
* {@link RussianLetterTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StopFilter},
|
||||
* and {@link RussianStemFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new RussianLetterTokenizer(reader);
|
||||
streams.result = new LowerCaseFilter(matchVersion, streams.source);
|
||||
streams.result = new StopFilter(matchVersion, streams.result, stopSet);
|
||||
streams.result = new RussianStemFilter(streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,16 +16,18 @@ package org.apache.lucene.analysis.th;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -35,41 +37,28 @@ import org.apache.lucene.util.Version;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public final class ThaiAnalyzer extends Analyzer {
|
||||
public final class ThaiAnalyzer extends ReusableAnalyzerBase {
|
||||
private final Version matchVersion;
|
||||
|
||||
public ThaiAnalyzer(Version matchVersion) {
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the
|
||||
* provided {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents} built from a
|
||||
* {@link StandardTokenizer} filtered with {@link StandardFilter},
|
||||
* {@link ThaiWordFilter}, and {@link StopFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream ts = new StandardTokenizer(matchVersion, reader);
|
||||
ts = new StandardFilter(ts);
|
||||
ts = new ThaiWordFilter(ts);
|
||||
ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
return ts;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(matchVersion, reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new ThaiWordFilter(streams.result);
|
||||
streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
streams.result.reset(); // reset the ThaiWordFilter's state
|
||||
}
|
||||
return streams.result;
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(source);
|
||||
result = new ThaiWordFilter(result);
|
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion,
|
||||
result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,10 +17,10 @@ package org.apache.lucene.analysis.ar;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -78,7 +78,9 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
|
|||
* Test that custom stopwords work, and are not case-sensitive.
|
||||
*/
|
||||
public void testCustomStopwords() throws Exception {
|
||||
ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
|
||||
Set<String> set = new HashSet<String>();
|
||||
Collections.addAll(set, "the", "and", "a");
|
||||
ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
|
||||
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
|
||||
"brown", "fox" });
|
||||
}
|
||||
|
|
|
@ -17,10 +17,12 @@ package org.apache.lucene.analysis.br;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
|
|
@ -17,11 +17,8 @@ package org.apache.lucene.analysis.fa;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,163 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
 * A convenience subclass of Analyzer that makes it easy to implement
|
||||
* {@link TokenStream} reuse.
|
||||
* <p>
|
||||
* ReusableAnalyzerBase is a simplification of Analyzer that supports easy reuse
|
||||
* for the most common use-cases. Analyzers such as
|
||||
* {@link PerFieldAnalyzerWrapper} that behave differently depending upon the
|
||||
* field name need to subclass Analyzer directly instead.
|
||||
* </p>
|
||||
* <p>
|
||||
* To prevent consistency problems, this class does not allow subclasses to
|
||||
* extend {@link #reusableTokenStream(String, Reader)} or
|
||||
* {@link #tokenStream(String, Reader)} directly. Instead, subclasses must
|
||||
* implement {@link #createComponents(String, Reader)}.
|
||||
* </p>
|
||||
*/
|
||||
public abstract class ReusableAnalyzerBase extends Analyzer {
|
||||
|
||||
/**
|
||||
* Creates a new {@link TokenStreamComponents} instance for this analyzer.
|
||||
*
|
||||
* @param fieldName
|
||||
* the name of the fields content passed to the
|
||||
* {@link TokenStreamComponents} sink as a reader
|
||||
* @param aReader
|
||||
* the reader passed to the {@link Tokenizer} constructor
|
||||
* @return the {@link TokenStreamComponents} for this analyzer.
|
||||
*/
|
||||
protected abstract TokenStreamComponents createComponents(String fieldName,
|
||||
Reader aReader);
|
||||
|
||||
/**
|
||||
* This method uses {@link #createComponents(String, Reader)} to obtain an
|
||||
* instance of {@link TokenStreamComponents}. It returns the sink of the
|
||||
* components and stores the components internally. Subsequent calls to this
|
||||
* method will reuse the previously stored components if and only if the
|
||||
* {@link TokenStreamComponents#reset(Reader)} method returned
|
||||
* <code>true</code>. Otherwise a new instance of
|
||||
* {@link TokenStreamComponents} is created.
|
||||
*
|
||||
* @param fieldName the name of the field the created TokenStream is used for
|
||||
* @param reader the reader the streams source reads from
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream reusableTokenStream(final String fieldName,
|
||||
final Reader reader) throws IOException {
|
||||
TokenStreamComponents streamChain = (TokenStreamComponents)
|
||||
getPreviousTokenStream();
|
||||
if (streamChain == null || !streamChain.reset(reader)) {
|
||||
streamChain = createComponents(fieldName, reader);
|
||||
setPreviousTokenStream(streamChain);
|
||||
}
|
||||
return streamChain.getTokenStream();
|
||||
}
|
||||
|
||||
/**
|
||||
* This method uses {@link #createComponents(String, Reader)} to obtain an
|
||||
* instance of {@link TokenStreamComponents} and returns the sink of the
|
||||
* components. Each calls to this method will create a new instance of
|
||||
* {@link TokenStreamComponents}. Created {@link TokenStream} instances are
|
||||
* never reused.
|
||||
*
|
||||
* @param fieldName the name of the field the created TokenStream is used for
|
||||
* @param reader the reader the streams source reads from
|
||||
*/
|
||||
@Override
|
||||
public final TokenStream tokenStream(final String fieldName,
|
||||
final Reader reader) {
|
||||
return createComponents(fieldName, reader).getTokenStream();
|
||||
}
|
||||
|
||||
/**
|
||||
* This class encapsulates the outer components of a token stream. It provides
|
||||
* access to the source ({@link Tokenizer}) and the outer end (sink), an
|
||||
* instance of {@link TokenFilter} which also serves as the
|
||||
* {@link TokenStream} returned by
|
||||
* {@link Analyzer#tokenStream(String, Reader)} and
|
||||
* {@link Analyzer#reusableTokenStream(String, Reader)}.
|
||||
*/
|
||||
public static class TokenStreamComponents {
|
||||
final Tokenizer source;
|
||||
final TokenStream sink;
|
||||
|
||||
/**
|
||||
* Creates a new {@link TokenStreamComponents} instance.
|
||||
*
|
||||
* @param source
|
||||
* the analyzer's tokenizer
|
||||
* @param result
|
||||
* the analyzer's resulting token stream
|
||||
*/
|
||||
public TokenStreamComponents(final Tokenizer source,
|
||||
final TokenStream result) {
|
||||
this.source = source;
|
||||
this.sink = result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new {@link TokenStreamComponents} instance.
|
||||
*
|
||||
* @param source
|
||||
* the analyzer's tokenizer
|
||||
*/
|
||||
public TokenStreamComponents(final Tokenizer source) {
|
||||
this.source = source;
|
||||
this.sink = source;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the encapsulated components with the given reader. This method by
|
||||
* default returns <code>true</code> indicating that the components have
|
||||
* been reset successfully. Subclasses of {@link ReusableAnalyzerBase} might use
|
||||
* their own {@link TokenStreamComponents} returning <code>false</code> if
|
||||
* the components cannot be reset.
|
||||
*
|
||||
* @param reader
|
||||
* a reader to reset the source component
|
||||
* @return <code>true</code> if the components were reset, otherwise
|
||||
* <code>false</code>
|
||||
* @throws IOException
|
||||
* if the component's reset method throws an {@link IOException}
|
||||
*/
|
||||
protected boolean reset(final Reader reader) throws IOException {
|
||||
source.reset(reader);
|
||||
if(sink != source)
|
||||
sink.reset(); // only reset if the sink reference is different from source
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the sink {@link TokenStream}
|
||||
*
|
||||
* @return the sink {@link TokenStream}
|
||||
*/
|
||||
protected TokenStream getTokenStream() {
|
||||
return sink;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -18,25 +18,15 @@ package org.apache.lucene.analysis;
|
|||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
|
||||
/** An {@link Analyzer} that filters {@link LetterTokenizer}
|
||||
* with {@link LowerCaseFilter} */
|
||||
|
||||
public final class SimpleAnalyzer extends Analyzer {
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new LowerCaseTokenizer(reader);
|
||||
}
|
||||
public final class SimpleAnalyzer extends ReusableAnalyzerBase {
|
||||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
|
||||
if (tokenizer == null) {
|
||||
tokenizer = new LowerCaseTokenizer(reader);
|
||||
setPreviousTokenStream(tokenizer);
|
||||
} else
|
||||
tokenizer.reset(reader);
|
||||
return tokenizer;
|
||||
protected TokenStreamComponents createComponents(final String fieldName,
|
||||
final Reader reader) {
|
||||
return new TokenStreamComponents(new LowerCaseTokenizer(reader));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.Arrays;
|
|||
import java.util.Set;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
|
||||
|
@ -38,9 +39,7 @@ import org.apache.lucene.util.Version;
|
|||
* </ul>
|
||||
*/
|
||||
|
||||
public final class StopAnalyzer extends Analyzer {
|
||||
private final Set<?> stopWords;
|
||||
private final Version matchVersion;
|
||||
public final class StopAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/** An unmodifiable set containing some common English words that are not usually useful
|
||||
for searching.*/
|
||||
|
@ -65,16 +64,14 @@ public final class StopAnalyzer extends Analyzer {
|
|||
* @param matchVersion See <a href="#version">above</a>
|
||||
*/
|
||||
public StopAnalyzer(Version matchVersion) {
|
||||
stopWords = ENGLISH_STOP_WORDS_SET;
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, ENGLISH_STOP_WORDS_SET);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given set.
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param stopWords Set of stop words */
|
||||
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
|
||||
this.stopWords = stopWords;
|
||||
this.matchVersion = matchVersion;
|
||||
super(matchVersion, stopWords);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given file.
|
||||
|
@ -82,8 +79,7 @@ public final class StopAnalyzer extends Analyzer {
|
|||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param stopwordsFile File to load stop words from */
|
||||
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
|
||||
stopWords = WordlistLoader.getWordSet(stopwordsFile);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given reader.
|
||||
|
@ -91,34 +87,21 @@ public final class StopAnalyzer extends Analyzer {
|
|||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param stopwords Reader to load stop words from */
|
||||
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
|
||||
stopWords = WordlistLoader.getWordSet(stopwords);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, WordlistLoader.getWordSet(stopwords));
|
||||
}
|
||||
|
||||
/** Filters LowerCaseTokenizer with StopFilter. */
|
||||
/**
|
||||
* Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents} built from a {@link LowerCaseTokenizer} filtered with
|
||||
* {@link StopFilter}
|
||||
*/
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new StopFilter(matchVersion,
|
||||
new LowerCaseTokenizer(reader), stopWords);
|
||||
}
|
||||
|
||||
/** Filters LowerCaseTokenizer with StopFilter. */
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new LowerCaseTokenizer(reader);
|
||||
streams.result = new StopFilter(matchVersion,
|
||||
streams.source, stopWords);
|
||||
setPreviousTokenStream(streams);
|
||||
} else
|
||||
streams.source.reset(reader);
|
||||
return streams.result;
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new LowerCaseTokenizer(reader);
|
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion,
|
||||
source, stopwords));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Base class for Analyzers that need to make use of stopword sets.
|
||||
*
|
||||
*/
|
||||
public abstract class StopwordAnalyzerBase extends ReusableAnalyzerBase {
|
||||
|
||||
/**
|
||||
* An immutable stopword set
|
||||
*/
|
||||
protected final CharArraySet stopwords;
|
||||
|
||||
protected final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Returns the analyzer's stopword set or an empty set if the analyzer has no
|
||||
* stopwords
|
||||
*
|
||||
* @return the analyzer's stopword set or an empty set if the analyzer has no
|
||||
* stopwords
|
||||
*/
|
||||
public Set<?> getStopwordSet() {
|
||||
return stopwords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new instance initialized with the given stopword set
|
||||
*
|
||||
* @param version
|
||||
* the Lucene version for cross version compatibility
|
||||
* @param stopwords
|
||||
* the analyzer's stopword set
|
||||
*/
|
||||
protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
|
||||
/*
|
||||
* no need to call
|
||||
* setOverridesTokenStreamMethod(StopwordAnalyzerBase.class); here, both
|
||||
* tokenStream methods are final in this class.
|
||||
*/
|
||||
matchVersion = version;
|
||||
// analyzers should use char array set for stopwords!
|
||||
this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
|
||||
.unmodifiableSet(CharArraySet.copy(version, stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new Analyzer with an empty stopword set
|
||||
*
|
||||
* @param version
|
||||
* the Lucene version for cross version compatibility
|
||||
*/
|
||||
protected StopwordAnalyzerBase(final Version version) {
|
||||
this(version, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a CharArraySet from a file resource associated with a class. (See
|
||||
* {@link Class#getResourceAsStream(String)}).
|
||||
*
|
||||
* @param ignoreCase
|
||||
* <code>true</code> if the set should ignore the case of the
|
||||
* stopwords, otherwise <code>false</code>
|
||||
* @param aClass
|
||||
* a class that is associated with the given stopwordResource
|
||||
* @param resource
|
||||
* name of the resource file associated with the given class
|
||||
* @param comment
|
||||
* comment string to ignore in the stopword file
|
||||
* @return a CharArraySet containing the distinct stopwords from the given
|
||||
* file
|
||||
* @throws IOException
|
||||
* if loading the stopwords throws an {@link IOException}
|
||||
*/
|
||||
protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
|
||||
final Class<? extends ReusableAnalyzerBase> aClass, final String resource,
|
||||
final String comment) throws IOException {
|
||||
final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
|
||||
comment);
|
||||
final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
|
||||
set.addAll(wordSet);
|
||||
return set;
|
||||
}
|
||||
|
||||
}
|
|
@ -18,24 +18,14 @@ package org.apache.lucene.analysis;
|
|||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
|
||||
/** An Analyzer that uses {@link WhitespaceTokenizer}. */
|
||||
|
||||
public final class WhitespaceAnalyzer extends Analyzer {
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
public final class WhitespaceAnalyzer extends ReusableAnalyzerBase {
|
||||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
|
||||
if (tokenizer == null) {
|
||||
tokenizer = new WhitespaceTokenizer(reader);
|
||||
setPreviousTokenStream(tokenizer);
|
||||
} else
|
||||
tokenizer.reset(reader);
|
||||
return tokenizer;
|
||||
protected TokenStreamComponents createComponents(final String fieldName,
|
||||
final Reader reader) {
|
||||
return new TokenStreamComponents(new WhitespaceTokenizer(reader));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,15 +21,69 @@ import java.io.BufferedReader;
|
|||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Loader for text files that represent a list of stopwords.
|
||||
*/
|
||||
public class WordlistLoader {
|
||||
|
||||
/**
|
||||
* Loads a text file associated with a given class (See
|
||||
* {@link Class#getResourceAsStream(String)}) and adds every line as an entry
|
||||
* to a {@link Set} (omitting leading and trailing whitespace). Every line of
|
||||
* the file should contain only one word. The words need to be in lower-case if
|
||||
* you make use of an Analyzer which uses LowerCaseFilter (like
|
||||
* StandardAnalyzer).
|
||||
*
|
||||
* @param aClass
|
||||
* a class that is associated with the given stopwordResource
|
||||
* @param stopwordResource
|
||||
* name of the resource file associated with the given class
|
||||
* @return a {@link Set} with the file's words
|
||||
*/
|
||||
public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
|
||||
throws IOException {
|
||||
final Reader reader = new BufferedReader(new InputStreamReader(aClass
|
||||
.getResourceAsStream(stopwordResource), "UTF-8"));
|
||||
try {
|
||||
return getWordSet(reader);
|
||||
} finally {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a text file associated with a given class (See
|
||||
* {@link Class#getResourceAsStream(String)}) and adds every line as an entry
|
||||
* to a {@link Set} (omitting leading and trailing whitespace). Every line of
|
||||
* the file should contain only one word. The words need to be in lower-case if
|
||||
* you make use of an Analyzer which uses LowerCaseFilter (like
|
||||
* StandardAnalyzer).
|
||||
*
|
||||
* @param aClass
|
||||
* a class that is associated with the given stopwordResource
|
||||
* @param stopwordResource
|
||||
* name of the resource file associated with the given class
|
||||
* @param comment
|
||||
* the comment string to ignore
|
||||
* @return a {@link Set} with the file's words
|
||||
*/
|
||||
public static Set<String> getWordSet(Class<?> aClass,
|
||||
String stopwordResource, String comment) throws IOException {
|
||||
final Reader reader = new BufferedReader(new InputStreamReader(aClass
|
||||
.getResourceAsStream(stopwordResource), "UTF-8"));
|
||||
try {
|
||||
return getWordSet(reader, comment);
|
||||
} finally {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a text file and adds every line as an entry to a HashSet (omitting
|
||||
* leading and trailing whitespace). Every line of the file should contain only
|
||||
|
@ -40,17 +94,15 @@ public class WordlistLoader {
|
|||
* @return A HashSet with the file's words
|
||||
*/
|
||||
public static HashSet<String> getWordSet(File wordfile) throws IOException {
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
FileReader reader = null;
|
||||
try {
|
||||
reader = new FileReader(wordfile);
|
||||
result = getWordSet(reader);
|
||||
return getWordSet(reader);
|
||||
}
|
||||
finally {
|
||||
if (reader != null)
|
||||
reader.close();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -64,17 +116,15 @@ public class WordlistLoader {
|
|||
* @return A HashSet with the file's words
|
||||
*/
|
||||
public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
FileReader reader = null;
|
||||
try {
|
||||
reader = new FileReader(wordfile);
|
||||
result = getWordSet(reader, comment);
|
||||
return getWordSet(reader, comment);
|
||||
}
|
||||
finally {
|
||||
if (reader != null)
|
||||
reader.close();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
@ -88,7 +138,7 @@ public class WordlistLoader {
|
|||
* @return A HashSet with the reader's words
|
||||
*/
|
||||
public static HashSet<String> getWordSet(Reader reader) throws IOException {
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
final HashSet<String> result = new HashSet<String>();
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
if (reader instanceof BufferedReader) {
|
||||
|
@ -119,7 +169,7 @@ public class WordlistLoader {
|
|||
* @return A HashSet with the reader's words
|
||||
*/
|
||||
public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
final HashSet<String> result = new HashSet<String>();
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
if (reader instanceof BufferedReader) {
|
||||
|
@ -154,21 +204,18 @@ public class WordlistLoader {
|
|||
public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
|
||||
if (wordstemfile == null)
|
||||
throw new NullPointerException("wordstemfile may not be null");
|
||||
HashMap<String, String> result = new HashMap<String, String>();
|
||||
final HashMap<String, String> result = new HashMap<String,String>();
|
||||
BufferedReader br = null;
|
||||
FileReader fr = null;
|
||||
|
||||
try {
|
||||
fr = new FileReader(wordstemfile);
|
||||
br = new BufferedReader(fr);
|
||||
br = new BufferedReader(new FileReader(wordstemfile));
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
String[] wordstem = line.split("\t", 2);
|
||||
result.put(wordstem[0], wordstem[1]);
|
||||
}
|
||||
} finally {
|
||||
if (fr != null)
|
||||
fr.close();
|
||||
if (br != null)
|
||||
if(br != null)
|
||||
br.close();
|
||||
}
|
||||
return result;
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
#comment
|
||||
ONE
|
||||
two
|
||||
#comment
|
||||
three
|
|
@ -0,0 +1,3 @@
|
|||
ONE
|
||||
two
|
||||
three
|
Loading…
Reference in New Issue