LUCENE-3765: Trappy behavior with StopFilter/ignoreCase

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242497 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-09 19:59:50 +00:00
parent ef65f76824
commit 72ae3171be
89 changed files with 363 additions and 535 deletions

View File

@ -779,6 +779,11 @@ API Changes
to be merged. To mimic the old behaviour, just use IndexReader.directory()
for choosing the provider by Directory. (Uwe Schindler)
* LUCENE-3765: Deprecated StopFilter ctor that took ignoreCase, because
in some cases (if the set is a CharArraySet), the argument is ignored.
Deprecated StandardAnalyzer and ClassicAnalyzer ctors that take File,
please use the Reader ctor instead. (Robert Muir)
New Features
* LUCENE-3593: Added a FieldValueFilter that accepts all documents that either

View File

@ -218,6 +218,10 @@ Bug Fixes
* LUCENE-3719: FVH: slow performance on very large queries.
  (Igor Motov via Koji Sekiguchi)
* LUCENE-3765: As of Version.LUCENE_36, DutchAnalyzer's two ctors
that take stopwords and stem exclusion tables also initialize
the default stem overrides (e.g. kind/kinder, fiets). (Robert Muir)
Documentation
* LUCENE-3599: Javadocs for DistanceUtils.haversine() were incorrectly

View File

@ -29,6 +29,11 @@ API Changes
since they prevent reuse. Both Analyzers should be configured at instantiation.
(Chris Male)
* LUCENE-3765: Stopset ctors that previously took Set<?> or Map<?,String> now take
CharArraySet and CharArrayMap respectively. Previously the behavior was confusing,
and sometimes different depending on the type of set, and ultimately a CharArraySet
or CharArrayMap was always used anyway. (Robert Muir)
New Features
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ar;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -63,7 +62,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -72,7 +71,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -85,7 +84,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
} }
} }
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
@ -102,7 +101,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){ public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -118,7 +117,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
* @param stemExclusionSet * @param stemExclusionSet
* a set of terms not to be stemmed * a set of terms not to be stemmed
*/ */
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){ public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -56,7 +56,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
* *
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet() { public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -65,7 +65,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
* class accesses the static final set the first time.; * class accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -78,7 +78,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
} }
} }
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** /**
* Builds an analyzer with the default stop words: * Builds an analyzer with the default stop words:
@ -91,7 +91,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
*/ */
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) { public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -100,7 +100,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
* If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter} * If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter}
* before {@link BulgarianStemFilter}. * before {@link BulgarianStemFilter}.
*/ */
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); } matchVersion, stemExclusionSet)); }

View File

@ -19,8 +19,6 @@ package org.apache.lucene.analysis.br;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -56,12 +54,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -79,7 +77,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Contains words that should be indexed but not stemmed. * Contains words that should be indexed but not stemmed.
*/ */
private Set<?> excltable = Collections.emptySet(); private CharArraySet excltable = CharArraySet.EMPTY_SET;
/** /**
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}). * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
@ -96,7 +94,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) { public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
} }
@ -108,8 +106,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords, public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords,
Set<?> stemExclusionSet) { CharArraySet stemExclusionSet) {
this(matchVersion, stopwords); this(matchVersion, stopwords);
excltable = CharArraySet.unmodifiableSet(CharArraySet excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(matchVersion, stemExclusionSet)); .copy(matchVersion, stemExclusionSet));

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ca;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays; import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -49,7 +48,7 @@ import org.tartarus.snowball.ext.CatalanStemmer;
* </ul> * </ul>
*/ */
public final class CatalanAnalyzer extends StopwordAnalyzerBase { public final class CatalanAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Catalan stopwords. */ /** File containing default Catalan stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -64,7 +63,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -73,7 +72,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -100,7 +99,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public CatalanAnalyzer(Version matchVersion, Set<?> stopwords) { public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -113,7 +112,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public CatalanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cjk;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -27,6 +26,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -49,12 +49,12 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -82,7 +82,7 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){ public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){
super(matchVersion, stopwords); super(matchVersion, stopwords);
} }

View File

@ -10,7 +10,6 @@
package org.apache.lucene.analysis.commongrams; package org.apache.lucene.analysis.commongrams;
import java.io.IOException; import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -69,35 +68,9 @@ public final class CommonGramsFilter extends TokenFilter {
* @param input TokenStream input in filter chain * @param input TokenStream input in filter chain
* @param commonWords The set of common words. * @param commonWords The set of common words.
*/ */
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) { public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
this(matchVersion, input, commonWords, false);
}
/**
* Construct a token stream filtering the given input using a Set of common
* words to create bigrams, case-sensitive if ignoreCase is false (unless Set
* is CharArraySet). If <code>commonWords</code> is an instance of
* {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
* construct the set) it will be directly used and <code>ignoreCase</code>
* will be ignored since <code>CharArraySet</code> directly controls case
* sensitivity.
* <p/>
* If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
* new CharArraySet will be constructed and <code>ignoreCase</code> will be
* used to specify the case sensitivity of that set.
*
* @param input TokenStream input in filter chain.
* @param commonWords The set of common words.
* @param ignoreCase -Ignore case when constructing bigrams for common words.
*/
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
super(input); super(input);
if (commonWords instanceof CharArraySet) { this.commonWords = commonWords;
this.commonWords = (CharArraySet) commonWords;
} else {
this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
this.commonWords.addAll(commonWords);
}
} }
/** /**

View File

@ -18,10 +18,7 @@ package org.apache.lucene.analysis.compound;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Locale;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
@ -43,13 +40,6 @@ import org.apache.lucene.util.Version;
* supplementary characters in strings and char arrays provided as compound word * supplementary characters in strings and char arrays provided as compound word
* dictionaries. * dictionaries.
* </ul> * </ul>
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
* it should be case-insensitive unless it contains only lowercased entries and you
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
* For optional performance (as this filter does lots of lookups to the dictionary,
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
* {@link Set Sets} to the ctors, they will be automatically
* transformed to case-insensitive!
*/ */
public abstract class CompoundWordTokenFilterBase extends TokenFilter { public abstract class CompoundWordTokenFilterBase extends TokenFilter {
/** /**
@ -80,15 +70,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
private AttributeSource.State current; private AttributeSource.State current;
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) { protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
} }
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) { protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary) {
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
} }
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input); super(input);
this.tokens=new LinkedList<CompoundToken>(); this.tokens=new LinkedList<CompoundToken>();
@ -96,12 +86,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
this.minSubwordSize=minSubwordSize; this.minSubwordSize=minSubwordSize;
this.maxSubwordSize=maxSubwordSize; this.maxSubwordSize=maxSubwordSize;
this.onlyLongestMatch=onlyLongestMatch; this.onlyLongestMatch=onlyLongestMatch;
this.dictionary = dictionary;
if (dictionary==null || dictionary instanceof CharArraySet) {
this.dictionary = (CharArraySet) dictionary;
} else {
this.dictionary = new CharArraySet(matchVersion, dictionary, true);
}
} }
@Override @Override

View File

@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
/** /**
@ -38,13 +39,6 @@ import org.apache.lucene.util.Version;
* supplementary characters in strings and char arrays provided as compound word * supplementary characters in strings and char arrays provided as compound word
* dictionaries. * dictionaries.
* </ul> * </ul>
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
* it should be case-insensitive unless it contains only lowercased entries and you
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
* For optional performance (as this filter does lots of lookups to the dictionary,
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
* {@link Set Sets} to the ctors, they will be automatically
* transformed to case-insensitive!
*/ */
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase { public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
@ -61,7 +55,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* @param dictionary * @param dictionary
* the word dictionary to match against. * the word dictionary to match against.
*/ */
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) { public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) {
super(matchVersion, input, dictionary); super(matchVersion, input, dictionary);
} }
@ -86,7 +80,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* @param onlyLongestMatch * @param onlyLongestMatch
* Add only the longest matching subword to the stream * Add only the longest matching subword to the stream
*/ */
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary, public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
} }

View File

@ -18,12 +18,12 @@ package org.apache.lucene.analysis.compound;
*/ */
import java.io.File; import java.io.File;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation; import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
@ -41,13 +41,6 @@ import org.xml.sax.InputSource;
* supplementary characters in strings and char arrays provided as compound word * supplementary characters in strings and char arrays provided as compound word
* dictionaries. * dictionaries.
* </ul> * </ul>
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
* it should be case-insensitive unless it contains only lowercased entries and you
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
* For optional performance (as this filter does lots of lookups to the dictionary,
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
* {@link Set Sets} to the ctors, they will be automatically
* transformed to case-insensitive!
*/ */
public class HyphenationCompoundWordTokenFilter extends public class HyphenationCompoundWordTokenFilter extends
CompoundWordTokenFilterBase { CompoundWordTokenFilterBase {
@ -69,7 +62,7 @@ public class HyphenationCompoundWordTokenFilter extends
* the word dictionary to match against. * the word dictionary to match against.
*/ */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, Set<?> dictionary) { HyphenationTree hyphenator, CharArraySet dictionary) {
this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE, this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false); DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
} }
@ -98,7 +91,7 @@ public class HyphenationCompoundWordTokenFilter extends
* Add only the longest matching subword to the stream * Add only the longest matching subword to the stream
*/ */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, Set<?> dictionary, int minWordSize, HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
onlyLongestMatch); onlyLongestMatch);
@ -109,14 +102,14 @@ public class HyphenationCompoundWordTokenFilter extends
/** /**
* Create a HyphenationCompoundWordTokenFilter with no dictionary. * Create a HyphenationCompoundWordTokenFilter with no dictionary.
* <p> * <p>
* Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean) * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
* HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
* null, minWordSize, minSubwordSize, maxSubwordSize } * null, minWordSize, minSubwordSize, maxSubwordSize }
*/ */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, int minWordSize, int minSubwordSize, HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
int maxSubwordSize) { int maxSubwordSize) {
this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize, this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize,
maxSubwordSize, false); maxSubwordSize, false);
} }

View File

@ -21,7 +21,6 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays; import java.util.Arrays;
import java.util.Set;
import java.util.List; import java.util.List;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
@ -46,7 +45,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
/** An unmodifiable set containing some common English words that are not usually useful /** An unmodifiable set containing some common English words that are not usually useful
for searching.*/ for searching.*/
public static final Set<?> ENGLISH_STOP_WORDS_SET; public static final CharArraySet ENGLISH_STOP_WORDS_SET;
static { static {
final List<String> stopWords = Arrays.asList( final List<String> stopWords = Arrays.asList(
@ -72,7 +71,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
/** Builds an analyzer with the stop words from the given set. /** Builds an analyzer with the stop words from the given set.
* @param matchVersion See <a href="#version">above</a> * @param matchVersion See <a href="#version">above</a>
* @param stopWords Set of stop words */ * @param stopWords Set of stop words */
public StopAnalyzer(Version matchVersion, Set<?> stopWords) { public StopAnalyzer(Version matchVersion, CharArraySet stopWords) {
super(matchVersion, stopWords); super(matchVersion, stopWords);
} }

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -44,34 +43,6 @@ public final class StopFilter extends FilteringTokenFilter {
private final CharArraySet stopWords; private final CharArraySet stopWords;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
* Construct a token stream filtering the given input. If
* <code>stopWords</code> is an instance of {@link CharArraySet} (true if
* <code>makeStopSet()</code> was used to construct the set) it will be
* directly used and <code>ignoreCase</code> will be ignored since
* <code>CharArraySet</code> directly controls case sensitivity.
* <p/>
* If <code>stopWords</code> is not an instance of {@link CharArraySet}, a new
* CharArraySet will be constructed and <code>ignoreCase</code> will be used
* to specify the case sensitivity of that set.
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the stop
* set if Version > 3.0. See <a href="#version">above</a> for details.
* @param input
* Input TokenStream
* @param stopWords
* A Set of Strings or char[] or any other toString()-able set
* representing the stopwords
* @param ignoreCase
* if true, all words are lower cased first
*/
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
{
super(true, input);
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
}
/** /**
* Constructs a filter which removes words from the input TokenStream that are * Constructs a filter which removes words from the input TokenStream that are
@ -83,12 +54,12 @@ public final class StopFilter extends FilteringTokenFilter {
* @param in * @param in
* Input stream * Input stream
* @param stopWords * @param stopWords
* A Set of Strings or char[] or any other toString()-able set * A {@link CharArraySet} representing the stopwords.
* representing the stopwords
* @see #makeStopSet(Version, java.lang.String...) * @see #makeStopSet(Version, java.lang.String...)
*/ */
public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) { public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
this(matchVersion, in, stopWords, false); super(true, in);
this.stopWords = stopWords;
} }
/** /**
@ -101,7 +72,7 @@ public final class StopFilter extends FilteringTokenFilter {
* @param stopWords An array of stopwords * @param stopWords An array of stopwords
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
*/ */
public static Set<Object> makeStopSet(Version matchVersion, String... stopWords) { public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) {
return makeStopSet(matchVersion, stopWords, false); return makeStopSet(matchVersion, stopWords, false);
} }
@ -116,7 +87,7 @@ public final class StopFilter extends FilteringTokenFilter {
* @return A Set ({@link CharArraySet}) containing the words * @return A Set ({@link CharArraySet}) containing the words
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
*/ */
public static Set<Object> makeStopSet(Version matchVersion, List<?> stopWords) { public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords) {
return makeStopSet(matchVersion, stopWords, false); return makeStopSet(matchVersion, stopWords, false);
} }
@ -128,7 +99,7 @@ public final class StopFilter extends FilteringTokenFilter {
* @param ignoreCase If true, all words are lower cased first. * @param ignoreCase If true, all words are lower cased first.
* @return a Set containing the words * @return a Set containing the words
*/ */
public static Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) { public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase); CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
stopSet.addAll(Arrays.asList(stopWords)); stopSet.addAll(Arrays.asList(stopWords));
return stopSet; return stopSet;
@ -141,7 +112,7 @@ public final class StopFilter extends FilteringTokenFilter {
* @param ignoreCase if true, all words are lower cased first * @param ignoreCase if true, all words are lower cased first
* @return A Set ({@link CharArraySet}) containing the words * @return A Set ({@link CharArraySet}) containing the words
*/ */
public static Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){ public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase); CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
stopSet.addAll(stopWords); stopSet.addAll(stopWords);
return stopSet; return stopSet;

View File

@ -32,7 +32,6 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import java.io.*; import java.io.*;
import java.util.Set;
/** /**
* {@link Analyzer} for Czech language. * {@link Analyzer} for Czech language.
@ -62,12 +61,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
* *
* @return a set of default Czech-stopwords * @return a set of default Czech-stopwords
*/ */
public static final Set<?> getDefaultStopSet(){ public static final CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET; return DefaultSetHolder.DEFAULT_SET;
} }
private static class DefaultSetHolder { private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET; private static final CharArraySet DEFAULT_SET;
static { static {
try { try {
@ -82,7 +81,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
} }
private final Set<?> stemExclusionTable; private final CharArraySet stemExclusionTable;
/** /**
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}). * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
@ -101,7 +100,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
* {@link <a href="#version">above</a>} * {@link <a href="#version">above</a>}
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) { public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -114,7 +113,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionTable a stemming exclusion set * @param stemExclusionTable a stemming exclusion set
*/ */
public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) { public CzechAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
} }
@ -129,7 +128,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If * , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
* a version is >= LUCENE_31 and a stem exclusion set is provided via * a version is >= LUCENE_31 and a stem exclusion set is provided via
* {@link #CzechAnalyzer(Version, Set, Set)} a * {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
* {@link KeywordMarkerFilter} is added before * {@link KeywordMarkerFilter} is added before
* {@link CzechStemFilter}. * {@link CzechStemFilter}.
*/ */

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.da;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.DanishStemmer;
* {@link Analyzer} for Danish. * {@link Analyzer} for Danish.
*/ */
public final class DanishAnalyzer extends StopwordAnalyzerBase { public final class DanishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Danish stopwords. */ /** File containing default Danish stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";
@ -50,7 +49,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -59,7 +58,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -86,7 +85,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public DanishAnalyzer(Version matchVersion, Set<?> stopwords) { public DanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -99,7 +98,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -21,7 +21,6 @@ package org.apache.lucene.analysis.de;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays; import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -90,16 +89,16 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
* Returns a set of default German-stopwords * Returns a set of default German-stopwords
* @return a set of default German-stopwords * @return a set of default German-stopwords
*/ */
public static final Set<?> getDefaultStopSet(){ public static final CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET; return DefaultSetHolder.DEFAULT_SET;
} }
private static class DefaultSetHolder { private static class DefaultSetHolder {
/** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */ /** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
@Deprecated @Deprecated
private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet( private static final CharArraySet DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false)); Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
private static final Set<?> DEFAULT_SET; private static final CharArraySet DEFAULT_SET;
static { static {
try { try {
DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
@ -119,7 +118,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
/** /**
* Contains words that should be indexed but not stemmed. * Contains words that should be indexed but not stemmed.
*/ */
private final Set<?> exclusionSet; private final CharArraySet exclusionSet;
/** /**
* Builds an analyzer with the default stop words: * Builds an analyzer with the default stop words:
@ -139,7 +138,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public GermanAnalyzer(Version matchVersion, Set<?> stopwords) { public GermanAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -153,7 +152,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
* @param stemExclusionSet * @param stemExclusionSet
* a stemming exclusion set * a stemming exclusion set
*/ */
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet)); exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
} }

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -27,6 +26,7 @@ import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -58,12 +58,12 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
* Returns a set of default Greek-stopwords * Returns a set of default Greek-stopwords
* @return a set of default Greek-stopwords * @return a set of default Greek-stopwords
*/ */
public static final Set<?> getDefaultStopSet(){ public static final CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET; return DefaultSetHolder.DEFAULT_SET;
} }
private static class DefaultSetHolder { private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET; private static final CharArraySet DEFAULT_SET;
static { static {
try { try {
@ -95,7 +95,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
* See <a href="#version">above</a> * See <a href="#version">above</a>
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) { public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
} }

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.en;
*/ */
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -37,13 +36,13 @@ import org.apache.lucene.util.Version;
* {@link Analyzer} for English. * {@link Analyzer} for English.
*/ */
public final class EnglishAnalyzer extends StopwordAnalyzerBase { public final class EnglishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** /**
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -52,7 +51,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET; static final CharArraySet DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
} }
/** /**
@ -68,7 +67,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) { public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -81,7 +80,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.es;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.SpanishStemmer;
* </ul> * </ul>
*/ */
public final class SpanishAnalyzer extends StopwordAnalyzerBase { public final class SpanishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Spanish stopwords. */ /** File containing default Spanish stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";
@ -57,7 +56,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -66,7 +65,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -93,7 +92,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) { public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -106,7 +105,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.eu;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.BasqueStemmer;
* {@link Analyzer} for Basque. * {@link Analyzer} for Basque.
*/ */
public final class BasqueAnalyzer extends StopwordAnalyzerBase { public final class BasqueAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Basque stopwords. */ /** File containing default Basque stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -48,7 +47,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -57,7 +56,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -84,7 +83,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public BasqueAnalyzer(Version matchVersion, Set<?> stopwords) { public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -97,7 +96,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public BasqueAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fa;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharReader;
@ -30,6 +29,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -63,7 +63,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -72,7 +72,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -101,7 +101,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){ public PersianAnalyzer(Version matchVersion, CharArraySet stopwords){
super(matchVersion, stopwords); super(matchVersion, stopwords);
} }

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fi;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.FinnishStemmer;
* {@link Analyzer} for Finnish. * {@link Analyzer} for Finnish.
*/ */
public final class FinnishAnalyzer extends StopwordAnalyzerBase { public final class FinnishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Italian stopwords. */ /** File containing default Italian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";
@ -50,7 +49,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -59,7 +58,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -86,7 +85,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) { public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -99,7 +98,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,11 +19,9 @@ package org.apache.lucene.analysis.fr;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -56,7 +54,7 @@ public final class ElisionFilter extends TokenFilter {
* @param input the source {@link TokenStream} * @param input the source {@link TokenStream}
* @param articles a set of stopword articles * @param articles a set of stopword articles
*/ */
public ElisionFilter(Version matchVersion, TokenStream input, Set<?> articles) { public ElisionFilter(Version matchVersion, TokenStream input, CharArraySet articles) {
super(input); super(input);
this.articles = CharArraySet.unmodifiableSet( this.articles = CharArraySet.unmodifiableSet(
new CharArraySet(matchVersion, articles, true)); new CharArraySet(matchVersion, articles, true));

View File

@ -36,7 +36,6 @@ import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays; import java.util.Arrays;
import java.util.Set;
/** /**
* {@link Analyzer} for French language. * {@link Analyzer} for French language.
@ -101,23 +100,23 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
/** /**
* Contains words that should be indexed but not stemmed. * Contains words that should be indexed but not stemmed.
*/ */
private final Set<?> excltable; private final CharArraySet excltable;
/** /**
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
private static class DefaultSetHolder { private static class DefaultSetHolder {
/** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */ /** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */
@Deprecated @Deprecated
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS), .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
false)); false));
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
@ -147,7 +146,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords){ public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -161,8 +160,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
* @param stemExclutionSet * @param stemExclutionSet
* a stemming exclusion set * a stemming exclusion set
*/ */
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords, public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords,
Set<?> stemExclutionSet) { CharArraySet stemExclutionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.excltable = CharArraySet.unmodifiableSet(CharArraySet this.excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(matchVersion, stemExclutionSet)); .copy(matchVersion, stemExclutionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.gl;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -39,7 +38,7 @@ import org.apache.lucene.util.Version;
* {@link Analyzer} for Galician. * {@link Analyzer} for Galician.
*/ */
public final class GalicianAnalyzer extends StopwordAnalyzerBase { public final class GalicianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Galician stopwords. */ /** File containing default Galician stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -48,7 +47,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -57,7 +56,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -84,7 +83,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords) { public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -97,7 +96,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hi;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -44,7 +43,7 @@ import org.apache.lucene.util.Version;
* </ul> * </ul>
*/ */
public final class HindiAnalyzer extends StopwordAnalyzerBase { public final class HindiAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** /**
* File containing default Hindi stopwords. * File containing default Hindi stopwords.
@ -59,7 +58,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -68,7 +67,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -88,7 +87,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a stemming exclusion set * @param stemExclusionSet a stemming exclusion set
*/ */
public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) { public HindiAnalyzer(Version version, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(version, stopwords); super(version, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet( this.stemExclusionSet = CharArraySet.unmodifiableSet(
CharArraySet.copy(matchVersion, stemExclusionSet)); CharArraySet.copy(matchVersion, stemExclusionSet));
@ -100,7 +99,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
* @param version lucene compatibility version * @param version lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public HindiAnalyzer(Version version, Set<?> stopwords) { public HindiAnalyzer(Version version, CharArraySet stopwords) {
this(version, stopwords, CharArraySet.EMPTY_SET); this(version, stopwords, CharArraySet.EMPTY_SET);
} }

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hu;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.HungarianStemmer;
* {@link Analyzer} for Hungarian. * {@link Analyzer} for Hungarian.
*/ */
public final class HungarianAnalyzer extends StopwordAnalyzerBase { public final class HungarianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Hungarian stopwords. */ /** File containing default Hungarian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";
@ -50,7 +49,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -59,7 +58,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -86,7 +85,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) { public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -99,7 +98,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hy;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.ArmenianStemmer;
* {@link Analyzer} for Armenian. * {@link Analyzer} for Armenian.
*/ */
public final class ArmenianAnalyzer extends StopwordAnalyzerBase { public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Armenian stopwords. */ /** File containing default Armenian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -48,7 +47,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -57,7 +56,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -84,7 +83,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public ArmenianAnalyzer(Version matchVersion, Set<?> stopwords) { public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -97,7 +96,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public ArmenianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.id;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
@ -43,7 +42,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -52,7 +51,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -65,7 +64,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
} }
} }
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
@ -82,7 +81,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords){ public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -98,7 +97,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
* @param stemExclusionSet * @param stemExclusionSet
* a set of terms not to be stemmed * a set of terms not to be stemmed
*/ */
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){ public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.it;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays; import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -52,7 +51,7 @@ import org.tartarus.snowball.ext.ItalianStemmer;
* </ul> * </ul>
*/ */
public final class ItalianAnalyzer extends StopwordAnalyzerBase { public final class ItalianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Italian stopwords. */ /** File containing default Italian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
@ -68,7 +67,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -77,7 +76,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -104,7 +103,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) { public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -117,7 +116,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.lv;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -27,7 +26,6 @@ import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
@ -40,7 +38,7 @@ import org.apache.lucene.util.Version;
* {@link Analyzer} for Latvian. * {@link Analyzer} for Latvian.
*/ */
public final class LatvianAnalyzer extends StopwordAnalyzerBase { public final class LatvianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Latvian stopwords. */ /** File containing default Latvian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -49,7 +47,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -58,7 +56,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -85,7 +83,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) { public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -98,7 +96,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -18,14 +18,12 @@ package org.apache.lucene.analysis.miscellaneous;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/** /**
* Marks terms as keywords via the {@link KeywordAttribute}. Each token * Marks terms as keywords via the {@link KeywordAttribute}. Each token
@ -50,27 +48,11 @@ public final class KeywordMarkerFilter extends TokenFilter {
* @param keywordSet * @param keywordSet
* the keywords set to lookup the current termbuffer * the keywords set to lookup the current termbuffer
*/ */
public KeywordMarkerFilter(final TokenStream in, public KeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) {
final CharArraySet keywordSet) {
super(in); super(in);
this.keywordSet = keywordSet; this.keywordSet = keywordSet;
} }
/**
* Create a new KeywordMarkerFilter, that marks the current token as a
* keyword if the tokens term buffer is contained in the given set via the
* {@link KeywordAttribute}.
*
* @param in
* TokenStream to filter
* @param keywordSet
* the keywords set to lookup the current termbuffer
*/
public KeywordMarkerFilter(final TokenStream in, final Set<?> keywordSet) {
this(in, keywordSet instanceof CharArraySet ? (CharArraySet) keywordSet
: CharArraySet.copy(Version.LUCENE_31, keywordSet));
}
@Override @Override
public final boolean incrementToken() throws IOException { public final boolean incrementToken() throws IOException {
if (input.incrementToken()) { if (input.incrementToken()) {

View File

@ -22,7 +22,6 @@ import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.util.Arrays; import java.util.Arrays;
import java.util.Locale; import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -139,7 +138,7 @@ public final class PatternAnalyzer extends Analyzer {
private final Pattern pattern; private final Pattern pattern;
private final boolean toLowerCase; private final boolean toLowerCase;
private final Set<?> stopWords; private final CharArraySet stopWords;
private final Version matchVersion; private final Version matchVersion;
@ -162,7 +161,7 @@ public final class PatternAnalyzer extends Analyzer {
* or <a href="http://www.unine.ch/info/clef/">other stop words * or <a href="http://www.unine.ch/info/clef/">other stop words
* lists </a>. * lists </a>.
*/ */
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set<?> stopWords) { public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, CharArraySet stopWords) {
if (pattern == null) if (pattern == null)
throw new IllegalArgumentException("pattern must not be null"); throw new IllegalArgumentException("pattern must not be null");
@ -404,12 +403,12 @@ public final class PatternAnalyzer extends Analyzer {
private int pos; private int pos;
private final boolean isLetter; private final boolean isLetter;
private final boolean toLowerCase; private final boolean toLowerCase;
private final Set<?> stopWords; private final CharArraySet stopWords;
private static final Locale locale = Locale.getDefault(); private static final Locale locale = Locale.getDefault();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) { public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, CharArraySet stopWords) {
super(input); super(input);
this.str = str; this.str = str;
this.isLetter = isLetter; this.isLetter = isLetter;

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -46,10 +45,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
* </p> * </p>
*/ */
public StemmerOverrideFilter(Version matchVersion, TokenStream input, public StemmerOverrideFilter(Version matchVersion, TokenStream input,
Map<?,String> dictionary) { CharArrayMap<String> dictionary) {
super(input); super(input);
this.dictionary = dictionary instanceof CharArrayMap ? this.dictionary = CharArrayMap.copy(matchVersion, dictionary);
(CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary);
} }
@Override @Override

View File

@ -28,18 +28,14 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Set;
import java.util.Map;
/** /**
* {@link Analyzer} for Dutch language. * {@link Analyzer} for Dutch language.
@ -56,6 +52,9 @@ import java.util.Map;
* <p>You must specify the required {@link Version} * <p>You must specify the required {@link Version}
* compatibility when creating DutchAnalyzer: * compatibility when creating DutchAnalyzer:
* <ul> * <ul>
* <li> As of 3.6, {@link #DutchAnalyzer(Version, CharArraySet)} and
* {@link #DutchAnalyzer(Version, CharArraySet, CharArraySet)} also populate
* the default entries for the stem override dictionary
* <li> As of 3.1, Snowball stemming is done with SnowballFilter, * <li> As of 3.1, Snowball stemming is done with SnowballFilter,
* LowerCaseFilter is used prior to StopFilter, and Snowball * LowerCaseFilter is used prior to StopFilter, and Snowball
* stopwords are used by default. * stopwords are used by default.
@ -75,13 +74,13 @@ public final class DutchAnalyzer extends Analyzer {
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static final CharArrayMap<String> DEFAULT_STEM_DICT;
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
@ -91,6 +90,12 @@ public final class DutchAnalyzer extends Analyzer {
// distribution (JAR) // distribution (JAR)
throw new RuntimeException("Unable to load default stopword set"); throw new RuntimeException("Unable to load default stopword set");
} }
DEFAULT_STEM_DICT = new CharArrayMap<String>(Version.LUCENE_CURRENT, 4, false);
DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet
DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
DEFAULT_STEM_DICT.put("ei", "eier");
DEFAULT_STEM_DICT.put("kind", "kinder");
} }
} }
@ -98,14 +103,14 @@ public final class DutchAnalyzer extends Analyzer {
/** /**
* Contains the stopwords used with the StopFilter. * Contains the stopwords used with the StopFilter.
*/ */
private final Set<?> stoptable; private final CharArraySet stoptable;
/** /**
* Contains words that should be indexed but not stemmed. * Contains words that should be indexed but not stemmed.
*/ */
private Set<?> excltable = Collections.emptySet(); private CharArraySet excltable = CharArraySet.EMPTY_SET;
private final Map<String, String> stemdict = new HashMap<String, String>(); private final CharArrayMap<String> stemdict;
private final Version matchVersion; private final Version matchVersion;
/** /**
@ -114,21 +119,33 @@ public final class DutchAnalyzer extends Analyzer {
* *
*/ */
public DutchAnalyzer(Version matchVersion) { public DutchAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); // historically, only this ctor populated the stem dict!!!!!
stemdict.put("fiets", "fiets"); //otherwise fiet this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
stemdict.put("ei", "eier");
stemdict.put("kind", "kinder");
} }
public DutchAnalyzer(Version matchVersion, Set<?> stopwords){ public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); // historically, this ctor never populated the stem dict!!!!!
// so we populate it only for >= 3.6
this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
matchVersion.onOrAfter(Version.LUCENE_36)
? DefaultSetHolder.DEFAULT_STEM_DICT
: CharArrayMap.<String>emptyMap());
} }
public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){ public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); // historically, this ctor never populated the stem dict!!!!!
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); // so we populate it only for >= 3.6
this(matchVersion, stopwords, stemExclusionTable,
matchVersion.onOrAfter(Version.LUCENE_36)
? DefaultSetHolder.DEFAULT_STEM_DICT
: CharArrayMap.<String>emptyMap());
}
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
this.matchVersion = matchVersion; this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
this.stemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
} }
/** /**

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.no;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.NorwegianStemmer;
* {@link Analyzer} for Norwegian. * {@link Analyzer} for Norwegian.
*/ */
public final class NorwegianAnalyzer extends StopwordAnalyzerBase { public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Norwegian stopwords. */ /** File containing default Norwegian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
@ -50,7 +49,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -59,7 +58,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -86,7 +85,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) { public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -99,7 +98,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pt;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.PortugueseStemmer;
* </ul> * </ul>
*/ */
public final class PortugueseAnalyzer extends StopwordAnalyzerBase { public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Portuguese stopwords. */ /** File containing default Portuguese stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
@ -57,7 +56,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -66,7 +65,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -93,7 +92,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) { public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -106,7 +105,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -22,6 +22,7 @@ import java.util.*;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
@ -179,7 +180,8 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
if (stopWords == null) { if (stopWords == null) {
return components; return components;
} }
StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(), stopWords); StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(),
new CharArraySet(matchVersion, stopWords, false));
return new TokenStreamComponents(components.getTokenizer(), stopFilter); return new TokenStreamComponents(components.getTokenizer(), stopFilter);
} }

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ro;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.RomanianStemmer;
* {@link Analyzer} for Romanian. * {@link Analyzer} for Romanian.
*/ */
public final class RomanianAnalyzer extends StopwordAnalyzerBase { public final class RomanianAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Romanian stopwords. */ /** File containing default Romanian stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -53,7 +52,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -62,7 +61,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -89,7 +88,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) { public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -102,7 +101,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ru;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Arrays; import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -78,10 +77,10 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
private static class DefaultSetHolder { private static class DefaultSetHolder {
/** @deprecated (3.1) remove this for Lucene 5.0 */ /** @deprecated (3.1) remove this for Lucene 5.0 */
@Deprecated @Deprecated
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(RUSSIAN_STOP_WORDS_30), false)); Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -95,14 +94,14 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
} }
} }
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** /**
* Returns an unmodifiable instance of the default stop-words set. * Returns an unmodifiable instance of the default stop-words set.
* *
* @return an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set.
*/ */
public static Set<?> getDefaultStopSet() { public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -120,7 +119,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){ public RussianAnalyzer(Version matchVersion, CharArraySet stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -133,7 +132,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
* a stopword set * a stopword set
* @param stemExclusionSet a set of words not to be stemmed * @param stemExclusionSet a set of words not to be stemmed
*/ */
public RussianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){ public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
} }

View File

@ -27,7 +27,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link /** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}. * LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
@ -48,7 +47,7 @@ import java.util.Set;
@Deprecated @Deprecated
public final class SnowballAnalyzer extends Analyzer { public final class SnowballAnalyzer extends Analyzer {
private String name; private String name;
private Set<?> stopSet; private CharArraySet stopSet;
private final Version matchVersion; private final Version matchVersion;
/** Builds the named analyzer with no stop words. */ /** Builds the named analyzer with no stop words. */
@ -58,7 +57,7 @@ public final class SnowballAnalyzer extends Analyzer {
} }
/** Builds the named analyzer with the given stop words. */ /** Builds the named analyzer with the given stop words. */
public SnowballAnalyzer(Version matchVersion, String name, Set<?> stopWords) { public SnowballAnalyzer(Version matchVersion, String name, CharArraySet stopWords) {
this(matchVersion, name); this(matchVersion, name);
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
stopWords)); stopWords));

View File

@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -28,7 +29,6 @@ import org.apache.lucene.util.Version;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
/** /**
* Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
@ -60,13 +60,13 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
/** An unmodifiable set containing some common English words that are usually not /** An unmodifiable set containing some common English words that are usually not
useful for searching. */ useful for searching. */
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer with the given stop words. /** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match See {@link * @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>} * <a href="#version">above</a>}
* @param stopWords stop words */ * @param stopWords stop words */
public ClassicAnalyzer(Version matchVersion, Set<?> stopWords) { public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) {
super(matchVersion, stopWords); super(matchVersion, stopWords);
} }
@ -79,15 +79,6 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
this(matchVersion, STOP_WORDS_SET); this(matchVersion, STOP_WORDS_SET);
} }
/** Builds an analyzer with the stop words from the given file.
* @see WordlistLoader#getWordSet(Reader, Version)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader. /** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader, Version) * @see WordlistLoader#getWordSet(Reader, Version)
* @param matchVersion Lucene version to match See {@link * @param matchVersion Lucene version to match See {@link

View File

@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -28,7 +29,6 @@ import org.apache.lucene.util.Version;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
/** /**
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
@ -61,13 +61,13 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
/** An unmodifiable set containing some common English words that are usually not /** An unmodifiable set containing some common English words that are usually not
useful for searching. */ useful for searching. */
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
/** Builds an analyzer with the given stop words. /** Builds an analyzer with the given stop words.
* @param matchVersion Lucene version to match See {@link * @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>} * <a href="#version">above</a>}
* @param stopWords stop words */ * @param stopWords stop words */
public StandardAnalyzer(Version matchVersion, Set<?> stopWords) { public StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
super(matchVersion, stopWords); super(matchVersion, stopWords);
} }
@ -80,15 +80,6 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
this(matchVersion, STOP_WORDS_SET); this(matchVersion, STOP_WORDS_SET);
} }
/** Builds an analyzer with the stop words from the given file.
* @see WordlistLoader#getWordSet(Reader, Version)
* @param matchVersion Lucene version to match See {@link
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader. /** Builds an analyzer with the stop words from the given reader.
* @see WordlistLoader#getWordSet(Reader, Version) * @see WordlistLoader#getWordSet(Reader, Version)
* @param matchVersion Lucene version to match See {@link * @param matchVersion Lucene version to match See {@link

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.sv;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.SwedishStemmer;
* {@link Analyzer} for Swedish. * {@link Analyzer} for Swedish.
*/ */
public final class SwedishAnalyzer extends StopwordAnalyzerBase { public final class SwedishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Swedish stopwords. */ /** File containing default Swedish stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
@ -50,7 +49,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -59,7 +58,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -86,7 +85,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) { public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -99,7 +98,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.th;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -55,7 +55,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -64,7 +64,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -93,7 +93,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public ThaiAnalyzer(Version matchVersion, Set<?> stopwords) { public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
} }

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.tr;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
@ -38,7 +37,7 @@ import org.tartarus.snowball.ext.TurkishStemmer;
* {@link Analyzer} for Turkish. * {@link Analyzer} for Turkish.
*/ */
public final class TurkishAnalyzer extends StopwordAnalyzerBase { public final class TurkishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
/** File containing default Turkish stopwords. */ /** File containing default Turkish stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -52,7 +51,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -61,7 +60,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static { static {
try { try {
@ -88,7 +87,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) { public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -101,7 +100,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet)); matchVersion, stemExclusionSet));

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.util;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
@ -46,7 +45,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
* @return the analyzer's stopword set or an empty set if the analyzer has no * @return the analyzer's stopword set or an empty set if the analyzer has no
* stopwords * stopwords
*/ */
public Set<?> getStopwordSet() { public CharArraySet getStopwordSet() {
return stopwords; return stopwords;
} }
@ -58,7 +57,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
* @param stopwords * @param stopwords
* the analyzer's stopword set * the analyzer's stopword set
*/ */
protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) { protected StopwordAnalyzerBase(final Version version, final CharArraySet stopwords) {
matchVersion = version; matchVersion = version;
// analyzers should use char array set for stopwords! // analyzers should use char array set for stopwords!
this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet

View File

@ -18,9 +18,6 @@ package org.apache.lucene.analysis.ar;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
@ -79,16 +76,14 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
* Test that custom stopwords work, and are not case-sensitive. * Test that custom stopwords work, and are not case-sensitive.
*/ */
public void testCustomStopwords() throws Exception { public void testCustomStopwords() throws Exception {
Set<String> set = new HashSet<String>(); CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false);
Collections.addAll(set, "the", "and", "a");
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set); ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set);
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick", assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" }); "brown", "fox" });
} }
public void testWithStemExclusionSet() throws IOException { public void testWithStemExclusionSet() throws IOException {
Set<String> set = new HashSet<String>(); CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("ساهدهات"), false);
set.add("ساهدهات");
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set); ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" }); assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" }); assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.bg;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -43,8 +42,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
} }
public void testCustomStopwords() throws IOException { public void testCustomStopwords() throws IOException {
Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, Collections Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
.emptySet());
assertAnalyzesTo(a, "Как се казваш?", assertAnalyzesTo(a, "Как се казваш?",
new String[] {"как", "се", "казваш"}); new String[] {"как", "се", "казваш"});
} }

View File

@ -136,7 +136,8 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
} }
public void testStemExclusionTable() throws Exception { public void testStemExclusionTable() throws Exception {
BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("quintessência")); BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT,
CharArraySet.EMPTY_SET, new CharArraySet(TEST_VERSION_CURRENT, asSet("quintessência"), false));
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged. checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
} }

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ca;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestCatalanAnalyzer extends BaseTokenStreamTestCase { public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -50,8 +49,7 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("llengües"), false);
exclusionSet.add("llengües");
Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT,
CatalanAnalyzer.getDefaultStopSet(), exclusionSet); CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "llengües", "llengües"); checkOneTermReuse(a, "llengües", "llengües");

View File

@ -21,6 +21,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import java.io.StringReader; import java.io.StringReader;
@ -58,10 +59,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
} }
public void testStopList() throws IOException { public void testStopList() throws IOException {
Set<Object> stopWordsSet = new HashSet<Object>(); CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
stopWordsSet.add("good");
stopWordsSet.add("test");
stopWordsSet.add("analyzer");
StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet); StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
StringReader reader = new StringReader("This is a good test of the english stop analyzer"); StringReader reader = new StringReader("This is a good test of the english stop analyzer");
TokenStream stream = newStop.tokenStream("test", reader); TokenStream stream = newStop.tokenStream("test", reader);
@ -75,10 +73,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
} }
public void testStopListPositions() throws IOException { public void testStopListPositions() throws IOException {
Set<Object> stopWordsSet = new HashSet<Object>(); CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
stopWordsSet.add("good");
stopWordsSet.add("test");
stopWordsSet.add("analyzer");
StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet); StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions"); StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions");
int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1}; int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1};

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.English; import org.apache.lucene.util.English;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -36,22 +37,15 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
public void testExactCase() throws IOException { public void testExactCase() throws IOException {
StringReader reader = new StringReader("Now is The Time"); StringReader reader = new StringReader("Now is The Time");
Set<String> stopWords = asSet("is", "the", "Time"); CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, asSet("is", "the", "Time"), false);
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, false); TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords);
assertTokenStreamContents(stream, new String[] { "Now", "The" }); assertTokenStreamContents(stream, new String[] { "Now", "The" });
} }
public void testIgnoreCase() throws IOException {
StringReader reader = new StringReader("Now is The Time");
Set<String> stopWords = asSet( "is", "the", "Time" );
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, true);
assertTokenStreamContents(stream, new String[] { "Now" });
}
public void testStopFilt() throws IOException { public void testStopFilt() throws IOException {
StringReader reader = new StringReader("Now is The Time"); StringReader reader = new StringReader("Now is The Time");
String[] stopWords = new String[] { "is", "the", "Time" }; String[] stopWords = new String[] { "is", "the", "Time" };
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords); CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
assertTokenStreamContents(stream, new String[] { "Now", "The" }); assertTokenStreamContents(stream, new String[] { "Now", "The" });
} }
@ -70,7 +64,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
log(sb.toString()); log(sb.toString());
String stopWords[] = a.toArray(new String[0]); String stopWords[] = a.toArray(new String[0]);
for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]); for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords); CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
// with increments // with increments
StringReader reader = new StringReader(sb.toString()); StringReader reader = new StringReader(sb.toString());
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
@ -93,8 +87,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]); for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
String stopWords1[] = a1.toArray(new String[0]); String stopWords1[] = a1.toArray(new String[0]);
for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]); for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
Set<Object> stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0); CharArraySet stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
Set<Object> stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1); CharArraySet stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
reader = new StringReader(sb.toString()); reader = new StringReader(sb.toString());
StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
stpf0.setEnablePositionIncrements(true); stpf0.setEnablePositionIncrements(true);

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.da;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestDanishAnalyzer extends BaseTokenStreamTestCase { public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("undersøgelse"), false);
exclusionSet.add("undersøgelse");
Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT,
DanishAnalyzer.getDefaultStopSet(), exclusionSet); DanishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "undersøgelse", "undersøgelse"); checkOneTermReuse(a, "undersøgelse", "undersøgelse");

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.de;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -46,7 +45,8 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
} }
public void testStemExclusionTable() throws Exception { public void testStemExclusionTable() throws Exception {
GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("tischen")); GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
new CharArraySet(TEST_VERSION_CURRENT, asSet("tischen"), false));
checkOneTermReuse(a, "tischen", "tischen"); checkOneTermReuse(a, "tischen", "tischen");
} }

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.en;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestEnglishAnalyzer extends BaseTokenStreamTestCase { public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -45,8 +44,7 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("books"), false);
exclusionSet.add("books");
Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT,
EnglishAnalyzer.getDefaultStopSet(), exclusionSet); EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "books", "books"); checkOneTermReuse(a, "books", "books");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.es;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestSpanishAnalyzer extends BaseTokenStreamTestCase { public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chicano"), false);
exclusionSet.add("chicano");
Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT,
SpanishAnalyzer.getDefaultStopSet(), exclusionSet); SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "chicana", "chican"); checkOneTermReuse(a, "chicana", "chican");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.eu;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestBasqueAnalyzer extends BaseTokenStreamTestCase { public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("zaldiak"), false);
exclusionSet.add("zaldiak");
Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT,
BasqueAnalyzer.getDefaultStopSet(), exclusionSet); BasqueAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "zaldiak", "zaldiak"); checkOneTermReuse(a, "zaldiak", "zaldiak");

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.fa;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
/** /**
* Test the Persian Analyzer * Test the Persian Analyzer
@ -215,7 +216,8 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
* Test that custom stopwords work, and are not case-sensitive. * Test that custom stopwords work, and are not case-sensitive.
*/ */
public void testCustomStopwords() throws Exception { public void testCustomStopwords() throws Exception {
PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, asSet("the", "and", "a")); PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT,
new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false));
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick", assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" }); "brown", "fox" });
} }

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.fi;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestFinnishAnalyzer extends BaseTokenStreamTestCase { public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
exclusionSet.add("edeltäjistään");
Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT,
FinnishAnalyzer.getDefaultStopSet(), exclusionSet); FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj"); checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");

View File

@ -20,15 +20,14 @@ package org.apache.lucene.analysis.fr;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
/** /**
* *
@ -38,9 +37,7 @@ public class TestElision extends BaseTokenStreamTestCase {
public void testElision() throws Exception { public void testElision() throws Exception {
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin."; String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test)); Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
Set<String> articles = new HashSet<String>(); CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
articles.add("l");
articles.add("M");
TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles); TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles);
List<String> tas = filter(filter); List<String> tas = filter(filter);
assertEquals("embrouille", tas.get(4)); assertEquals("embrouille", tas.get(4));

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.gl;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestGalicianAnalyzer extends BaseTokenStreamTestCase { public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("correspondente"), false);
exclusionSet.add("correspondente");
Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT,
GalicianAnalyzer.getDefaultStopSet(), exclusionSet); GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "correspondente", "correspondente"); checkOneTermReuse(a, "correspondente", "correspondente");

View File

@ -1,10 +1,8 @@
package org.apache.lucene.analysis.hi; package org.apache.lucene.analysis.hi;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -41,8 +39,7 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
} }
public void testExclusionSet() throws Exception { public void testExclusionSet() throws Exception {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("हिंदी"), false);
exclusionSet.add("हिंदी");
Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT,
HindiAnalyzer.getDefaultStopSet(), exclusionSet); HindiAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "हिंदी", "हिंदी"); checkOneTermReuse(a, "हिंदी", "हिंदी");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hu;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestHungarianAnalyzer extends BaseTokenStreamTestCase { public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
exclusionSet.add("babakocsi");
Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT,
HungarianAnalyzer.getDefaultStopSet(), exclusionSet); HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "babakocsi", "babakocsi"); checkOneTermReuse(a, "babakocsi", "babakocsi");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hy;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestArmenianAnalyzer extends BaseTokenStreamTestCase { public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("արծիվներ"), false);
exclusionSet.add("արծիվներ");
Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT,
ArmenianAnalyzer.getDefaultStopSet(), exclusionSet); ArmenianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "արծիվներ", "արծիվներ"); checkOneTermReuse(a, "արծիվներ", "արծիվներ");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.id;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase { public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("peledakan"), false);
exclusionSet.add("peledakan");
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
IndonesianAnalyzer.getDefaultStopSet(), exclusionSet); IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "peledakan", "peledakan"); checkOneTermReuse(a, "peledakan", "peledakan");

View File

@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
public class TestItalianAnalyzer extends BaseTokenStreamTestCase { public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
@ -44,8 +45,7 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("abbandonata"), false);
exclusionSet.add("abbandonata");
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
ItalianAnalyzer.getDefaultStopSet(), exclusionSet); ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "abbandonata", "abbandonata"); checkOneTermReuse(a, "abbandonata", "abbandonata");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.lv;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase { public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("tirgiem"), false);
exclusionSet.add("tirgiem");
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
LatvianAnalyzer.getDefaultStopSet(), exclusionSet); LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "tirgiem", "tirgiem"); checkOneTermReuse(a, "tirgiem", "tirgiem");

View File

@ -2,10 +2,7 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale; import java.util.Locale;
import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
@ -47,12 +44,11 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(new LowerCaseFilterMock( assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new MockTokenizer(new StringReader( new KeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output); "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
Set<String> jdkSet = new HashSet<String>(); CharArraySet mixedCaseSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("LuceneFox"), false);
jdkSet.add("LuceneFox");
assertTokenStreamContents(new LowerCaseFilterMock( assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new MockTokenizer(new StringReader( new KeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), jdkSet)), output); "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), mixedCaseSet)), output);
Set<?> set2 = set; CharArraySet set2 = set;
assertTokenStreamContents(new LowerCaseFilterMock( assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new MockTokenizer(new StringReader( new KeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output); "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
@ -64,8 +60,8 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
new KeywordMarkerFilter( new KeywordMarkerFilter(
new KeywordMarkerFilter( new KeywordMarkerFilter(
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false), new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
new HashSet<String>(Arrays.asList("Birds", "Houses"))), new CharArraySet(TEST_VERSION_CURRENT, asSet("Birds", "Houses"), false)),
new HashSet<String>(Arrays.asList("Dogs", "Trees")))); new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false)));
assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" }); assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
} }

View File

@ -2,12 +2,11 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
@ -33,7 +32,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
// lets make booked stem to books // lets make booked stem to books
// the override filter will convert "booked" to "books", // the override filter will convert "booked" to "books",
// but also mark it with KeywordAttribute so Porter will not change it. // but also mark it with KeywordAttribute so Porter will not change it.
Map<String,String> dictionary = new HashMap<String,String>(); CharArrayMap<String> dictionary = new CharArrayMap<String>(TEST_VERSION_CURRENT, 1, false);
dictionary.put("booked", "books"); dictionary.put("booked", "books");
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked")); Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
TokenStream stream = new PorterStemFilter( TokenStream stream = new PorterStemFilter(

View File

@ -17,7 +17,6 @@ package org.apache.lucene.analysis.nl;
* limitations under the License. * limitations under the License.
*/ */
import java.io.File;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -150,6 +149,26 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
} }
/**
* check that the default stem overrides are used
* even if you use a non-default ctor.
*/
public void testStemOverrides() throws IOException {
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
checkOneTerm(a, "fiets", "fiets");
}
/**
* prior to 3.6, this confusingly did not happen if
* you specified your own stoplist!!!!
* @deprecated (3.6) Remove this test in Lucene 5.0
*/
@Deprecated
public void testBuggyStemOverrides() throws IOException {
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_35, CharArraySet.EMPTY_SET);
checkOneTerm(a, "fiets", "fiet");
}
/** /**
* Prior to 3.1, this analyzer had no lowercase filter. * Prior to 3.1, this analyzer had no lowercase filter.
* stopwords were case sensitive. Preserve this for back compat. * stopwords were case sensitive. Preserve this for back compat.

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.no;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase { public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("havnedistriktene"), false);
exclusionSet.add("havnedistriktene");
Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT,
NorwegianAnalyzer.getDefaultStopSet(), exclusionSet); NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "havnedistriktene", "havnedistriktene"); checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pt;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase { public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
exclusionSet.add("quilométricas");
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet); PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "quilométricas", "quilométricas"); checkOneTermReuse(a, "quilométricas", "quilométricas");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ro;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestRomanianAnalyzer extends BaseTokenStreamTestCase { public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("absenţa"), false);
exclusionSet.add("absenţa");
Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT,
RomanianAnalyzer.getDefaultStopSet(), exclusionSet); RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "absenţa", "absenţa"); checkOneTermReuse(a, "absenţa", "absenţa");

View File

@ -18,12 +18,10 @@ package org.apache.lucene.analysis.sv;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.hu.HungarianAnalyzer; import org.apache.lucene.analysis.util.CharArraySet;
public class TestSwedishAnalyzer extends BaseTokenStreamTestCase { public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -44,8 +42,7 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlarne"), false);
exclusionSet.add("jaktkarlarne");
Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT,
SwedishAnalyzer.getDefaultStopSet(), exclusionSet); SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne"); checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.tr;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestTurkishAnalyzer extends BaseTokenStreamTestCase { public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("ağacı"), false);
exclusionSet.add("ağacı");
Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT,
TurkishAnalyzer.getDefaultStopSet(), exclusionSet); TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "ağacı", "ağacı"); checkOneTermReuse(a, "ağacı", "ağacı");

View File

@ -39,13 +39,13 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS); this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
} }
public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) { public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.segmenter = segmenter; this.segmenter = segmenter;
this.stoptags = stoptags; this.stoptags = stoptags;
} }
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET; return DefaultSetHolder.DEFAULT_STOP_SET;
} }
@ -58,7 +58,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
* outer class accesses the static final set the first time. * outer class accesses the static final set the first time.
*/ */
private static class DefaultSetHolder { private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static final Set<String> DEFAULT_STOP_TAGS; static final Set<String> DEFAULT_STOP_TAGS;
static { static {

View File

@ -18,10 +18,7 @@
package org.apache.lucene.analysis.cn.smart; package org.apache.lucene.analysis.cn.smart;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.util.Collections;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
@ -58,7 +55,7 @@ import org.apache.lucene.util.Version;
*/ */
public final class SmartChineseAnalyzer extends Analyzer { public final class SmartChineseAnalyzer extends Analyzer {
private final Set<?> stopWords; private final CharArraySet stopWords;
private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -120,7 +117,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
*/ */
public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) { public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) {
stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
: Collections.EMPTY_SET; : CharArraySet.EMPTY_SET;
this.matchVersion = matchVersion; this.matchVersion = matchVersion;
} }
@ -133,8 +130,8 @@ public final class SmartChineseAnalyzer extends Analyzer {
* </p> * </p>
* @param stopWords {@link Set} of stopwords to use. * @param stopWords {@link Set} of stopwords to use.
*/ */
public SmartChineseAnalyzer(Version matchVersion, Set stopWords) { public SmartChineseAnalyzer(Version matchVersion, CharArraySet stopWords) {
this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords; this.stopWords = stopWords==null?CharArraySet.EMPTY_SET:stopWords;
this.matchVersion = matchVersion; this.matchVersion = matchVersion;
} }
@ -147,7 +144,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
// The porter stemming is too strict, this is not a bug, this is a feature:) // The porter stemming is too strict, this is not a bug, this is a feature:)
result = new PorterStemFilter(result); result = new PorterStemFilter(result);
if (!stopWords.isEmpty()) { if (!stopWords.isEmpty()) {
result = new StopFilter(matchVersion, result, stopWords, false); result = new StopFilter(matchVersion, result, stopWords);
} }
return new TokenStreamComponents(tokenizer, result); return new TokenStreamComponents(tokenizer, result);
} }

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pl;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
@ -42,7 +41,7 @@ import org.egothor.stemmer.Trie;
* {@link Analyzer} for Polish. * {@link Analyzer} for Polish.
*/ */
public final class PolishAnalyzer extends StopwordAnalyzerBase { public final class PolishAnalyzer extends StopwordAnalyzerBase {
private final Set<?> stemExclusionSet; private final CharArraySet stemExclusionSet;
private final Trie stemTable; private final Trie stemTable;
/** File containing default Polish stopwords. */ /** File containing default Polish stopwords. */
@ -55,7 +54,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
* Returns an unmodifiable instance of the default stop words set. * Returns an unmodifiable instance of the default stop words set.
* @return default stop words set. * @return default stop words set.
*/ */
public static Set<?> getDefaultStopSet(){ public static CharArraySet getDefaultStopSet(){
return DefaultsHolder.DEFAULT_STOP_SET; return DefaultsHolder.DEFAULT_STOP_SET;
} }
@ -64,7 +63,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
* accesses the static final set the first time.; * accesses the static final set the first time.;
*/ */
private static class DefaultsHolder { private static class DefaultsHolder {
static final Set<?> DEFAULT_STOP_SET; static final CharArraySet DEFAULT_STOP_SET;
static final Trie DEFAULT_TABLE; static final Trie DEFAULT_TABLE;
static { static {
@ -100,7 +99,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion lucene compatibility version * @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public PolishAnalyzer(Version matchVersion, Set<?> stopwords) { public PolishAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
@ -113,7 +112,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public PolishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { public PolishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords); super(matchVersion, stopwords);
this.stemTable = DefaultsHolder.DEFAULT_TABLE; this.stemTable = DefaultsHolder.DEFAULT_TABLE;
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pl;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestPolishAnalyzer extends BaseTokenStreamTestCase { public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the /** This test fails with NPE when the
@ -43,8 +42,7 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>(); CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("studenta"), false);;
exclusionSet.add("studenta");
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT, Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT,
PolishAnalyzer.getDefaultStopSet(), exclusionSet); PolishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "studenta", "studenta"); checkOneTermReuse(a, "studenta", "studenta");

View File

@ -93,15 +93,14 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements
.getTokenFilterFactories(); .getTokenFilterFactories();
for (TokenFilterFactory factory : filterFactories) { for (TokenFilterFactory factory : filterFactories) {
if (factory instanceof StopFilterFactory) { if (factory instanceof StopFilterFactory) {
// StopFilterFactory holds the stop words in a CharArraySet, but // StopFilterFactory holds the stop words in a CharArraySet
// the getStopWords() method returns a Set<?>, so we need to cast.
solrStopWords.put(fieldName, solrStopWords.put(fieldName,
(CharArraySet) ((StopFilterFactory) factory).getStopWords()); ((StopFilterFactory) factory).getStopWords());
} }
if (factory instanceof CommonGramsFilterFactory) { if (factory instanceof CommonGramsFilterFactory) {
solrStopWords.put(fieldName, solrStopWords.put(fieldName,
(CharArraySet) ((CommonGramsFilterFactory) factory) ((CommonGramsFilterFactory) factory)
.getCommonWords()); .getCommonWords());
} }
} }

View File

@ -17,7 +17,6 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import java.io.IOException; import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@ -71,12 +70,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
return ignoreCase; return ignoreCase;
} }
public Set<?> getCommonWords() { public CharArraySet getCommonWords() {
return commonWords; return commonWords;
} }
public CommonGramsFilter create(TokenStream input) { public CommonGramsFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase); CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
return commonGrams; return commonGrams;
} }
} }

View File

@ -18,7 +18,6 @@ package org.apache.solr.analysis;
import java.io.IOException; import java.io.IOException;
import java.util.Map; import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@ -80,7 +79,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
return ignoreCase; return ignoreCase;
} }
public Set<?> getCommonWords() { public CharArraySet getCommonWords() {
return commonWords; return commonWords;
} }
@ -88,8 +87,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
* Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
*/ */
public CommonGramsQueryFilter create(TokenStream input) { public CommonGramsQueryFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
ignoreCase);
CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter( CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
commonGrams); commonGrams);
return commonGramsQuery; return commonGramsQuery;

View File

@ -25,7 +25,6 @@ import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.io.IOException; import java.io.IOException;
/** /**
@ -81,13 +80,13 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
return ignoreCase; return ignoreCase;
} }
public Set<?> getStopWords() { public CharArraySet getStopWords() {
return stopWords; return stopWords;
} }
@Override @Override
public TokenStream create(TokenStream input) { public TokenStream create(TokenStream input) {
StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase); StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords);
stopFilter.setEnablePositionIncrements(enablePositionIncrements); stopFilter.setEnablePositionIncrements(enablePositionIncrements);
return stopFilter; return stopFilter;
} }

View File

@ -20,11 +20,11 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.core.SolrResourceLoader;
import java.io.StringReader; import java.io.StringReader;
import java.util.Set;
import java.util.Map; import java.util.Map;
import java.util.HashMap; import java.util.HashMap;
@ -44,7 +44,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
args.put("ignoreCase", "true"); args.put("ignoreCase", "true");
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set<?> words = factory.getCommonWords(); CharArraySet words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, assertTrue("words Size: " + words.size() + " is not: " + 2,
words.size() == 2); words.size() == 2);
@ -89,7 +89,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM); Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set<?> words = factory.getCommonWords(); CharArraySet words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the")); assertTrue(words.contains("the"));
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);

View File

@ -19,11 +19,11 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.core.SolrResourceLoader;
import java.io.StringReader; import java.io.StringReader;
import java.util.Set;
import java.util.Map; import java.util.Map;
import java.util.HashMap; import java.util.HashMap;
@@ -43,7 +43,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
args.put("ignoreCase", "true"); args.put("ignoreCase", "true");
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set<?> words = factory.getCommonWords(); CharArraySet words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, assertTrue("words Size: " + words.size() + " is not: " + 2,
words.size() == 2); words.size() == 2);
@@ -88,7 +88,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM); Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set<?> words = factory.getCommonWords(); CharArraySet words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the")); assertTrue(words.contains("the"));
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);

View File

@@ -16,10 +16,10 @@ package org.apache.solr.analysis;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.core.SolrResourceLoader;
import java.util.Set;
import java.util.Map; import java.util.Map;
import java.util.HashMap; import java.util.HashMap;
@@ -38,7 +38,7 @@ public class TestKeepFilterFactory extends BaseTokenTestCase{
args.put("ignoreCase", "true"); args.put("ignoreCase", "true");
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set<?> words = factory.getWords(); CharArraySet words = factory.getWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);

View File

@@ -17,10 +17,10 @@ package org.apache.solr.analysis;
*/ */
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.ResourceLoader; import org.apache.solr.common.ResourceLoader;
import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.core.SolrResourceLoader;
import java.util.Set;
import java.util.Map; import java.util.Map;
import java.util.HashMap; import java.util.HashMap;
@@ -39,7 +39,7 @@ public class TestStopFilterFactory extends BaseTokenTestCase {
args.put("ignoreCase", "true"); args.put("ignoreCase", "true");
factory.init(args); factory.init(args);
factory.inform(loader); factory.inform(loader);
Set<?> words = factory.getStopWords(); CharArraySet words = factory.getStopWords();
assertTrue("words is null and it shouldn't be", words != null); assertTrue("words is null and it shouldn't be", words != null);
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);