mirror of https://github.com/apache/lucene.git
LUCENE-3765: Trappy behavior with StopFilter/ignoreCase
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242497 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ef65f76824
commit
72ae3171be
|
@ -779,6 +779,11 @@ API Changes
|
|||
to be merged. To mimic the old behaviour, just use IndexReader.directory()
|
||||
for choosing the provider by Directory. (Uwe Schindler)
|
||||
|
||||
* LUCENE-3765: Deprecated StopFilter ctor that took ignoreCase, because
|
||||
in some cases (if the set is a CharArraySet), the argument is ignored.
|
||||
Deprecated StandardAnalyzer and ClassicAnalyzer ctors that take File,
|
||||
please use the Reader ctor instead. (Robert Muir)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
|
||||
|
|
|
@ -218,6 +218,10 @@ Bug Fixes
|
|||
* LUCENE-3719: FVH: slow performance on very large queries.
|
||||
(Igor Motov via Koji Sekiguchi)
|
||||
|
||||
* LUCENE-3765: As of Version.LUCENE_36, DutchAnalyzer's two ctors
|
||||
that take stopwords and stem exclusion tables also initialize
|
||||
the default stem overrides (e.g. kind/kinder, fiets). (Robert Muir)
|
||||
|
||||
Documentation
|
||||
|
||||
* LUCENE-3599: Javadocs for DistanceUtils.haversine() were incorrectly
|
||||
|
|
|
@ -29,6 +29,11 @@ API Changes
|
|||
since they prevent reuse. Both Analyzers should be configured at instantiation.
|
||||
(Chris Male)
|
||||
|
||||
* LUCENE-3765: Stopset ctors that previously took Set<?> or Map<?,String> now take
|
||||
CharArraySet and CharArrayMap respectively. Previously the behavior was confusing,
|
||||
and sometimes different depending on the type of set, and ultimately a CharArraySet
|
||||
or CharArrayMap was always used anyway. (Robert Muir)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ar;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -63,7 +62,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -72,7 +71,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -85,7 +84,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
}
|
||||
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
|
@ -102,7 +101,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -118,7 +117,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stemExclusionSet
|
||||
* a set of terms not to be stemmed
|
||||
*/
|
||||
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
|
||||
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -56,7 +56,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
|||
*
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet() {
|
||||
public static CharArraySet getDefaultStopSet() {
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -65,7 +65,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
|||
* class accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -78,7 +78,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
}
|
||||
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
|
@ -91,7 +91,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
|||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -100,7 +100,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
|||
* If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter}
|
||||
* before {@link BulgarianStemFilter}.
|
||||
*/
|
||||
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet)); }
|
||||
|
|
|
@ -19,8 +19,6 @@ package org.apache.lucene.analysis.br;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -56,12 +54,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -79,7 +77,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
|||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Set<?> excltable = Collections.emptySet();
|
||||
private CharArraySet excltable = CharArraySet.EMPTY_SET;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
|
||||
|
@ -96,7 +94,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
|
@ -108,8 +106,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords,
|
||||
Set<?> stemExclusionSet) {
|
||||
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords,
|
||||
CharArraySet stemExclusionSet) {
|
||||
this(matchVersion, stopwords);
|
||||
excltable = CharArraySet.unmodifiableSet(CharArraySet
|
||||
.copy(matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ca;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -49,7 +48,7 @@ import org.tartarus.snowball.ext.CatalanStemmer;
|
|||
* </ul>
|
||||
*/
|
||||
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Catalan stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
@ -64,7 +63,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -73,7 +72,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -100,7 +99,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public CatalanAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -113,7 +112,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public CatalanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cjk;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -27,6 +26,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -49,12 +49,12 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -82,7 +82,7 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
package org.apache.lucene.analysis.commongrams;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -69,35 +68,9 @@ public final class CommonGramsFilter extends TokenFilter {
|
|||
* @param input TokenStream input in filter chain
|
||||
* @param commonWords The set of common words.
|
||||
*/
|
||||
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
|
||||
this(matchVersion, input, commonWords, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input using a Set of common
|
||||
* words to create bigrams, case-sensitive if ignoreCase is false (unless Set
|
||||
* is CharArraySet). If <code>commonWords</code> is an instance of
|
||||
* {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
|
||||
* construct the set) it will be directly used and <code>ignoreCase</code>
|
||||
* will be ignored since <code>CharArraySet</code> directly controls case
|
||||
* sensitivity.
|
||||
* <p/>
|
||||
* If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
|
||||
* new CharArraySet will be constructed and <code>ignoreCase</code> will be
|
||||
* used to specify the case sensitivity of that set.
|
||||
*
|
||||
* @param input TokenStream input in filter chain.
|
||||
* @param commonWords The set of common words.
|
||||
* @param ignoreCase -Ignore case when constructing bigrams for common words.
|
||||
*/
|
||||
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
|
||||
public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
|
||||
super(input);
|
||||
if (commonWords instanceof CharArraySet) {
|
||||
this.commonWords = (CharArraySet) commonWords;
|
||||
} else {
|
||||
this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
|
||||
this.commonWords.addAll(commonWords);
|
||||
}
|
||||
this.commonWords = commonWords;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -18,10 +18,7 @@ package org.apache.lucene.analysis.compound;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
|
@ -43,13 +40,6 @@ import org.apache.lucene.util.Version;
|
|||
* supplementary characters in strings and char arrays provided as compound word
|
||||
* dictionaries.
|
||||
* </ul>
|
||||
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
|
||||
* it should be case-insensitive unless it contains only lowercased entries and you
|
||||
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
|
||||
* For optional performance (as this filter does lots of lookups to the dictionary,
|
||||
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
|
||||
* {@link Set Sets} to the ctors, they will be automatically
|
||||
* transformed to case-insensitive!
|
||||
*/
|
||||
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
||||
/**
|
||||
|
@ -80,15 +70,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
|||
|
||||
private AttributeSource.State current;
|
||||
|
||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
|
||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
|
||||
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
|
||||
}
|
||||
|
||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
|
||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary) {
|
||||
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
|
||||
}
|
||||
|
||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
||||
super(input);
|
||||
|
||||
this.tokens=new LinkedList<CompoundToken>();
|
||||
|
@ -96,12 +86,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
|||
this.minSubwordSize=minSubwordSize;
|
||||
this.maxSubwordSize=maxSubwordSize;
|
||||
this.onlyLongestMatch=onlyLongestMatch;
|
||||
|
||||
if (dictionary==null || dictionary instanceof CharArraySet) {
|
||||
this.dictionary = (CharArraySet) dictionary;
|
||||
} else {
|
||||
this.dictionary = new CharArraySet(matchVersion, dictionary, true);
|
||||
}
|
||||
this.dictionary = dictionary;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.util.Set;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -38,13 +39,6 @@ import org.apache.lucene.util.Version;
|
|||
* supplementary characters in strings and char arrays provided as compound word
|
||||
* dictionaries.
|
||||
* </ul>
|
||||
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
|
||||
* it should be case-insensitive unless it contains only lowercased entries and you
|
||||
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
|
||||
* For optional performance (as this filter does lots of lookups to the dictionary,
|
||||
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
|
||||
* {@link Set Sets} to the ctors, they will be automatically
|
||||
* transformed to case-insensitive!
|
||||
*/
|
||||
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
|
||||
|
||||
|
@ -61,7 +55,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
|
|||
* @param dictionary
|
||||
* the word dictionary to match against.
|
||||
*/
|
||||
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
|
||||
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) {
|
||||
super(matchVersion, input, dictionary);
|
||||
}
|
||||
|
||||
|
@ -86,7 +80,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
|
|||
* @param onlyLongestMatch
|
||||
* Add only the longest matching subword to the stream
|
||||
*/
|
||||
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
|
||||
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary,
|
||||
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
||||
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
|
||||
}
|
||||
|
|
|
@ -18,12 +18,12 @@ package org.apache.lucene.analysis.compound;
|
|||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
|
||||
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
|
@ -41,13 +41,6 @@ import org.xml.sax.InputSource;
|
|||
* supplementary characters in strings and char arrays provided as compound word
|
||||
* dictionaries.
|
||||
* </ul>
|
||||
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
|
||||
* it should be case-insensitive unless it contains only lowercased entries and you
|
||||
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
|
||||
* For optional performance (as this filter does lots of lookups to the dictionary,
|
||||
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
|
||||
* {@link Set Sets} to the ctors, they will be automatically
|
||||
* transformed to case-insensitive!
|
||||
*/
|
||||
public class HyphenationCompoundWordTokenFilter extends
|
||||
CompoundWordTokenFilterBase {
|
||||
|
@ -69,7 +62,7 @@ public class HyphenationCompoundWordTokenFilter extends
|
|||
* the word dictionary to match against.
|
||||
*/
|
||||
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
||||
HyphenationTree hyphenator, Set<?> dictionary) {
|
||||
HyphenationTree hyphenator, CharArraySet dictionary) {
|
||||
this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
|
||||
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
|
||||
}
|
||||
|
@ -98,7 +91,7 @@ public class HyphenationCompoundWordTokenFilter extends
|
|||
* Add only the longest matching subword to the stream
|
||||
*/
|
||||
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
||||
HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
|
||||
HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
|
||||
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
||||
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
|
||||
onlyLongestMatch);
|
||||
|
@ -109,14 +102,14 @@ public class HyphenationCompoundWordTokenFilter extends
|
|||
/**
|
||||
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
|
||||
* <p>
|
||||
* Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
|
||||
* Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
|
||||
* HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
|
||||
* null, minWordSize, minSubwordSize, maxSubwordSize, false)}
|
||||
*/
|
||||
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
||||
HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
|
||||
int maxSubwordSize) {
|
||||
this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
|
||||
this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize,
|
||||
maxSubwordSize, false);
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -46,7 +45,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
|
|||
|
||||
/** An unmodifiable set containing some common English words that are not usually useful
|
||||
for searching.*/
|
||||
public static final Set<?> ENGLISH_STOP_WORDS_SET;
|
||||
public static final CharArraySet ENGLISH_STOP_WORDS_SET;
|
||||
|
||||
static {
|
||||
final List<String> stopWords = Arrays.asList(
|
||||
|
@ -72,7 +71,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
|
|||
/** Builds an analyzer with the stop words from the given set.
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param stopWords Set of stop words */
|
||||
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
|
||||
public StopAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||
super(matchVersion, stopWords);
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
|
|||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -44,34 +43,6 @@ public final class StopFilter extends FilteringTokenFilter {
|
|||
|
||||
private final CharArraySet stopWords;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input. If
|
||||
* <code>stopWords</code> is an instance of {@link CharArraySet} (true if
|
||||
* <code>makeStopSet()</code> was used to construct the set) it will be
|
||||
* directly used and <code>ignoreCase</code> will be ignored since
|
||||
* <code>CharArraySet</code> directly controls case sensitivity.
|
||||
* <p/>
|
||||
* If <code>stopWords</code> is not an instance of {@link CharArraySet}, a new
|
||||
* CharArraySet will be constructed and <code>ignoreCase</code> will be used
|
||||
* to specify the case sensitivity of that set.
|
||||
*
|
||||
* @param matchVersion
|
||||
* Lucene version to enable correct Unicode 4.0 behavior in the stop
|
||||
* set if Version > 3.0. See <a href="#version">above</a> for details.
|
||||
* @param input
|
||||
* Input TokenStream
|
||||
* @param stopWords
|
||||
* A Set of Strings or char[] or any other toString()-able set
|
||||
* representing the stopwords
|
||||
* @param ignoreCase
|
||||
* if true, all words are lower cased first
|
||||
*/
|
||||
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
|
||||
{
|
||||
super(true, input);
|
||||
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a filter which removes words from the input TokenStream that are
|
||||
|
@ -83,12 +54,12 @@ public final class StopFilter extends FilteringTokenFilter {
|
|||
* @param in
|
||||
* Input stream
|
||||
* @param stopWords
|
||||
* A Set of Strings or char[] or any other toString()-able set
|
||||
* representing the stopwords
|
||||
* A {@link CharArraySet} representing the stopwords.
|
||||
* @see #makeStopSet(Version, java.lang.String...)
|
||||
*/
|
||||
public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) {
|
||||
this(matchVersion, in, stopWords, false);
|
||||
public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
|
||||
super(true, in);
|
||||
this.stopWords = stopWords;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -101,7 +72,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
|||
* @param stopWords An array of stopwords
|
||||
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
|
||||
*/
|
||||
public static Set<Object> makeStopSet(Version matchVersion, String... stopWords) {
|
||||
public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) {
|
||||
return makeStopSet(matchVersion, stopWords, false);
|
||||
}
|
||||
|
||||
|
@ -116,7 +87,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
|||
* @return A Set ({@link CharArraySet}) containing the words
|
||||
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
|
||||
*/
|
||||
public static Set<Object> makeStopSet(Version matchVersion, List<?> stopWords) {
|
||||
public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords) {
|
||||
return makeStopSet(matchVersion, stopWords, false);
|
||||
}
|
||||
|
||||
|
@ -128,7 +99,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
|||
* @param ignoreCase If true, all words are lower cased first.
|
||||
* @return a Set containing the words
|
||||
*/
|
||||
public static Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
|
||||
public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
|
||||
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
|
||||
stopSet.addAll(Arrays.asList(stopWords));
|
||||
return stopSet;
|
||||
|
@ -141,7 +112,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
|||
* @param ignoreCase if true, all words are lower cased first
|
||||
* @return A Set ({@link CharArraySet}) containing the words
|
||||
*/
|
||||
public static Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
|
||||
public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
|
||||
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
|
||||
stopSet.addAll(stopWords);
|
||||
return stopSet;
|
||||
|
|
|
@ -32,7 +32,6 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Czech language.
|
||||
|
@ -62,12 +61,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
|||
*
|
||||
* @return a set of default Czech-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
public static final CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
private static final Set<?> DEFAULT_SET;
|
||||
private static final CharArraySet DEFAULT_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -82,7 +81,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
|
||||
|
||||
private final Set<?> stemExclusionTable;
|
||||
private final CharArraySet stemExclusionTable;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
|
||||
|
@ -101,7 +100,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
|||
* <a href="#version">above</a>
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -114,7 +113,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionTable a stemming exclusion set
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) {
|
||||
public CzechAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
||||
}
|
||||
|
@ -129,7 +128,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
|||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
||||
* , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
|
||||
* a version is >= LUCENE_31 and a stem exclusion set is provided via
|
||||
* {@link #CzechAnalyzer(Version, Set, Set)} a
|
||||
* {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
|
||||
* {@link KeywordMarkerFilter} is added before
|
||||
* {@link CzechStemFilter}.
|
||||
*/
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.da;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.DanishStemmer;
|
|||
* {@link Analyzer} for Danish.
|
||||
*/
|
||||
public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Danish stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";
|
||||
|
@ -50,7 +49,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -59,7 +58,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -86,7 +85,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public DanishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public DanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -99,7 +98,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -21,7 +21,6 @@ package org.apache.lucene.analysis.de;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -90,16 +89,16 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns a set of default German-stopwords
|
||||
* @return a set of default German-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
public static final CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
/** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
|
||||
@Deprecated
|
||||
private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||
private static final CharArraySet DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
|
||||
private static final Set<?> DEFAULT_SET;
|
||||
private static final CharArraySet DEFAULT_SET;
|
||||
static {
|
||||
try {
|
||||
DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
|
@ -119,7 +118,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
|||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private final Set<?> exclusionSet;
|
||||
private final CharArraySet exclusionSet;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
|
@ -139,7 +138,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public GermanAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -153,7 +152,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stemExclusionSet
|
||||
* a stemming exclusion set
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -27,6 +26,7 @@ import org.apache.lucene.analysis.core.StopFilter;
|
|||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -58,12 +58,12 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns a set of default Greek-stopwords
|
||||
* @return a set of default Greek-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
public static final CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
private static final Set<?> DEFAULT_SET;
|
||||
private static final CharArraySet DEFAULT_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -95,7 +95,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
|
|||
* See <a href="#version">above</a>
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.en;
|
|||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -37,13 +36,13 @@ import org.apache.lucene.util.Version;
|
|||
* {@link Analyzer} for English.
|
||||
*/
|
||||
public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -52,7 +51,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -68,7 +67,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -81,7 +80,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.es;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.SpanishStemmer;
|
|||
* </ul>
|
||||
*/
|
||||
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Spanish stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";
|
||||
|
@ -57,7 +56,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -66,7 +65,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -93,7 +92,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -106,7 +105,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.eu;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.BasqueStemmer;
|
|||
* {@link Analyzer} for Basque.
|
||||
*/
|
||||
public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Basque stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
@ -48,7 +47,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -57,7 +56,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -84,7 +83,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public BasqueAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -97,7 +96,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public BasqueAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fa;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
|
@ -30,6 +29,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
|||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -63,7 +63,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -72,7 +72,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -101,7 +101,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
public PersianAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fi;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.FinnishStemmer;
|
|||
* {@link Analyzer} for Finnish.
|
||||
*/
|
||||
public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Finnish stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";
|
||||
|
@ -50,7 +49,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -59,7 +58,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -86,7 +85,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -99,7 +98,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,11 +19,9 @@ package org.apache.lucene.analysis.fr;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -56,7 +54,7 @@ public final class ElisionFilter extends TokenFilter {
|
|||
* @param input the source {@link TokenStream}
|
||||
* @param articles a set of stopword articles
|
||||
*/
|
||||
public ElisionFilter(Version matchVersion, TokenStream input, Set<?> articles) {
|
||||
public ElisionFilter(Version matchVersion, TokenStream input, CharArraySet articles) {
|
||||
super(input);
|
||||
this.articles = CharArraySet.unmodifiableSet(
|
||||
new CharArraySet(matchVersion, articles, true));
|
||||
|
|
|
@ -36,7 +36,6 @@ import org.apache.lucene.util.Version;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for French language.
|
||||
|
@ -101,23 +100,23 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
|||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private final Set<?> excltable;
|
||||
private final CharArraySet excltable;
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
/** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */
|
||||
@Deprecated
|
||||
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
|
||||
static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
|
||||
false));
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
|
@ -147,7 +146,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -161,8 +160,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stemExclutionSet
|
||||
* a stemming exclusion set
|
||||
*/
|
||||
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
|
||||
Set<?> stemExclutionSet) {
|
||||
public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords,
|
||||
CharArraySet stemExclutionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
|
||||
.copy(matchVersion, stemExclutionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.gl;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -39,7 +38,7 @@ import org.apache.lucene.util.Version;
|
|||
* {@link Analyzer} for Galician.
|
||||
*/
|
||||
public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Galician stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
@ -48,7 +47,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -57,7 +56,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -84,7 +83,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -97,7 +96,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hi;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
@ -44,7 +43,7 @@ import org.apache.lucene.util.Version;
|
|||
* </ul>
|
||||
*/
|
||||
public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* File containing default Hindi stopwords.
|
||||
|
@ -59,7 +58,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -68,7 +67,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -88,7 +87,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a stemming exclusion set
|
||||
*/
|
||||
public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public HindiAnalyzer(Version version, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(version, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(
|
||||
CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||
|
@ -100,7 +99,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param version lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public HindiAnalyzer(Version version, Set<?> stopwords) {
|
||||
public HindiAnalyzer(Version version, CharArraySet stopwords) {
|
||||
this(version, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hu;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.HungarianStemmer;
|
|||
* {@link Analyzer} for Hungarian.
|
||||
*/
|
||||
public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Hungarian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";
|
||||
|
@ -50,7 +49,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -59,7 +58,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -86,7 +85,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -99,7 +98,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hy;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.ArmenianStemmer;
|
|||
* {@link Analyzer} for Armenian.
|
||||
*/
|
||||
public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Armenian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
@ -48,7 +47,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -57,7 +56,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -84,7 +83,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public ArmenianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -97,7 +96,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public ArmenianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.id;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
|
@ -43,7 +42,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -52,7 +51,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -65,7 +64,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
}
|
||||
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
|
@ -82,7 +81,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -98,7 +97,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stemExclusionSet
|
||||
* a set of terms not to be stemmed
|
||||
*/
|
||||
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
|
||||
public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.it;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -52,7 +51,7 @@ import org.tartarus.snowball.ext.ItalianStemmer;
|
|||
* </ul>
|
||||
*/
|
||||
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Italian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
|
||||
|
@ -68,7 +67,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -77,7 +76,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -104,7 +103,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -117,7 +116,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.lv;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -27,7 +26,6 @@ import org.apache.lucene.analysis.core.StopFilter;
|
|||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
@ -40,7 +38,7 @@ import org.apache.lucene.util.Version;
|
|||
* {@link Analyzer} for Latvian.
|
||||
*/
|
||||
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Latvian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
@ -49,7 +47,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -58,7 +56,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -85,7 +83,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -98,7 +96,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -18,14 +18,12 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Marks terms as keywords via the {@link KeywordAttribute}. Each token
|
||||
|
@ -50,27 +48,11 @@ public final class KeywordMarkerFilter extends TokenFilter {
|
|||
* @param keywordSet
|
||||
* the keywords set to lookup the current termbuffer
|
||||
*/
|
||||
public KeywordMarkerFilter(final TokenStream in,
|
||||
final CharArraySet keywordSet) {
|
||||
public KeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) {
|
||||
super(in);
|
||||
this.keywordSet = keywordSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new KeywordMarkerFilter, that marks the current token as a
|
||||
* keyword if the tokens term buffer is contained in the given set via the
|
||||
* {@link KeywordAttribute}.
|
||||
*
|
||||
* @param in
|
||||
* TokenStream to filter
|
||||
* @param keywordSet
|
||||
* the keywords set to lookup the current termbuffer
|
||||
*/
|
||||
public KeywordMarkerFilter(final TokenStream in, final Set<?> keywordSet) {
|
||||
this(in, keywordSet instanceof CharArraySet ? (CharArraySet) keywordSet
|
||||
: CharArraySet.copy(Version.LUCENE_31, keywordSet));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.io.Reader;
|
|||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -139,7 +138,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
|
||||
private final Pattern pattern;
|
||||
private final boolean toLowerCase;
|
||||
private final Set<?> stopWords;
|
||||
private final CharArraySet stopWords;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
|
@ -162,7 +161,7 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
* or <a href="http://www.unine.ch/info/clef/">other stop words
|
||||
* lists </a>.
|
||||
*/
|
||||
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set<?> stopWords) {
|
||||
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, CharArraySet stopWords) {
|
||||
if (pattern == null)
|
||||
throw new IllegalArgumentException("pattern must not be null");
|
||||
|
||||
|
@ -404,12 +403,12 @@ public final class PatternAnalyzer extends Analyzer {
|
|||
private int pos;
|
||||
private final boolean isLetter;
|
||||
private final boolean toLowerCase;
|
||||
private final Set<?> stopWords;
|
||||
private final CharArraySet stopWords;
|
||||
private static final Locale locale = Locale.getDefault();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
|
||||
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, CharArraySet stopWords) {
|
||||
super(input);
|
||||
this.str = str;
|
||||
this.isLetter = isLetter;
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -46,10 +45,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
|
|||
* </p>
|
||||
*/
|
||||
public StemmerOverrideFilter(Version matchVersion, TokenStream input,
|
||||
Map<?,String> dictionary) {
|
||||
CharArrayMap<String> dictionary) {
|
||||
super(input);
|
||||
this.dictionary = dictionary instanceof CharArrayMap ?
|
||||
(CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary);
|
||||
this.dictionary = CharArrayMap.copy(matchVersion, dictionary);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -28,18 +28,14 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
|
|||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Dutch language.
|
||||
|
@ -56,6 +52,9 @@ import java.util.Map;
|
|||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating DutchAnalyzer:
|
||||
* <ul>
|
||||
* <li> As of 3.6, {@link #DutchAnalyzer(Version, CharArraySet)} and
|
||||
* {@link #DutchAnalyzer(Version, CharArraySet, CharArraySet)} also populate
|
||||
* the default entries for the stem override dictionary
|
||||
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
|
||||
* LowerCaseFilter is used prior to StopFilter, and Snowball
|
||||
* stopwords are used by default.
|
||||
|
@ -75,13 +74,13 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
static final CharArrayMap<String> DEFAULT_STEM_DICT;
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||
|
@ -91,6 +90,12 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
|
||||
DEFAULT_STEM_DICT = new CharArrayMap<String>(Version.LUCENE_CURRENT, 4, false);
|
||||
DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet
|
||||
DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||
DEFAULT_STEM_DICT.put("ei", "eier");
|
||||
DEFAULT_STEM_DICT.put("kind", "kinder");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -98,14 +103,14 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
private final CharArraySet stoptable;
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Set<?> excltable = Collections.emptySet();
|
||||
private CharArraySet excltable = CharArraySet.EMPTY_SET;
|
||||
|
||||
private final Map<String, String> stemdict = new HashMap<String, String>();
|
||||
private final CharArrayMap<String> stemdict;
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
|
@ -114,21 +119,33 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
*
|
||||
*/
|
||||
public DutchAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
stemdict.put("fiets", "fiets"); //otherwise fiet
|
||||
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||
stemdict.put("ei", "eier");
|
||||
stemdict.put("kind", "kinder");
|
||||
// historically, only this ctor populated the stem dict!!!!!
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
// historically, this ctor never populated the stem dict!!!!!
|
||||
// so we populate it only for >= 3.6
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
|
||||
matchVersion.onOrAfter(Version.LUCENE_36)
|
||||
? DefaultSetHolder.DEFAULT_STEM_DICT
|
||||
: CharArrayMap.<String>emptyMap());
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
||||
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
|
||||
// historically, this ctor never populated the stem dict!!!!!
|
||||
// so we populate it only for >= 3.6
|
||||
this(matchVersion, stopwords, stemExclusionTable,
|
||||
matchVersion.onOrAfter(Version.LUCENE_36)
|
||||
? DefaultSetHolder.DEFAULT_STEM_DICT
|
||||
: CharArrayMap.<String>emptyMap());
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
|
||||
this.matchVersion = matchVersion;
|
||||
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
||||
this.stemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.no;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.NorwegianStemmer;
|
|||
* {@link Analyzer} for Norwegian.
|
||||
*/
|
||||
public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Norwegian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
|
||||
|
@ -50,7 +49,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -59,7 +58,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -86,7 +85,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -99,7 +98,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pt;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.PortugueseStemmer;
|
|||
* </ul>
|
||||
*/
|
||||
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Portuguese stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
|
||||
|
@ -57,7 +56,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -66,7 +65,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -93,7 +92,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -106,7 +105,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.util.*;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.AnalyzerWrapper;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -179,7 +180,8 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
|
|||
if (stopWords == null) {
|
||||
return components;
|
||||
}
|
||||
StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(), stopWords);
|
||||
StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(),
|
||||
new CharArraySet(matchVersion, stopWords, false));
|
||||
return new TokenStreamComponents(components.getTokenizer(), stopFilter);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ro;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.RomanianStemmer;
|
|||
* {@link Analyzer} for Romanian.
|
||||
*/
|
||||
public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Romanian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
@ -53,7 +52,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -62,7 +61,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -89,7 +88,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -102,7 +101,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ru;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
|
@ -78,10 +77,10 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
|||
private static class DefaultSetHolder {
|
||||
/** @deprecated (3.1) remove this for Lucene 5.0 */
|
||||
@Deprecated
|
||||
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
|
||||
static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
||||
Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -95,14 +94,14 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
|||
}
|
||||
}
|
||||
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
*
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet() {
|
||||
public static CharArraySet getDefaultStopSet() {
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -120,7 +119,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
|||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
public RussianAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -133,7 +132,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
|||
* a stopword set
|
||||
* @param stemExclusionSet a set of words not to be stemmed
|
||||
*/
|
||||
public RussianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
|
||||
public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||
}
|
||||
|
|
|
@ -27,7 +27,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
|||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
||||
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
|
||||
|
@ -48,7 +47,7 @@ import java.util.Set;
|
|||
@Deprecated
|
||||
public final class SnowballAnalyzer extends Analyzer {
|
||||
private String name;
|
||||
private Set<?> stopSet;
|
||||
private CharArraySet stopSet;
|
||||
private final Version matchVersion;
|
||||
|
||||
/** Builds the named analyzer with no stop words. */
|
||||
|
@ -58,7 +57,7 @@ public final class SnowballAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
/** Builds the named analyzer with the given stop words. */
|
||||
public SnowballAnalyzer(Version matchVersion, String name, Set<?> stopWords) {
|
||||
public SnowballAnalyzer(Version matchVersion, String name, CharArraySet stopWords) {
|
||||
this(matchVersion, name);
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
||||
stopWords));
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*;
|
|||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -28,7 +29,6 @@ import org.apache.lucene.util.Version;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
|
||||
|
@ -60,13 +60,13 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
|
|||
|
||||
/** An unmodifiable set containing some common English words that are usually not
|
||||
useful for searching. */
|
||||
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
|
||||
/** Builds an analyzer with the given stop words.
|
||||
* @param matchVersion Lucene version to match See {@link
|
||||
* <a href="#version">above</a>}
|
||||
* @param stopWords stop words */
|
||||
public ClassicAnalyzer(Version matchVersion, Set<?> stopWords) {
|
||||
public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||
super(matchVersion, stopWords);
|
||||
}
|
||||
|
||||
|
@ -79,15 +79,6 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
|
|||
this(matchVersion, STOP_WORDS_SET);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given file.
|
||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
||||
* @param matchVersion Lucene version to match See {@link
|
||||
* <a href="#version">above</a>}
|
||||
* @param stopwords File to read stop words from */
|
||||
public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
||||
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given reader.
|
||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
||||
* @param matchVersion Lucene version to match See {@link
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*;
|
|||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -28,7 +29,6 @@ import org.apache.lucene.util.Version;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
||||
|
@ -61,13 +61,13 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
|
|||
|
||||
/** An unmodifiable set containing some common English words that are usually not
|
||||
useful for searching. */
|
||||
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
|
||||
/** Builds an analyzer with the given stop words.
|
||||
* @param matchVersion Lucene version to match. See {@link
|
||||
* <a href="#version">above</a>}
|
||||
* @param stopWords stop words */
|
||||
public StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
|
||||
public StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||
super(matchVersion, stopWords);
|
||||
}
|
||||
|
||||
|
@ -80,15 +80,6 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
|
|||
this(matchVersion, STOP_WORDS_SET);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given file.
|
||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
||||
* @param matchVersion Lucene version to match. See {@link
|
||||
* <a href="#version">above</a>}
|
||||
* @param stopwords File to read stop words from */
|
||||
public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
||||
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the stop words from the given reader.
|
||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
||||
* @param matchVersion Lucene version to match. See {@link
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.sv;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.SwedishStemmer;
|
|||
* {@link Analyzer} for Swedish.
|
||||
*/
|
||||
public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Swedish stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
|
||||
|
@ -50,7 +49,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -59,7 +58,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -86,7 +85,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -99,7 +98,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.th;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
|
|||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -55,7 +55,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -64,7 +64,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -93,7 +93,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public ThaiAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.tr;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
|
@ -38,7 +37,7 @@ import org.tartarus.snowball.ext.TurkishStemmer;
|
|||
* {@link Analyzer} for Turkish.
|
||||
*/
|
||||
public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/** File containing default Turkish stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
@ -52,7 +51,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -61,7 +60,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
|
@ -88,7 +87,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -101,7 +100,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
matchVersion, stemExclusionSet));
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.util;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -46,7 +45,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
|
|||
* @return the analyzer's stopword set or an empty set if the analyzer has no
|
||||
* stopwords
|
||||
*/
|
||||
public Set<?> getStopwordSet() {
|
||||
public CharArraySet getStopwordSet() {
|
||||
return stopwords;
|
||||
}
|
||||
|
||||
|
@ -58,7 +57,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
|
|||
* @param stopwords
|
||||
* the analyzer's stopword set
|
||||
*/
|
||||
protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
|
||||
protected StopwordAnalyzerBase(final Version version, final CharArraySet stopwords) {
|
||||
matchVersion = version;
|
||||
// analyzers should use char array set for stopwords!
|
||||
this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
|
||||
|
|
|
@ -18,9 +18,6 @@ package org.apache.lucene.analysis.ar;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
@ -79,16 +76,14 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
|
|||
* Test that custom stopwords work, and are not case-sensitive.
|
||||
*/
|
||||
public void testCustomStopwords() throws Exception {
|
||||
Set<String> set = new HashSet<String>();
|
||||
Collections.addAll(set, "the", "and", "a");
|
||||
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false);
|
||||
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set);
|
||||
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
|
||||
"brown", "fox" });
|
||||
}
|
||||
|
||||
public void testWithStemExclusionSet() throws IOException {
|
||||
Set<String> set = new HashSet<String>();
|
||||
set.add("ساهدهات");
|
||||
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("ساهدهات"), false);
|
||||
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
|
||||
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
|
||||
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.bg;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -43,8 +42,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testCustomStopwords() throws IOException {
|
||||
Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, Collections
|
||||
.emptySet());
|
||||
Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
|
||||
assertAnalyzesTo(a, "Как се казваш?",
|
||||
new String[] {"как", "се", "казваш"});
|
||||
}
|
||||
|
|
|
@ -136,7 +136,8 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testStemExclusionTable() throws Exception {
|
||||
BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("quintessência"));
|
||||
BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT,
|
||||
CharArraySet.EMPTY_SET, new CharArraySet(TEST_VERSION_CURRENT, asSet("quintessência"), false));
|
||||
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
|
||||
}
|
||||
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ca;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -50,8 +49,7 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("llengües");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("llengües"), false);
|
||||
Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT,
|
||||
CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "llengües", "llengües");
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
@ -58,10 +59,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testStopList() throws IOException {
|
||||
Set<Object> stopWordsSet = new HashSet<Object>();
|
||||
stopWordsSet.add("good");
|
||||
stopWordsSet.add("test");
|
||||
stopWordsSet.add("analyzer");
|
||||
CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
|
||||
StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
|
||||
StringReader reader = new StringReader("This is a good test of the english stop analyzer");
|
||||
TokenStream stream = newStop.tokenStream("test", reader);
|
||||
|
@ -75,10 +73,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testStopListPositions() throws IOException {
|
||||
Set<Object> stopWordsSet = new HashSet<Object>();
|
||||
stopWordsSet.add("good");
|
||||
stopWordsSet.add("test");
|
||||
stopWordsSet.add("analyzer");
|
||||
CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
|
||||
StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
|
||||
StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions");
|
||||
int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1};
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.English;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -36,22 +37,15 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testExactCase() throws IOException {
|
||||
StringReader reader = new StringReader("Now is The Time");
|
||||
Set<String> stopWords = asSet("is", "the", "Time");
|
||||
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, false);
|
||||
CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, asSet("is", "the", "Time"), false);
|
||||
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords);
|
||||
assertTokenStreamContents(stream, new String[] { "Now", "The" });
|
||||
}
|
||||
|
||||
public void testIgnoreCase() throws IOException {
|
||||
StringReader reader = new StringReader("Now is The Time");
|
||||
Set<String> stopWords = asSet( "is", "the", "Time" );
|
||||
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, true);
|
||||
assertTokenStreamContents(stream, new String[] { "Now" });
|
||||
}
|
||||
|
||||
public void testStopFilt() throws IOException {
|
||||
StringReader reader = new StringReader("Now is The Time");
|
||||
String[] stopWords = new String[] { "is", "the", "Time" };
|
||||
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
||||
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
||||
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
||||
assertTokenStreamContents(stream, new String[] { "Now", "The" });
|
||||
}
|
||||
|
@ -70,7 +64,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
|||
log(sb.toString());
|
||||
String stopWords[] = a.toArray(new String[0]);
|
||||
for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
|
||||
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
||||
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
||||
// with increments
|
||||
StringReader reader = new StringReader(sb.toString());
|
||||
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
||||
|
@ -93,8 +87,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
|||
for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
|
||||
String stopWords1[] = a1.toArray(new String[0]);
|
||||
for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
|
||||
Set<Object> stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
|
||||
Set<Object> stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
|
||||
CharArraySet stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
|
||||
CharArraySet stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
|
||||
reader = new StringReader(sb.toString());
|
||||
StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
|
||||
stpf0.setEnablePositionIncrements(true);
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.da;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("undersøgelse");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("undersøgelse"), false);
|
||||
Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT,
|
||||
DanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "undersøgelse", "undersøgelse");
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.de;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -46,7 +45,8 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testStemExclusionTable() throws Exception {
|
||||
GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("tischen"));
|
||||
GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
|
||||
new CharArraySet(TEST_VERSION_CURRENT, asSet("tischen"), false));
|
||||
checkOneTermReuse(a, "tischen", "tischen");
|
||||
}
|
||||
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.en;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -45,8 +44,7 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("books");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("books"), false);
|
||||
Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT,
|
||||
EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "books", "books");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.es;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("chicano");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chicano"), false);
|
||||
Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT,
|
||||
SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "chicana", "chican");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.eu;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("zaldiak");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("zaldiak"), false);
|
||||
Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT,
|
||||
BasqueAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "zaldiak", "zaldiak");
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.fa;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
/**
|
||||
* Test the Persian Analyzer
|
||||
|
@ -215,7 +216,8 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
|
|||
* Test that custom stopwords work, and are not case-sensitive.
|
||||
*/
|
||||
public void testCustomStopwords() throws Exception {
|
||||
PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, asSet("the", "and", "a"));
|
||||
PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT,
|
||||
new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false));
|
||||
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
|
||||
"brown", "fox" });
|
||||
}
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.fi;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("edeltäjistään");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
|
||||
Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT,
|
||||
FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
|
||||
|
|
|
@ -20,15 +20,14 @@ package org.apache.lucene.analysis.fr;
|
|||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -38,9 +37,7 @@ public class TestElision extends BaseTokenStreamTestCase {
|
|||
public void testElision() throws Exception {
|
||||
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
|
||||
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
|
||||
Set<String> articles = new HashSet<String>();
|
||||
articles.add("l");
|
||||
articles.add("M");
|
||||
CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
|
||||
TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles);
|
||||
List<String> tas = filter(filter);
|
||||
assertEquals("embrouille", tas.get(4));
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.gl;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("correspondente");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("correspondente"), false);
|
||||
Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT,
|
||||
GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "correspondente", "correspondente");
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
package org.apache.lucene.analysis.hi;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -41,8 +39,7 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testExclusionSet() throws Exception {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("हिंदी");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("हिंदी"), false);
|
||||
Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT,
|
||||
HindiAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "हिंदी", "हिंदी");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hu;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("babakocsi");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
|
||||
Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT,
|
||||
HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "babakocsi", "babakocsi");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hy;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("արծիվներ");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("արծիվներ"), false);
|
||||
Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT,
|
||||
ArmenianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "արծիվներ", "արծիվներ");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.id;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("peledakan");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("peledakan"), false);
|
||||
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
|
||||
IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "peledakan", "peledakan");
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Set;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
@ -44,8 +45,7 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("abbandonata");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("abbandonata"), false);
|
||||
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
|
||||
ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "abbandonata", "abbandonata");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.lv;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("tirgiem");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("tirgiem"), false);
|
||||
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
|
||||
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "tirgiem", "tirgiem");
|
||||
|
|
|
@ -2,10 +2,7 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
|
@ -47,12 +44,11 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
|
|||
assertTokenStreamContents(new LowerCaseFilterMock(
|
||||
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
||||
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
|
||||
Set<String> jdkSet = new HashSet<String>();
|
||||
jdkSet.add("LuceneFox");
|
||||
CharArraySet mixedCaseSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("LuceneFox"), false);
|
||||
assertTokenStreamContents(new LowerCaseFilterMock(
|
||||
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
||||
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), jdkSet)), output);
|
||||
Set<?> set2 = set;
|
||||
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), mixedCaseSet)), output);
|
||||
CharArraySet set2 = set;
|
||||
assertTokenStreamContents(new LowerCaseFilterMock(
|
||||
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
||||
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
|
||||
|
@ -64,8 +60,8 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
|
|||
new KeywordMarkerFilter(
|
||||
new KeywordMarkerFilter(
|
||||
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
|
||||
new HashSet<String>(Arrays.asList("Birds", "Houses"))),
|
||||
new HashSet<String>(Arrays.asList("Dogs", "Trees"))));
|
||||
new CharArraySet(TEST_VERSION_CURRENT, asSet("Birds", "Houses"), false)),
|
||||
new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false)));
|
||||
|
||||
assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
|
||||
}
|
||||
|
|
|
@ -2,12 +2,11 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.en.PorterStemFilter;
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
|
@ -33,7 +32,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
|
|||
// let's make booked stem to books
|
||||
// the override filter will convert "booked" to "books",
|
||||
// but also mark it with KeywordAttribute so Porter will not change it.
|
||||
Map<String,String> dictionary = new HashMap<String,String>();
|
||||
CharArrayMap<String> dictionary = new CharArrayMap<String>(TEST_VERSION_CURRENT, 1, false);
|
||||
dictionary.put("booked", "books");
|
||||
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
|
||||
TokenStream stream = new PorterStemFilter(
|
||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.nl;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -150,6 +149,26 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* check that the default stem overrides are used
|
||||
* even if you use a non-default ctor.
|
||||
*/
|
||||
public void testStemOverrides() throws IOException {
|
||||
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
|
||||
checkOneTerm(a, "fiets", "fiets");
|
||||
}
|
||||
|
||||
/**
|
||||
* prior to 3.6, this confusingly did not happen if
|
||||
* you specified your own stoplist.
|
||||
* @deprecated (3.6) Remove this test in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public void testBuggyStemOverrides() throws IOException {
|
||||
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_35, CharArraySet.EMPTY_SET);
|
||||
checkOneTerm(a, "fiets", "fiet");
|
||||
}
|
||||
|
||||
/**
|
||||
* Prior to 3.1, this analyzer had no lowercase filter.
|
||||
* Stopwords were case sensitive. Preserve this for back compat.
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.no;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("havnedistriktene");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("havnedistriktene"), false);
|
||||
Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT,
|
||||
NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pt;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("quilométricas");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
|
||||
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
|
||||
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "quilométricas", "quilométricas");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ro;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("absenţa");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("absenţa"), false);
|
||||
Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT,
|
||||
RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "absenţa", "absenţa");
|
||||
|
|
|
@ -18,12 +18,10 @@ package org.apache.lucene.analysis.sv;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -44,8 +42,7 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("jaktkarlarne");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlarne"), false);
|
||||
Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT,
|
||||
SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.tr;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("ağacı");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("ağacı"), false);
|
||||
Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT,
|
||||
TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "ağacı", "ağacı");
|
||||
|
|
|
@ -39,13 +39,13 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
|||
this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
|
||||
}
|
||||
|
||||
public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) {
|
||||
public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
|
||||
super(matchVersion, stopwords);
|
||||
this.segmenter = segmenter;
|
||||
this.stoptags = stoptags;
|
||||
}
|
||||
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -58,7 +58,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
|||
* outer class accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
static final Set<String> DEFAULT_STOP_TAGS;
|
||||
|
||||
static {
|
||||
|
|
|
@ -18,10 +18,7 @@
|
|||
package org.apache.lucene.analysis.cn.smart;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -58,7 +55,7 @@ import org.apache.lucene.util.Version;
|
|||
*/
|
||||
public final class SmartChineseAnalyzer extends Analyzer {
|
||||
|
||||
private final Set<?> stopWords;
|
||||
private final CharArraySet stopWords;
|
||||
|
||||
private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
|
@ -120,7 +117,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
|||
*/
|
||||
public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) {
|
||||
stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
|
||||
: Collections.EMPTY_SET;
|
||||
: CharArraySet.EMPTY_SET;
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
|
@ -133,8 +130,8 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
|||
* </p>
|
||||
* @param stopWords {@link CharArraySet} of stopwords to use.
|
||||
*/
|
||||
public SmartChineseAnalyzer(Version matchVersion, Set stopWords) {
|
||||
this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords;
|
||||
public SmartChineseAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||
this.stopWords = stopWords==null?CharArraySet.EMPTY_SET:stopWords;
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
|
@ -147,7 +144,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
|||
// The porter stemming is too strict, this is not a bug, this is a feature:)
|
||||
result = new PorterStemFilter(result);
|
||||
if (!stopWords.isEmpty()) {
|
||||
result = new StopFilter(matchVersion, result, stopWords, false);
|
||||
result = new StopFilter(matchVersion, result, stopWords);
|
||||
}
|
||||
return new TokenStreamComponents(tokenizer, result);
|
||||
}
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pl;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
|
@ -42,7 +41,7 @@ import org.egothor.stemmer.Trie;
|
|||
* {@link Analyzer} for Polish.
|
||||
*/
|
||||
public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||
private final Set<?> stemExclusionSet;
|
||||
private final CharArraySet stemExclusionSet;
|
||||
private final Trie stemTable;
|
||||
|
||||
/** File containing default Polish stopwords. */
|
||||
|
@ -55,7 +54,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
|||
* Returns an unmodifiable instance of the default stop words set.
|
||||
* @return default stop words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultsHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
|
@ -64,7 +63,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
|||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultsHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
static final Trie DEFAULT_TABLE;
|
||||
|
||||
static {
|
||||
|
@ -100,7 +99,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param matchVersion lucene compatibility version
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public PolishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
public PolishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
|
@ -113,7 +112,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
|||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a set of terms not to be stemmed
|
||||
*/
|
||||
public PolishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
public PolishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(matchVersion, stopwords);
|
||||
this.stemTable = DefaultsHolder.DEFAULT_TABLE;
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pl;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the
|
||||
|
@ -43,8 +42,7 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** test use of exclusion set */
|
||||
public void testExclude() throws IOException {
|
||||
Set<String> exclusionSet = new HashSet<String>();
|
||||
exclusionSet.add("studenta");
|
||||
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("studenta"), false);;
|
||||
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT,
|
||||
PolishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTermReuse(a, "studenta", "studenta");
|
||||
|
|
|
@ -93,15 +93,14 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements
|
|||
.getTokenFilterFactories();
|
||||
for (TokenFilterFactory factory : filterFactories) {
|
||||
if (factory instanceof StopFilterFactory) {
|
||||
// StopFilterFactory holds the stop words in a CharArraySet, but
|
||||
// the getStopWords() method returns a Set<?>, so we need to cast.
|
||||
// StopFilterFactory holds the stop words in a CharArraySet
|
||||
solrStopWords.put(fieldName,
|
||||
(CharArraySet) ((StopFilterFactory) factory).getStopWords());
|
||||
((StopFilterFactory) factory).getStopWords());
|
||||
}
|
||||
|
||||
if (factory instanceof CommonGramsFilterFactory) {
|
||||
solrStopWords.put(fieldName,
|
||||
(CharArraySet) ((CommonGramsFilterFactory) factory)
|
||||
((CommonGramsFilterFactory) factory)
|
||||
.getCommonWords());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
|
@ -71,12 +70,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
|
|||
return ignoreCase;
|
||||
}
|
||||
|
||||
public Set<?> getCommonWords() {
|
||||
public CharArraySet getCommonWords() {
|
||||
return commonWords;
|
||||
}
|
||||
|
||||
public CommonGramsFilter create(TokenStream input) {
|
||||
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase);
|
||||
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
|
||||
return commonGrams;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.solr.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
|
@ -80,7 +79,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
|
|||
return ignoreCase;
|
||||
}
|
||||
|
||||
public Set<?> getCommonWords() {
|
||||
public CharArraySet getCommonWords() {
|
||||
return commonWords;
|
||||
}
|
||||
|
||||
|
@ -88,8 +87,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
|
|||
* Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
|
||||
*/
|
||||
public CommonGramsQueryFilter create(TokenStream input) {
|
||||
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords,
|
||||
ignoreCase);
|
||||
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
|
||||
CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
|
||||
commonGrams);
|
||||
return commonGramsQuery;
|
||||
|
|
|
@ -25,7 +25,6 @@ import org.apache.lucene.analysis.core.StopFilter;
|
|||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
|
@ -81,13 +80,13 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
|
|||
return ignoreCase;
|
||||
}
|
||||
|
||||
public Set<?> getStopWords() {
|
||||
public CharArraySet getStopWords() {
|
||||
return stopWords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase);
|
||||
StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords);
|
||||
stopFilter.setEnablePositionIncrements(enablePositionIncrements);
|
||||
return stopFilter;
|
||||
}
|
||||
|
|
|
@ -20,11 +20,11 @@ package org.apache.solr.analysis;
|
|||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
@ -44,7 +44,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
|
|||
args.put("ignoreCase", "true");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set<?> words = factory.getCommonWords();
|
||||
CharArraySet words = factory.getCommonWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 2,
|
||||
words.size() == 2);
|
||||
|
@ -89,7 +89,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
|
|||
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set<?> words = factory.getCommonWords();
|
||||
CharArraySet words = factory.getCommonWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue(words.contains("the"));
|
||||
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
|
||||
|
|
|
@ -19,11 +19,11 @@ package org.apache.solr.analysis;
|
|||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
@ -43,7 +43,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
|
|||
args.put("ignoreCase", "true");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set<?> words = factory.getCommonWords();
|
||||
CharArraySet words = factory.getCommonWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 2,
|
||||
words.size() == 2);
|
||||
|
@ -88,7 +88,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
|
|||
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set<?> words = factory.getCommonWords();
|
||||
CharArraySet words = factory.getCommonWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue(words.contains("the"));
|
||||
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
|
||||
|
|
|
@ -16,10 +16,10 @@ package org.apache.solr.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
@ -38,7 +38,7 @@ public class TestKeepFilterFactory extends BaseTokenTestCase{
|
|||
args.put("ignoreCase", "true");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set<?> words = factory.getWords();
|
||||
CharArraySet words = factory.getWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
||||
|
||||
|
|
|
@ -17,10 +17,10 @@ package org.apache.solr.analysis;
|
|||
*/
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
@ -39,7 +39,7 @@ public class TestStopFilterFactory extends BaseTokenTestCase {
|
|||
args.put("ignoreCase", "true");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set<?> words = factory.getStopWords();
|
||||
CharArraySet words = factory.getStopWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
||||
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
|
||||
|
|
Loading…
Reference in New Issue