mirror of https://github.com/apache/lucene.git
LUCENE-3765: Trappy behavior with StopFilter/ignoreCase
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242497 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ef65f76824
commit
72ae3171be
|
@ -779,6 +779,11 @@ API Changes
|
||||||
to be merged. To mimic the old behaviour, just use IndexReader.directory()
|
to be merged. To mimic the old behaviour, just use IndexReader.directory()
|
||||||
for choosing the provider by Directory. (Uwe Schindler)
|
for choosing the provider by Directory. (Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-3765: Deprecated StopFilter ctor that took ignoreCase, because
|
||||||
|
in some cases (if the set is a CharArraySet), the argument is ignored.
|
||||||
|
Deprecated StandardAnalyzer and ClassicAnalyzer ctors that take File,
|
||||||
|
please use the Reader ctor instead. (Robert Muir)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
* LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
|
* LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
|
||||||
|
|
|
@ -218,6 +218,10 @@ Bug Fixes
|
||||||
* LUCENE-3719: FVH: slow performance on very large queries.
|
* LUCENE-3719: FVH: slow performance on very large queries.
|
||||||
(Igor Motov via Koji Sekiguchi)
|
(Igor Motov via Koji Sekiguchi)
|
||||||
|
|
||||||
|
* LUCENE-3765: As of Version.LUCENE_36, DutchAnalyzer's two ctors
|
||||||
|
that take stopwords and stem exclusion tables also initialize
|
||||||
|
the default stem overrides (e.g. kind/kinder, fiets). (Robert Muir)
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
||||||
* LUCENE-3599: Javadocs for DistanceUtils.haversine() were incorrectly
|
* LUCENE-3599: Javadocs for DistanceUtils.haversine() were incorrectly
|
||||||
|
|
|
@ -29,6 +29,11 @@ API Changes
|
||||||
since they prevent reuse. Both Analyzers should be configured at instantiation.
|
since they prevent reuse. Both Analyzers should be configured at instantiation.
|
||||||
(Chris Male)
|
(Chris Male)
|
||||||
|
|
||||||
|
* LUCENE-3765: Stopset ctors that previously took Set<?> or Map<?,String> now take
|
||||||
|
CharArraySet and CharArrayMap respectively. Previously the behavior was confusing,
|
||||||
|
and sometimes different depending on the type of set, and ultimately a CharArraySet
|
||||||
|
or CharArrayMap was always used anyway. (Robert Muir)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
|
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -63,7 +62,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,7 +71,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.
|
* accesses the static final set the first time.
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -85,7 +84,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
@ -102,7 +101,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
|
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -118,7 +117,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stemExclusionSet
|
* @param stemExclusionSet
|
||||||
* a set of terms not to be stemmed
|
* a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
|
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -56,7 +56,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
*
|
*
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet() {
|
public static CharArraySet getDefaultStopSet() {
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -65,7 +65,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* class accesses the static final set the first time.
|
* class accesses the static final set the first time.
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -78,7 +78,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default stop words:
|
* Builds an analyzer with the default stop words:
|
||||||
|
@ -91,7 +91,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the given stop words.
|
* Builds an analyzer with the given stop words.
|
||||||
*/
|
*/
|
||||||
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -100,7 +100,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter}
|
* If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter}
|
||||||
* before {@link BulgarianStemFilter}.
|
* before {@link BulgarianStemFilter}.
|
||||||
*/
|
*/
|
||||||
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet)); }
|
matchVersion, stemExclusionSet)); }
|
||||||
|
|
|
@ -19,8 +19,6 @@ package org.apache.lucene.analysis.br;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -56,12 +54,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -79,7 +77,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
||||||
/**
|
/**
|
||||||
* Contains words that should be indexed but not stemmed.
|
* Contains words that should be indexed but not stemmed.
|
||||||
*/
|
*/
|
||||||
private Set<?> excltable = Collections.emptySet();
|
private CharArraySet excltable = CharArraySet.EMPTY_SET;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
|
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
|
||||||
|
@ -96,7 +94,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -108,8 +106,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords,
|
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords,
|
||||||
Set<?> stemExclusionSet) {
|
CharArraySet stemExclusionSet) {
|
||||||
this(matchVersion, stopwords);
|
this(matchVersion, stopwords);
|
||||||
excltable = CharArraySet.unmodifiableSet(CharArraySet
|
excltable = CharArraySet.unmodifiableSet(CharArraySet
|
||||||
.copy(matchVersion, stemExclusionSet));
|
.copy(matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ca;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -49,7 +48,7 @@ import org.tartarus.snowball.ext.CatalanStemmer;
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Catalan stopwords. */
|
/** File containing default Catalan stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
@ -64,7 +63,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -73,7 +72,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.
|
* accesses the static final set the first time.
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -100,7 +99,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public CatalanAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -113,7 +112,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public CatalanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cjk;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -27,6 +26,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -49,12 +49,12 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -82,7 +82,7 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
|
public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,6 @@
|
||||||
package org.apache.lucene.analysis.commongrams;
|
package org.apache.lucene.analysis.commongrams;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -69,35 +68,9 @@ public final class CommonGramsFilter extends TokenFilter {
|
||||||
* @param input TokenStream input in filter chain
|
* @param input TokenStream input in filter chain
|
||||||
* @param commonWords The set of common words.
|
* @param commonWords The set of common words.
|
||||||
*/
|
*/
|
||||||
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
|
public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
|
||||||
this(matchVersion, input, commonWords, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Construct a token stream filtering the given input using a Set of common
|
|
||||||
* words to create bigrams, case-sensitive if ignoreCase is false (unless Set
|
|
||||||
* is CharArraySet). If <code>commonWords</code> is an instance of
|
|
||||||
* {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
|
|
||||||
* construct the set) it will be directly used and <code>ignoreCase</code>
|
|
||||||
* will be ignored since <code>CharArraySet</code> directly controls case
|
|
||||||
* sensitivity.
|
|
||||||
* <p/>
|
|
||||||
* If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
|
|
||||||
* new CharArraySet will be constructed and <code>ignoreCase</code> will be
|
|
||||||
* used to specify the case sensitivity of that set.
|
|
||||||
*
|
|
||||||
* @param input TokenStream input in filter chain.
|
|
||||||
* @param commonWords The set of common words.
|
|
||||||
* @param ignoreCase Ignore case when constructing bigrams for common words.
|
|
||||||
*/
|
|
||||||
public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
|
|
||||||
super(input);
|
super(input);
|
||||||
if (commonWords instanceof CharArraySet) {
|
this.commonWords = commonWords;
|
||||||
this.commonWords = (CharArraySet) commonWords;
|
|
||||||
} else {
|
|
||||||
this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase);
|
|
||||||
this.commonWords.addAll(commonWords);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -18,10 +18,7 @@ package org.apache.lucene.analysis.compound;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.Locale;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
@ -43,13 +40,6 @@ import org.apache.lucene.util.Version;
|
||||||
* supplementary characters in strings and char arrays provided as compound word
|
* supplementary characters in strings and char arrays provided as compound word
|
||||||
* dictionaries.
|
* dictionaries.
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
|
|
||||||
* it should be case-insensitive unless it contains only lowercased entries and you
|
|
||||||
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
|
|
||||||
* For optimal performance (as this filter does lots of lookups to the dictionary,
|
|
||||||
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
|
|
||||||
* {@link Set Sets} to the ctors, they will be automatically
|
|
||||||
* transformed to case-insensitive!
|
|
||||||
*/
|
*/
|
||||||
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
|
@ -80,15 +70,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
||||||
|
|
||||||
private AttributeSource.State current;
|
private AttributeSource.State current;
|
||||||
|
|
||||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
|
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
|
||||||
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
|
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
|
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary) {
|
||||||
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
|
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
||||||
super(input);
|
super(input);
|
||||||
|
|
||||||
this.tokens=new LinkedList<CompoundToken>();
|
this.tokens=new LinkedList<CompoundToken>();
|
||||||
|
@ -96,12 +86,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
||||||
this.minSubwordSize=minSubwordSize;
|
this.minSubwordSize=minSubwordSize;
|
||||||
this.maxSubwordSize=maxSubwordSize;
|
this.maxSubwordSize=maxSubwordSize;
|
||||||
this.onlyLongestMatch=onlyLongestMatch;
|
this.onlyLongestMatch=onlyLongestMatch;
|
||||||
|
this.dictionary = dictionary;
|
||||||
if (dictionary==null || dictionary instanceof CharArraySet) {
|
|
||||||
this.dictionary = (CharArraySet) dictionary;
|
|
||||||
} else {
|
|
||||||
this.dictionary = new CharArraySet(matchVersion, dictionary, true);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -38,13 +39,6 @@ import org.apache.lucene.util.Version;
|
||||||
* supplementary characters in strings and char arrays provided as compound word
|
* supplementary characters in strings and char arrays provided as compound word
|
||||||
* dictionaries.
|
* dictionaries.
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
|
|
||||||
* it should be case-insensitive unless it contains only lowercased entries and you
|
|
||||||
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
|
|
||||||
* For optimal performance (as this filter does lots of lookups to the dictionary,
|
|
||||||
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
|
|
||||||
* {@link Set Sets} to the ctors, they will be automatically
|
|
||||||
* transformed to case-insensitive!
|
|
||||||
*/
|
*/
|
||||||
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
|
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
|
||||||
|
|
||||||
|
@ -61,7 +55,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
|
||||||
* @param dictionary
|
* @param dictionary
|
||||||
* the word dictionary to match against.
|
* the word dictionary to match against.
|
||||||
*/
|
*/
|
||||||
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
|
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) {
|
||||||
super(matchVersion, input, dictionary);
|
super(matchVersion, input, dictionary);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -86,7 +80,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
|
||||||
* @param onlyLongestMatch
|
* @param onlyLongestMatch
|
||||||
* Add only the longest matching subword to the stream
|
* Add only the longest matching subword to the stream
|
||||||
*/
|
*/
|
||||||
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
|
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary,
|
||||||
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
||||||
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
|
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,12 +18,12 @@ package org.apache.lucene.analysis.compound;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
|
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
|
||||||
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
|
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
import org.xml.sax.InputSource;
|
import org.xml.sax.InputSource;
|
||||||
|
|
||||||
|
@ -41,13 +41,6 @@ import org.xml.sax.InputSource;
|
||||||
* supplementary characters in strings and char arrays provided as compound word
|
* supplementary characters in strings and char arrays provided as compound word
|
||||||
* dictionaries.
|
* dictionaries.
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
|
|
||||||
* it should be case-insensitive unless it contains only lowercased entries and you
|
|
||||||
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
|
|
||||||
* For optimal performance (as this filter does lots of lookups to the dictionary,
|
|
||||||
* you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
|
|
||||||
* {@link Set Sets} to the ctors, they will be automatically
|
|
||||||
* transformed to case-insensitive!
|
|
||||||
*/
|
*/
|
||||||
public class HyphenationCompoundWordTokenFilter extends
|
public class HyphenationCompoundWordTokenFilter extends
|
||||||
CompoundWordTokenFilterBase {
|
CompoundWordTokenFilterBase {
|
||||||
|
@ -69,7 +62,7 @@ public class HyphenationCompoundWordTokenFilter extends
|
||||||
* the word dictionary to match against.
|
* the word dictionary to match against.
|
||||||
*/
|
*/
|
||||||
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
||||||
HyphenationTree hyphenator, Set<?> dictionary) {
|
HyphenationTree hyphenator, CharArraySet dictionary) {
|
||||||
this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
|
this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
|
||||||
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
|
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
|
||||||
}
|
}
|
||||||
|
@ -98,7 +91,7 @@ public class HyphenationCompoundWordTokenFilter extends
|
||||||
* Add only the longest matching subword to the stream
|
* Add only the longest matching subword to the stream
|
||||||
*/
|
*/
|
||||||
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
||||||
HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
|
HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
|
||||||
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
|
||||||
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
|
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
|
||||||
onlyLongestMatch);
|
onlyLongestMatch);
|
||||||
|
@ -109,14 +102,14 @@ public class HyphenationCompoundWordTokenFilter extends
|
||||||
/**
|
/**
|
||||||
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
|
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
|
||||||
* <p>
|
* <p>
|
||||||
* Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
|
* Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
|
||||||
* HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
|
* HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
|
||||||
* null, minWordSize, minSubwordSize, maxSubwordSize }
|
* null, minWordSize, minSubwordSize, maxSubwordSize }
|
||||||
*/
|
*/
|
||||||
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
|
||||||
HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
|
HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
|
||||||
int maxSubwordSize) {
|
int maxSubwordSize) {
|
||||||
this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
|
this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize,
|
||||||
maxSubwordSize, false);
|
maxSubwordSize, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,6 @@ import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
@ -46,7 +45,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
/** An unmodifiable set containing some common English words that are not usually useful
|
/** An unmodifiable set containing some common English words that are not usually useful
|
||||||
for searching.*/
|
for searching.*/
|
||||||
public static final Set<?> ENGLISH_STOP_WORDS_SET;
|
public static final CharArraySet ENGLISH_STOP_WORDS_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
final List<String> stopWords = Arrays.asList(
|
final List<String> stopWords = Arrays.asList(
|
||||||
|
@ -72,7 +71,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
|
||||||
/** Builds an analyzer with the stop words from the given set.
|
/** Builds an analyzer with the stop words from the given set.
|
||||||
* @param matchVersion See <a href="#version">above</a>
|
* @param matchVersion See <a href="#version">above</a>
|
||||||
* @param stopWords Set of stop words */
|
* @param stopWords Set of stop words */
|
||||||
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
|
public StopAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||||
super(matchVersion, stopWords);
|
super(matchVersion, stopWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -44,34 +43,6 @@ public final class StopFilter extends FilteringTokenFilter {
|
||||||
|
|
||||||
private final CharArraySet stopWords;
|
private final CharArraySet stopWords;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
/**
|
|
||||||
* Construct a token stream filtering the given input. If
|
|
||||||
* <code>stopWords</code> is an instance of {@link CharArraySet} (true if
|
|
||||||
* <code>makeStopSet()</code> was used to construct the set) it will be
|
|
||||||
* directly used and <code>ignoreCase</code> will be ignored since
|
|
||||||
* <code>CharArraySet</code> directly controls case sensitivity.
|
|
||||||
* <p/>
|
|
||||||
* If <code>stopWords</code> is not an instance of {@link CharArraySet}, a new
|
|
||||||
* CharArraySet will be constructed and <code>ignoreCase</code> will be used
|
|
||||||
* to specify the case sensitivity of that set.
|
|
||||||
*
|
|
||||||
* @param matchVersion
|
|
||||||
* Lucene version to enable correct Unicode 4.0 behavior in the stop
|
|
||||||
* set if Version > 3.0. See <a href="#version">above</a> for details.
|
|
||||||
* @param input
|
|
||||||
* Input TokenStream
|
|
||||||
* @param stopWords
|
|
||||||
* A Set of Strings or char[] or any other toString()-able set
|
|
||||||
* representing the stopwords
|
|
||||||
* @param ignoreCase
|
|
||||||
* if true, all words are lower cased first
|
|
||||||
*/
|
|
||||||
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
|
|
||||||
{
|
|
||||||
super(true, input);
|
|
||||||
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructs a filter which removes words from the input TokenStream that are
|
* Constructs a filter which removes words from the input TokenStream that are
|
||||||
|
@ -83,12 +54,12 @@ public final class StopFilter extends FilteringTokenFilter {
|
||||||
* @param in
|
* @param in
|
||||||
* Input stream
|
* Input stream
|
||||||
* @param stopWords
|
* @param stopWords
|
||||||
* A Set of Strings or char[] or any other toString()-able set
|
* A {@link CharArraySet} representing the stopwords.
|
||||||
* representing the stopwords
|
|
||||||
* @see #makeStopSet(Version, java.lang.String...)
|
* @see #makeStopSet(Version, java.lang.String...)
|
||||||
*/
|
*/
|
||||||
public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) {
|
public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
|
||||||
this(matchVersion, in, stopWords, false);
|
super(true, in);
|
||||||
|
this.stopWords = stopWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -101,7 +72,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
||||||
* @param stopWords An array of stopwords
|
* @param stopWords An array of stopwords
|
||||||
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
|
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
|
||||||
*/
|
*/
|
||||||
public static Set<Object> makeStopSet(Version matchVersion, String... stopWords) {
|
public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) {
|
||||||
return makeStopSet(matchVersion, stopWords, false);
|
return makeStopSet(matchVersion, stopWords, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -116,7 +87,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
||||||
* @return A Set ({@link CharArraySet}) containing the words
|
* @return A Set ({@link CharArraySet}) containing the words
|
||||||
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
|
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
|
||||||
*/
|
*/
|
||||||
public static Set<Object> makeStopSet(Version matchVersion, List<?> stopWords) {
|
public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords) {
|
||||||
return makeStopSet(matchVersion, stopWords, false);
|
return makeStopSet(matchVersion, stopWords, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,7 +99,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
||||||
* @param ignoreCase If true, all words are lower cased first.
|
* @param ignoreCase If true, all words are lower cased first.
|
||||||
* @return a Set containing the words
|
* @return a Set containing the words
|
||||||
*/
|
*/
|
||||||
public static Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
|
public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
|
||||||
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
|
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
|
||||||
stopSet.addAll(Arrays.asList(stopWords));
|
stopSet.addAll(Arrays.asList(stopWords));
|
||||||
return stopSet;
|
return stopSet;
|
||||||
|
@ -141,7 +112,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
||||||
* @param ignoreCase if true, all words are lower cased first
|
* @param ignoreCase if true, all words are lower cased first
|
||||||
* @return A Set ({@link CharArraySet}) containing the words
|
* @return A Set ({@link CharArraySet}) containing the words
|
||||||
*/
|
*/
|
||||||
public static Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
|
public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
|
||||||
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
|
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
|
||||||
stopSet.addAll(stopWords);
|
stopSet.addAll(stopWords);
|
||||||
return stopSet;
|
return stopSet;
|
||||||
|
|
|
@ -32,7 +32,6 @@ import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@link Analyzer} for Czech language.
|
* {@link Analyzer} for Czech language.
|
||||||
|
@ -62,12 +61,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
||||||
*
|
*
|
||||||
* @return a set of default Czech-stopwords
|
* @return a set of default Czech-stopwords
|
||||||
*/
|
*/
|
||||||
public static final Set<?> getDefaultStopSet(){
|
public static final CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_SET;
|
return DefaultSetHolder.DEFAULT_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
private static final Set<?> DEFAULT_SET;
|
private static final CharArraySet DEFAULT_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -82,7 +81,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private final Set<?> stemExclusionTable;
|
private final CharArraySet stemExclusionTable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
|
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
|
||||||
|
@ -101,7 +100,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
||||||
* {@link <a href="#version">above</a>}
|
* {@link <a href="#version">above</a>}
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,7 +113,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionTable a stemming exclusion set
|
* @param stemExclusionTable a stemming exclusion set
|
||||||
*/
|
*/
|
||||||
public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) {
|
public CzechAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
||||||
}
|
}
|
||||||
|
@ -129,7 +128,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
||||||
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
|
||||||
* , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
|
* , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
|
||||||
* a version is >= LUCENE_31 and a stem exclusion set is provided via
|
* a version is >= LUCENE_31 and a stem exclusion set is provided via
|
||||||
* {@link #CzechAnalyzer(Version, Set, Set)} a
|
* {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
|
||||||
* {@link KeywordMarkerFilter} is added before
|
* {@link KeywordMarkerFilter} is added before
|
||||||
* {@link CzechStemFilter}.
|
* {@link CzechStemFilter}.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.da;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.DanishStemmer;
|
||||||
* {@link Analyzer} for Danish.
|
* {@link Analyzer} for Danish.
|
||||||
*/
|
*/
|
||||||
public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Danish stopwords. */
|
/** File containing default Danish stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";
|
||||||
|
@ -50,7 +49,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,7 +58,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -86,7 +85,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public DanishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public DanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -99,7 +98,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -21,7 +21,6 @@ package org.apache.lucene.analysis.de;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -90,16 +89,16 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns a set of default German-stopwords
|
* Returns a set of default German-stopwords
|
||||||
* @return a set of default German-stopwords
|
* @return a set of default German-stopwords
|
||||||
*/
|
*/
|
||||||
public static final Set<?> getDefaultStopSet(){
|
public static final CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_SET;
|
return DefaultSetHolder.DEFAULT_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
/** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
|
/** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
|
||||||
@Deprecated
|
@Deprecated
|
||||||
private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
|
private static final CharArraySet DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||||
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
|
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
|
||||||
private static final Set<?> DEFAULT_SET;
|
private static final CharArraySet DEFAULT_SET;
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||||
|
@ -119,7 +118,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
/**
|
/**
|
||||||
* Contains words that should be indexed but not stemmed.
|
* Contains words that should be indexed but not stemmed.
|
||||||
*/
|
*/
|
||||||
private final Set<?> exclusionSet;
|
private final CharArraySet exclusionSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default stop words:
|
* Builds an analyzer with the default stop words:
|
||||||
|
@ -139,7 +138,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public GermanAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -153,7 +152,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stemExclusionSet
|
* @param stemExclusionSet
|
||||||
* a stemming exclusion set
|
* a stemming exclusion set
|
||||||
*/
|
*/
|
||||||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -27,6 +26,7 @@ import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -58,12 +58,12 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns a set of default Greek-stopwords
|
* Returns a set of default Greek-stopwords
|
||||||
* @return a set of default Greek-stopwords
|
* @return a set of default Greek-stopwords
|
||||||
*/
|
*/
|
||||||
public static final Set<?> getDefaultStopSet(){
|
public static final CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_SET;
|
return DefaultSetHolder.DEFAULT_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
private static final Set<?> DEFAULT_SET;
|
private static final CharArraySet DEFAULT_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -95,7 +95,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
|
||||||
* See <a href="#version">above</a>
|
* See <a href="#version">above</a>
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.en;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -37,13 +36,13 @@ import org.apache.lucene.util.Version;
|
||||||
* {@link Analyzer} for English.
|
* {@link Analyzer} for English.
|
||||||
*/
|
*/
|
||||||
public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,7 +51,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
|
static final CharArraySet DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -68,7 +67,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,7 +80,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.SpanishStemmer;
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Spanish stopwords. */
|
/** File containing default Spanish stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";
|
||||||
|
@ -57,7 +56,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,7 +65,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -93,7 +92,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,7 +105,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.eu;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.BasqueStemmer;
|
||||||
* {@link Analyzer} for Basque.
|
* {@link Analyzer} for Basque.
|
||||||
*/
|
*/
|
||||||
public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Basque stopwords. */
|
/** File containing default Basque stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
@ -48,7 +47,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -57,7 +56,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -84,7 +83,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public BasqueAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,7 +96,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public BasqueAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fa;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CharReader;
|
import org.apache.lucene.analysis.CharReader;
|
||||||
|
@ -30,6 +29,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -63,7 +63,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -101,7 +101,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
|
public PersianAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fi;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.FinnishStemmer;
|
||||||
* {@link Analyzer} for Finnish.
|
* {@link Analyzer} for Finnish.
|
||||||
*/
|
*/
|
||||||
public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Finnish stopwords. */
|
/** File containing default Finnish stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";
|
||||||
|
@ -50,7 +49,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,7 +58,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -86,7 +85,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -99,7 +98,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,11 +19,9 @@ package org.apache.lucene.analysis.fr;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
@ -56,7 +54,7 @@ public final class ElisionFilter extends TokenFilter {
|
||||||
* @param input the source {@link TokenStream}
|
* @param input the source {@link TokenStream}
|
||||||
* @param articles a set of stopword articles
|
* @param articles a set of stopword articles
|
||||||
*/
|
*/
|
||||||
public ElisionFilter(Version matchVersion, TokenStream input, Set<?> articles) {
|
public ElisionFilter(Version matchVersion, TokenStream input, CharArraySet articles) {
|
||||||
super(input);
|
super(input);
|
||||||
this.articles = CharArraySet.unmodifiableSet(
|
this.articles = CharArraySet.unmodifiableSet(
|
||||||
new CharArraySet(matchVersion, articles, true));
|
new CharArraySet(matchVersion, articles, true));
|
||||||
|
|
|
@ -36,7 +36,6 @@ import org.apache.lucene.util.Version;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@link Analyzer} for French language.
|
* {@link Analyzer} for French language.
|
||||||
|
@ -101,23 +100,23 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
/**
|
/**
|
||||||
* Contains words that should be indexed but not stemmed.
|
* Contains words that should be indexed but not stemmed.
|
||||||
*/
|
*/
|
||||||
private final Set<?> excltable;
|
private final CharArraySet excltable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
/** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */
|
/** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */
|
||||||
@Deprecated
|
@Deprecated
|
||||||
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
|
static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
|
||||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
|
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
|
||||||
false));
|
false));
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||||
|
@ -147,7 +146,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords){
|
public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,8 +160,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stemExclutionSet
|
* @param stemExclutionSet
|
||||||
* a stemming exclusion set
|
* a stemming exclusion set
|
||||||
*/
|
*/
|
||||||
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
|
public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords,
|
||||||
Set<?> stemExclutionSet) {
|
CharArraySet stemExclutionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
|
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
|
||||||
.copy(matchVersion, stemExclutionSet));
|
.copy(matchVersion, stemExclutionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.gl;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -39,7 +38,7 @@ import org.apache.lucene.util.Version;
|
||||||
* {@link Analyzer} for Galician.
|
* {@link Analyzer} for Galician.
|
||||||
*/
|
*/
|
||||||
public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Galician stopwords. */
|
/** File containing default Galician stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
@ -48,7 +47,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -57,7 +56,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -84,7 +83,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,7 +96,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public GalicianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
@ -44,7 +43,7 @@ import org.apache.lucene.util.Version;
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* File containing default Hindi stopwords.
|
* File containing default Hindi stopwords.
|
||||||
|
@ -59,7 +58,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -68,7 +67,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -88,7 +87,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a stemming exclusion set
|
* @param stemExclusionSet a stemming exclusion set
|
||||||
*/
|
*/
|
||||||
public HindiAnalyzer(Version version, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public HindiAnalyzer(Version version, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(version, stopwords);
|
super(version, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(
|
||||||
CharArraySet.copy(matchVersion, stemExclusionSet));
|
CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||||
|
@ -100,7 +99,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param version lucene compatibility version
|
* @param version lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public HindiAnalyzer(Version version, Set<?> stopwords) {
|
public HindiAnalyzer(Version version, CharArraySet stopwords) {
|
||||||
this(version, stopwords, CharArraySet.EMPTY_SET);
|
this(version, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hu;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.HungarianStemmer;
|
||||||
* {@link Analyzer} for Hungarian.
|
* {@link Analyzer} for Hungarian.
|
||||||
*/
|
*/
|
||||||
public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Hungarian stopwords. */
|
/** File containing default Hungarian stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";
|
||||||
|
@ -50,7 +49,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,7 +58,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -86,7 +85,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -99,7 +98,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hy;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.ArmenianStemmer;
|
||||||
* {@link Analyzer} for Armenian.
|
* {@link Analyzer} for Armenian.
|
||||||
*/
|
*/
|
||||||
public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Armenian stopwords. */
|
/** File containing default Armenian stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
@ -48,7 +47,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -57,7 +56,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -84,7 +83,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public ArmenianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,7 +96,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public ArmenianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.id;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
@ -43,7 +42,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,7 +51,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -65,7 +64,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
@ -82,7 +81,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords){
|
public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,7 +97,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stemExclusionSet
|
* @param stemExclusionSet
|
||||||
* a set of terms not to be stemmed
|
* a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public IndonesianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
|
public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.it;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -52,7 +51,7 @@ import org.tartarus.snowball.ext.ItalianStemmer;
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Italian stopwords. */
|
/** File containing default Italian stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
|
||||||
|
@ -68,7 +67,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,7 +76,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -104,7 +103,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -117,7 +116,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.lv;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -27,7 +26,6 @@ import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
@ -40,7 +38,7 @@ import org.apache.lucene.util.Version;
|
||||||
* {@link Analyzer} for Latvian.
|
* {@link Analyzer} for Latvian.
|
||||||
*/
|
*/
|
||||||
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Latvian stopwords. */
|
/** File containing default Latvian stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
@ -49,7 +47,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -58,7 +56,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -85,7 +83,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,7 +96,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public LatvianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -18,14 +18,12 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Marks terms as keywords via the {@link KeywordAttribute}. Each token
|
* Marks terms as keywords via the {@link KeywordAttribute}. Each token
|
||||||
|
@ -50,27 +48,11 @@ public final class KeywordMarkerFilter extends TokenFilter {
|
||||||
* @param keywordSet
|
* @param keywordSet
|
||||||
* the keywords set to lookup the current termbuffer
|
* the keywords set to lookup the current termbuffer
|
||||||
*/
|
*/
|
||||||
public KeywordMarkerFilter(final TokenStream in,
|
public KeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) {
|
||||||
final CharArraySet keywordSet) {
|
|
||||||
super(in);
|
super(in);
|
||||||
this.keywordSet = keywordSet;
|
this.keywordSet = keywordSet;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a new KeywordMarkerFilter, that marks the current token as a
|
|
||||||
* keyword if the tokens term buffer is contained in the given set via the
|
|
||||||
* {@link KeywordAttribute}.
|
|
||||||
*
|
|
||||||
* @param in
|
|
||||||
* TokenStream to filter
|
|
||||||
* @param keywordSet
|
|
||||||
* the keywords set to lookup the current termbuffer
|
|
||||||
*/
|
|
||||||
public KeywordMarkerFilter(final TokenStream in, final Set<?> keywordSet) {
|
|
||||||
this(in, keywordSet instanceof CharArraySet ? (CharArraySet) keywordSet
|
|
||||||
: CharArraySet.copy(Version.LUCENE_31, keywordSet));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final boolean incrementToken() throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
if (input.incrementToken()) {
|
if (input.incrementToken()) {
|
||||||
|
|
|
@ -22,7 +22,6 @@ import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -139,7 +138,7 @@ public final class PatternAnalyzer extends Analyzer {
|
||||||
|
|
||||||
private final Pattern pattern;
|
private final Pattern pattern;
|
||||||
private final boolean toLowerCase;
|
private final boolean toLowerCase;
|
||||||
private final Set<?> stopWords;
|
private final CharArraySet stopWords;
|
||||||
|
|
||||||
private final Version matchVersion;
|
private final Version matchVersion;
|
||||||
|
|
||||||
|
@ -162,7 +161,7 @@ public final class PatternAnalyzer extends Analyzer {
|
||||||
* or <a href="http://www.unine.ch/info/clef/">other stop words
|
* or <a href="http://www.unine.ch/info/clef/">other stop words
|
||||||
* lists </a>.
|
* lists </a>.
|
||||||
*/
|
*/
|
||||||
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set<?> stopWords) {
|
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, CharArraySet stopWords) {
|
||||||
if (pattern == null)
|
if (pattern == null)
|
||||||
throw new IllegalArgumentException("pattern must not be null");
|
throw new IllegalArgumentException("pattern must not be null");
|
||||||
|
|
||||||
|
@ -404,12 +403,12 @@ public final class PatternAnalyzer extends Analyzer {
|
||||||
private int pos;
|
private int pos;
|
||||||
private final boolean isLetter;
|
private final boolean isLetter;
|
||||||
private final boolean toLowerCase;
|
private final boolean toLowerCase;
|
||||||
private final Set<?> stopWords;
|
private final CharArraySet stopWords;
|
||||||
private static final Locale locale = Locale.getDefault();
|
private static final Locale locale = Locale.getDefault();
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
|
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, CharArraySet stopWords) {
|
||||||
super(input);
|
super(input);
|
||||||
this.str = str;
|
this.str = str;
|
||||||
this.isLetter = isLetter;
|
this.isLetter = isLetter;
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -46,10 +45,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public StemmerOverrideFilter(Version matchVersion, TokenStream input,
|
public StemmerOverrideFilter(Version matchVersion, TokenStream input,
|
||||||
Map<?,String> dictionary) {
|
CharArrayMap<String> dictionary) {
|
||||||
super(input);
|
super(input);
|
||||||
this.dictionary = dictionary instanceof CharArrayMap ?
|
this.dictionary = CharArrayMap.copy(matchVersion, dictionary);
|
||||||
(CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -28,18 +28,14 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||||
|
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@link Analyzer} for Dutch language.
|
* {@link Analyzer} for Dutch language.
|
||||||
|
@ -56,6 +52,9 @@ import java.util.Map;
|
||||||
* <p>You must specify the required {@link Version}
|
* <p>You must specify the required {@link Version}
|
||||||
* compatibility when creating DutchAnalyzer:
|
* compatibility when creating DutchAnalyzer:
|
||||||
* <ul>
|
* <ul>
|
||||||
|
* <li> As of 3.6, {@link #DutchAnalyzer(Version, CharArraySet)} and
|
||||||
|
* {@link #DutchAnalyzer(Version, CharArraySet, CharArraySet)} also populate
|
||||||
|
* the default entries for the stem override dictionary
|
||||||
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
|
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
|
||||||
* LowerCaseFilter is used prior to StopFilter, and Snowball
|
* LowerCaseFilter is used prior to StopFilter, and Snowball
|
||||||
* stopwords are used by default.
|
* stopwords are used by default.
|
||||||
|
@ -75,13 +74,13 @@ public final class DutchAnalyzer extends Analyzer {
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
static final CharArrayMap<String> DEFAULT_STEM_DICT;
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
|
||||||
|
@ -91,6 +90,12 @@ public final class DutchAnalyzer extends Analyzer {
|
||||||
// distribution (JAR)
|
// distribution (JAR)
|
||||||
throw new RuntimeException("Unable to load default stopword set");
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DEFAULT_STEM_DICT = new CharArrayMap<String>(Version.LUCENE_CURRENT, 4, false);
|
||||||
|
DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet
|
||||||
|
DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||||
|
DEFAULT_STEM_DICT.put("ei", "eier");
|
||||||
|
DEFAULT_STEM_DICT.put("kind", "kinder");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,14 +103,14 @@ public final class DutchAnalyzer extends Analyzer {
|
||||||
/**
|
/**
|
||||||
* Contains the stopwords used with the StopFilter.
|
* Contains the stopwords used with the StopFilter.
|
||||||
*/
|
*/
|
||||||
private final Set<?> stoptable;
|
private final CharArraySet stoptable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Contains words that should be indexed but not stemmed.
|
* Contains words that should be indexed but not stemmed.
|
||||||
*/
|
*/
|
||||||
private Set<?> excltable = Collections.emptySet();
|
private CharArraySet excltable = CharArraySet.EMPTY_SET;
|
||||||
|
|
||||||
private final Map<String, String> stemdict = new HashMap<String, String>();
|
private final CharArrayMap<String> stemdict;
|
||||||
private final Version matchVersion;
|
private final Version matchVersion;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -114,21 +119,33 @@ public final class DutchAnalyzer extends Analyzer {
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public DutchAnalyzer(Version matchVersion) {
|
public DutchAnalyzer(Version matchVersion) {
|
||||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
// historically, only this ctor populated the stem dict!!!!!
|
||||||
stemdict.put("fiets", "fiets"); //otherwise fiet
|
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
|
||||||
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
|
||||||
stemdict.put("ei", "eier");
|
|
||||||
stemdict.put("kind", "kinder");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public DutchAnalyzer(Version matchVersion, Set<?> stopwords){
|
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
// historically, this ctor never populated the stem dict!!!!!
|
||||||
|
// so we populate it only for >= 3.6
|
||||||
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
|
||||||
|
matchVersion.onOrAfter(Version.LUCENE_36)
|
||||||
|
? DefaultSetHolder.DEFAULT_STEM_DICT
|
||||||
|
: CharArrayMap.<String>emptyMap());
|
||||||
}
|
}
|
||||||
|
|
||||||
public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
|
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
|
||||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
// historically, this ctor never populated the stem dict!!!!!
|
||||||
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
// so we populate it only for >= 3.6
|
||||||
|
this(matchVersion, stopwords, stemExclusionTable,
|
||||||
|
matchVersion.onOrAfter(Version.LUCENE_36)
|
||||||
|
? DefaultSetHolder.DEFAULT_STEM_DICT
|
||||||
|
: CharArrayMap.<String>emptyMap());
|
||||||
|
}
|
||||||
|
|
||||||
|
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
|
||||||
this.matchVersion = matchVersion;
|
this.matchVersion = matchVersion;
|
||||||
|
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||||
|
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
||||||
|
this.stemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.no;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.NorwegianStemmer;
|
||||||
* {@link Analyzer} for Norwegian.
|
* {@link Analyzer} for Norwegian.
|
||||||
*/
|
*/
|
||||||
public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Norwegian stopwords. */
|
/** File containing default Norwegian stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
|
||||||
|
@ -50,7 +49,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,7 +58,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -86,7 +85,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -99,7 +98,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.PortugueseStemmer;
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Portuguese stopwords. */
|
/** File containing default Portuguese stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
|
||||||
|
@ -57,7 +56,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,7 +65,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -93,7 +92,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,7 +105,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.util.*;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.AnalyzerWrapper;
|
import org.apache.lucene.analysis.AnalyzerWrapper;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.MultiFields;
|
import org.apache.lucene.index.MultiFields;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
@ -179,7 +180,8 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
|
||||||
if (stopWords == null) {
|
if (stopWords == null) {
|
||||||
return components;
|
return components;
|
||||||
}
|
}
|
||||||
StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(), stopWords);
|
StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(),
|
||||||
|
new CharArraySet(matchVersion, stopWords, false));
|
||||||
return new TokenStreamComponents(components.getTokenizer(), stopFilter);
|
return new TokenStreamComponents(components.getTokenizer(), stopFilter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ro;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.RomanianStemmer;
|
||||||
* {@link Analyzer} for Romanian.
|
* {@link Analyzer} for Romanian.
|
||||||
*/
|
*/
|
||||||
public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Romanian stopwords. */
|
/** File containing default Romanian stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
@ -53,7 +52,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,7 +61,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -89,7 +88,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,7 +101,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ru;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
@ -78,10 +77,10 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
/** @deprecated (3.1) remove this for Lucene 5.0 */
|
/** @deprecated (3.1) remove this for Lucene 5.0 */
|
||||||
@Deprecated
|
@Deprecated
|
||||||
static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
|
static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
|
||||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
||||||
Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
|
Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -95,14 +94,14 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an unmodifiable instance of the default stop-words set.
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
*
|
*
|
||||||
* @return an unmodifiable instance of the default stop-words set.
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet() {
|
public static CharArraySet getDefaultStopSet() {
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -120,7 +119,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* a stopword set
|
* a stopword set
|
||||||
*/
|
*/
|
||||||
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
|
public RussianAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -133,7 +132,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||||
* a stopword set
|
* a stopword set
|
||||||
* @param stemExclusionSet a set of words not to be stemmed
|
* @param stemExclusionSet a set of words not to be stemmed
|
||||||
*/
|
*/
|
||||||
public RussianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet){
|
public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
||||||
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
|
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
|
||||||
|
@ -48,7 +47,7 @@ import java.util.Set;
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public final class SnowballAnalyzer extends Analyzer {
|
public final class SnowballAnalyzer extends Analyzer {
|
||||||
private String name;
|
private String name;
|
||||||
private Set<?> stopSet;
|
private CharArraySet stopSet;
|
||||||
private final Version matchVersion;
|
private final Version matchVersion;
|
||||||
|
|
||||||
/** Builds the named analyzer with no stop words. */
|
/** Builds the named analyzer with no stop words. */
|
||||||
|
@ -58,7 +57,7 @@ public final class SnowballAnalyzer extends Analyzer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Builds the named analyzer with the given stop words. */
|
/** Builds the named analyzer with the given stop words. */
|
||||||
public SnowballAnalyzer(Version matchVersion, String name, Set<?> stopWords) {
|
public SnowballAnalyzer(Version matchVersion, String name, CharArraySet stopWords) {
|
||||||
this(matchVersion, name);
|
this(matchVersion, name);
|
||||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
||||||
stopWords));
|
stopWords));
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
@ -28,7 +29,6 @@ import org.apache.lucene.util.Version;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
|
* Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
|
||||||
|
@ -60,13 +60,13 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
/** An unmodifiable set containing some common English words that are usually not
|
/** An unmodifiable set containing some common English words that are usually not
|
||||||
useful for searching. */
|
useful for searching. */
|
||||||
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||||
|
|
||||||
/** Builds an analyzer with the given stop words.
|
/** Builds an analyzer with the given stop words.
|
||||||
* @param matchVersion Lucene version to match See {@link
|
* @param matchVersion Lucene version to match See {@link
|
||||||
* <a href="#version">above</a>}
|
* <a href="#version">above</a>}
|
||||||
* @param stopWords stop words */
|
* @param stopWords stop words */
|
||||||
public ClassicAnalyzer(Version matchVersion, Set<?> stopWords) {
|
public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||||
super(matchVersion, stopWords);
|
super(matchVersion, stopWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,15 +79,6 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
|
||||||
this(matchVersion, STOP_WORDS_SET);
|
this(matchVersion, STOP_WORDS_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Builds an analyzer with the stop words from the given file.
|
|
||||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
|
||||||
* @param matchVersion Lucene version to match See {@link
|
|
||||||
* <a href="#version">above</a>}
|
|
||||||
* @param stopwords File to read stop words from */
|
|
||||||
public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
|
||||||
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Builds an analyzer with the stop words from the given reader.
|
/** Builds an analyzer with the stop words from the given reader.
|
||||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
* @see WordlistLoader#getWordSet(Reader, Version)
|
||||||
* @param matchVersion Lucene version to match See {@link
|
* @param matchVersion Lucene version to match See {@link
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
@ -28,7 +29,6 @@ import org.apache.lucene.util.Version;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
* Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
||||||
|
@ -61,13 +61,13 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
/** An unmodifiable set containing some common English words that are usually not
|
/** An unmodifiable set containing some common English words that are usually not
|
||||||
useful for searching. */
|
useful for searching. */
|
||||||
public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||||
|
|
||||||
/** Builds an analyzer with the given stop words.
|
/** Builds an analyzer with the given stop words.
|
||||||
* @param matchVersion Lucene version to match See {@link
|
* @param matchVersion Lucene version to match See {@link
|
||||||
* <a href="#version">above</a>}
|
* <a href="#version">above</a>}
|
||||||
* @param stopWords stop words */
|
* @param stopWords stop words */
|
||||||
public StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
|
public StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||||
super(matchVersion, stopWords);
|
super(matchVersion, stopWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -80,15 +80,6 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
|
||||||
this(matchVersion, STOP_WORDS_SET);
|
this(matchVersion, STOP_WORDS_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Builds an analyzer with the stop words from the given file.
|
|
||||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
|
||||||
* @param matchVersion Lucene version to match See {@link
|
|
||||||
* <a href="#version">above</a>}
|
|
||||||
* @param stopwords File to read stop words from */
|
|
||||||
public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
|
||||||
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Builds an analyzer with the stop words from the given reader.
|
/** Builds an analyzer with the stop words from the given reader.
|
||||||
* @see WordlistLoader#getWordSet(Reader, Version)
|
* @see WordlistLoader#getWordSet(Reader, Version)
|
||||||
* @param matchVersion Lucene version to match See {@link
|
* @param matchVersion Lucene version to match See {@link
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.SwedishStemmer;
|
||||||
* {@link Analyzer} for Swedish.
|
* {@link Analyzer} for Swedish.
|
||||||
*/
|
*/
|
||||||
public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Swedish stopwords. */
|
/** File containing default Swedish stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
|
||||||
|
@ -50,7 +49,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,7 +58,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -86,7 +85,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -99,7 +98,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.th;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -64,7 +64,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -93,7 +93,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public ThaiAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.tr;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
@ -38,7 +37,7 @@ import org.tartarus.snowball.ext.TurkishStemmer;
|
||||||
* {@link Analyzer} for Turkish.
|
* {@link Analyzer} for Turkish.
|
||||||
*/
|
*/
|
||||||
public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
/** File containing default Turkish stopwords. */
|
/** File containing default Turkish stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
@ -52,7 +51,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -61,7 +60,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
|
@ -88,7 +87,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -101,7 +100,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
matchVersion, stemExclusionSet));
|
matchVersion, stemExclusionSet));
|
||||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.util;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
@ -46,7 +45,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
|
||||||
* @return the analyzer's stopword set or an empty set if the analyzer has no
|
* @return the analyzer's stopword set or an empty set if the analyzer has no
|
||||||
* stopwords
|
* stopwords
|
||||||
*/
|
*/
|
||||||
public Set<?> getStopwordSet() {
|
public CharArraySet getStopwordSet() {
|
||||||
return stopwords;
|
return stopwords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -58,7 +57,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
|
||||||
* @param stopwords
|
* @param stopwords
|
||||||
* the analyzer's stopword set
|
* the analyzer's stopword set
|
||||||
*/
|
*/
|
||||||
protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
|
protected StopwordAnalyzerBase(final Version version, final CharArraySet stopwords) {
|
||||||
matchVersion = version;
|
matchVersion = version;
|
||||||
// analyzers should use char array set for stopwords!
|
// analyzers should use char array set for stopwords!
|
||||||
this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
|
this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
|
||||||
|
|
|
@ -18,9 +18,6 @@ package org.apache.lucene.analysis.ar;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
@ -79,16 +76,14 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
|
||||||
* Test that custom stopwords work, and are not case-sensitive.
|
* Test that custom stopwords work, and are not case-sensitive.
|
||||||
*/
|
*/
|
||||||
public void testCustomStopwords() throws Exception {
|
public void testCustomStopwords() throws Exception {
|
||||||
Set<String> set = new HashSet<String>();
|
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false);
|
||||||
Collections.addAll(set, "the", "and", "a");
|
|
||||||
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set);
|
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set);
|
||||||
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
|
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
|
||||||
"brown", "fox" });
|
"brown", "fox" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testWithStemExclusionSet() throws IOException {
|
public void testWithStemExclusionSet() throws IOException {
|
||||||
Set<String> set = new HashSet<String>();
|
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("ساهدهات"), false);
|
||||||
set.add("ساهدهات");
|
|
||||||
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
|
ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
|
||||||
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
|
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
|
||||||
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
|
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.bg;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collections;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
@ -43,8 +42,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCustomStopwords() throws IOException {
|
public void testCustomStopwords() throws IOException {
|
||||||
Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, Collections
|
Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
|
||||||
.emptySet());
|
|
||||||
assertAnalyzesTo(a, "Как се казваш?",
|
assertAnalyzesTo(a, "Как се казваш?",
|
||||||
new String[] {"как", "се", "казваш"});
|
new String[] {"как", "се", "казваш"});
|
||||||
}
|
}
|
||||||
|
|
|
@ -136,7 +136,8 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testStemExclusionTable() throws Exception {
|
public void testStemExclusionTable() throws Exception {
|
||||||
BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("quintessência"));
|
BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
|
CharArraySet.EMPTY_SET, new CharArraySet(TEST_VERSION_CURRENT, asSet("quintessência"), false));
|
||||||
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
|
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ca;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
|
public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -50,8 +49,7 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("llengües"), false);
|
||||||
exclusionSet.add("llengües");
|
|
||||||
Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT,
|
||||||
CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
|
CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "llengües", "llengües");
|
checkOneTermReuse(a, "llengües", "llengües");
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
@ -58,10 +59,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testStopList() throws IOException {
|
public void testStopList() throws IOException {
|
||||||
Set<Object> stopWordsSet = new HashSet<Object>();
|
CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
|
||||||
stopWordsSet.add("good");
|
|
||||||
stopWordsSet.add("test");
|
|
||||||
stopWordsSet.add("analyzer");
|
|
||||||
StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
|
StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
|
||||||
StringReader reader = new StringReader("This is a good test of the english stop analyzer");
|
StringReader reader = new StringReader("This is a good test of the english stop analyzer");
|
||||||
TokenStream stream = newStop.tokenStream("test", reader);
|
TokenStream stream = newStop.tokenStream("test", reader);
|
||||||
|
@ -75,10 +73,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testStopListPositions() throws IOException {
|
public void testStopListPositions() throws IOException {
|
||||||
Set<Object> stopWordsSet = new HashSet<Object>();
|
CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
|
||||||
stopWordsSet.add("good");
|
|
||||||
stopWordsSet.add("test");
|
|
||||||
stopWordsSet.add("analyzer");
|
|
||||||
StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
|
StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
|
||||||
StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions");
|
StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions");
|
||||||
int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1};
|
int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1};
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.English;
|
import org.apache.lucene.util.English;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -36,22 +37,15 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testExactCase() throws IOException {
|
public void testExactCase() throws IOException {
|
||||||
StringReader reader = new StringReader("Now is The Time");
|
StringReader reader = new StringReader("Now is The Time");
|
||||||
Set<String> stopWords = asSet("is", "the", "Time");
|
CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, asSet("is", "the", "Time"), false);
|
||||||
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, false);
|
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords);
|
||||||
assertTokenStreamContents(stream, new String[] { "Now", "The" });
|
assertTokenStreamContents(stream, new String[] { "Now", "The" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testIgnoreCase() throws IOException {
|
|
||||||
StringReader reader = new StringReader("Now is The Time");
|
|
||||||
Set<String> stopWords = asSet( "is", "the", "Time" );
|
|
||||||
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, true);
|
|
||||||
assertTokenStreamContents(stream, new String[] { "Now" });
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testStopFilt() throws IOException {
|
public void testStopFilt() throws IOException {
|
||||||
StringReader reader = new StringReader("Now is The Time");
|
StringReader reader = new StringReader("Now is The Time");
|
||||||
String[] stopWords = new String[] { "is", "the", "Time" };
|
String[] stopWords = new String[] { "is", "the", "Time" };
|
||||||
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
||||||
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
||||||
assertTokenStreamContents(stream, new String[] { "Now", "The" });
|
assertTokenStreamContents(stream, new String[] { "Now", "The" });
|
||||||
}
|
}
|
||||||
|
@ -70,7 +64,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
||||||
log(sb.toString());
|
log(sb.toString());
|
||||||
String stopWords[] = a.toArray(new String[0]);
|
String stopWords[] = a.toArray(new String[0]);
|
||||||
for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
|
for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
|
||||||
Set<Object> stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
|
||||||
// with increments
|
// with increments
|
||||||
StringReader reader = new StringReader(sb.toString());
|
StringReader reader = new StringReader(sb.toString());
|
||||||
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
||||||
|
@ -93,8 +87,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
||||||
for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
|
for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
|
||||||
String stopWords1[] = a1.toArray(new String[0]);
|
String stopWords1[] = a1.toArray(new String[0]);
|
||||||
for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
|
for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
|
||||||
Set<Object> stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
|
CharArraySet stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
|
||||||
Set<Object> stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
|
CharArraySet stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
|
||||||
reader = new StringReader(sb.toString());
|
reader = new StringReader(sb.toString());
|
||||||
StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
|
StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
|
||||||
stpf0.setEnablePositionIncrements(true);
|
stpf0.setEnablePositionIncrements(true);
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.da;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
|
public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("undersøgelse"), false);
|
||||||
exclusionSet.add("undersøgelse");
|
|
||||||
Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT,
|
||||||
DanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
DanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "undersøgelse", "undersøgelse");
|
checkOneTermReuse(a, "undersøgelse", "undersøgelse");
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Collections;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
@ -46,7 +45,8 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testStemExclusionTable() throws Exception {
|
public void testStemExclusionTable() throws Exception {
|
||||||
GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("tischen"));
|
GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
|
||||||
|
new CharArraySet(TEST_VERSION_CURRENT, asSet("tischen"), false));
|
||||||
checkOneTermReuse(a, "tischen", "tischen");
|
checkOneTermReuse(a, "tischen", "tischen");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.en;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
|
public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -45,8 +44,7 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("books"), false);
|
||||||
exclusionSet.add("books");
|
|
||||||
Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT,
|
||||||
EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
|
EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "books", "books");
|
checkOneTermReuse(a, "books", "books");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.es;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
|
public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chicano"), false);
|
||||||
exclusionSet.add("chicano");
|
|
||||||
Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT,
|
||||||
SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "chicana", "chican");
|
checkOneTermReuse(a, "chicana", "chican");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.eu;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
|
public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("zaldiak"), false);
|
||||||
exclusionSet.add("zaldiak");
|
|
||||||
Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT,
|
||||||
BasqueAnalyzer.getDefaultStopSet(), exclusionSet);
|
BasqueAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "zaldiak", "zaldiak");
|
checkOneTermReuse(a, "zaldiak", "zaldiak");
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.fa;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the Persian Analyzer
|
* Test the Persian Analyzer
|
||||||
|
@ -215,7 +216,8 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
* Test that custom stopwords work, and are not case-sensitive.
|
* Test that custom stopwords work, and are not case-sensitive.
|
||||||
*/
|
*/
|
||||||
public void testCustomStopwords() throws Exception {
|
public void testCustomStopwords() throws Exception {
|
||||||
PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, asSet("the", "and", "a"));
|
PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
|
new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false));
|
||||||
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
|
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
|
||||||
"brown", "fox" });
|
"brown", "fox" });
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.fi;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
|
public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
|
||||||
exclusionSet.add("edeltäjistään");
|
|
||||||
Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT,
|
||||||
FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
|
FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
|
checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
|
||||||
|
|
|
@ -20,15 +20,14 @@ package org.apache.lucene.analysis.fr;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
@ -38,9 +37,7 @@ public class TestElision extends BaseTokenStreamTestCase {
|
||||||
public void testElision() throws Exception {
|
public void testElision() throws Exception {
|
||||||
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
|
String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
|
||||||
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
|
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
|
||||||
Set<String> articles = new HashSet<String>();
|
CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
|
||||||
articles.add("l");
|
|
||||||
articles.add("M");
|
|
||||||
TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles);
|
TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles);
|
||||||
List<String> tas = filter(filter);
|
List<String> tas = filter(filter);
|
||||||
assertEquals("embrouille", tas.get(4));
|
assertEquals("embrouille", tas.get(4));
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.gl;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("correspondente"), false);
|
||||||
exclusionSet.add("correspondente");
|
|
||||||
Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
|
GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "correspondente", "correspondente");
|
checkOneTermReuse(a, "correspondente", "correspondente");
|
||||||
|
|
|
@ -1,10 +1,8 @@
|
||||||
package org.apache.lucene.analysis.hi;
|
package org.apache.lucene.analysis.hi;
|
||||||
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
@ -41,8 +39,7 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testExclusionSet() throws Exception {
|
public void testExclusionSet() throws Exception {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("हिंदी"), false);
|
||||||
exclusionSet.add("हिंदी");
|
|
||||||
Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT,
|
||||||
HindiAnalyzer.getDefaultStopSet(), exclusionSet);
|
HindiAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "हिंदी", "हिंदी");
|
checkOneTermReuse(a, "हिंदी", "हिंदी");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hu;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
|
||||||
exclusionSet.add("babakocsi");
|
|
||||||
Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
|
HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "babakocsi", "babakocsi");
|
checkOneTermReuse(a, "babakocsi", "babakocsi");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hy;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("արծիվներ"), false);
|
||||||
exclusionSet.add("արծիվներ");
|
|
||||||
Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
ArmenianAnalyzer.getDefaultStopSet(), exclusionSet);
|
ArmenianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "արծիվներ", "արծիվներ");
|
checkOneTermReuse(a, "արծիվներ", "արծիվներ");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.id;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("peledakan"), false);
|
||||||
exclusionSet.add("peledakan");
|
|
||||||
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
|
IndonesianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "peledakan", "peledakan");
|
checkOneTermReuse(a, "peledakan", "peledakan");
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
@ -44,8 +45,7 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("abbandonata"), false);
|
||||||
exclusionSet.add("abbandonata");
|
|
||||||
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
|
ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "abbandonata", "abbandonata");
|
checkOneTermReuse(a, "abbandonata", "abbandonata");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.lv;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("tirgiem"), false);
|
||||||
exclusionSet.add("tirgiem");
|
|
||||||
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
|
LatvianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "tirgiem", "tirgiem");
|
checkOneTermReuse(a, "tirgiem", "tirgiem");
|
||||||
|
|
|
@ -2,10 +2,7 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
@ -47,12 +44,11 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
|
||||||
assertTokenStreamContents(new LowerCaseFilterMock(
|
assertTokenStreamContents(new LowerCaseFilterMock(
|
||||||
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
||||||
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
|
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
|
||||||
Set<String> jdkSet = new HashSet<String>();
|
CharArraySet mixedCaseSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("LuceneFox"), false);
|
||||||
jdkSet.add("LuceneFox");
|
|
||||||
assertTokenStreamContents(new LowerCaseFilterMock(
|
assertTokenStreamContents(new LowerCaseFilterMock(
|
||||||
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
||||||
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), jdkSet)), output);
|
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), mixedCaseSet)), output);
|
||||||
Set<?> set2 = set;
|
CharArraySet set2 = set;
|
||||||
assertTokenStreamContents(new LowerCaseFilterMock(
|
assertTokenStreamContents(new LowerCaseFilterMock(
|
||||||
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
|
||||||
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
|
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
|
||||||
|
@ -64,8 +60,8 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
|
||||||
new KeywordMarkerFilter(
|
new KeywordMarkerFilter(
|
||||||
new KeywordMarkerFilter(
|
new KeywordMarkerFilter(
|
||||||
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
|
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
|
||||||
new HashSet<String>(Arrays.asList("Birds", "Houses"))),
|
new CharArraySet(TEST_VERSION_CURRENT, asSet("Birds", "Houses"), false)),
|
||||||
new HashSet<String>(Arrays.asList("Dogs", "Trees"))));
|
new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false)));
|
||||||
|
|
||||||
assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
|
assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,12 +2,11 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.en.PorterStemFilter;
|
import org.apache.lucene.analysis.en.PorterStemFilter;
|
||||||
|
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
@ -33,7 +32,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
|
||||||
// lets make booked stem to books
|
// lets make booked stem to books
|
||||||
// the override filter will convert "booked" to "books",
|
// the override filter will convert "booked" to "books",
|
||||||
// but also mark it with KeywordAttribute so Porter will not change it.
|
// but also mark it with KeywordAttribute so Porter will not change it.
|
||||||
Map<String,String> dictionary = new HashMap<String,String>();
|
CharArrayMap<String> dictionary = new CharArrayMap<String>(TEST_VERSION_CURRENT, 1, false);
|
||||||
dictionary.put("booked", "books");
|
dictionary.put("booked", "books");
|
||||||
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
|
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
|
||||||
TokenStream stream = new PorterStemFilter(
|
TokenStream stream = new PorterStemFilter(
|
||||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.nl;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
@ -150,6 +149,26 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* check that the default stem overrides are used
|
||||||
|
* even if you use a non-default ctor.
|
||||||
|
*/
|
||||||
|
public void testStemOverrides() throws IOException {
|
||||||
|
DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
|
||||||
|
checkOneTerm(a, "fiets", "fiets");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* prior to 3.6, this confusingly did not happen if
|
||||||
|
* you specified your own stoplist!!!!
|
||||||
|
* @deprecated (3.6) Remove this test in Lucene 5.0
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public void testBuggyStemOverrides() throws IOException {
|
||||||
|
DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_35, CharArraySet.EMPTY_SET);
|
||||||
|
checkOneTerm(a, "fiets", "fiet");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prior to 3.1, this analyzer had no lowercase filter.
|
* Prior to 3.1, this analyzer had no lowercase filter.
|
||||||
* stopwords were case sensitive. Preserve this for back compat.
|
* stopwords were case sensitive. Preserve this for back compat.
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.no;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("havnedistriktene"), false);
|
||||||
exclusionSet.add("havnedistriktene");
|
|
||||||
Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
|
NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
|
checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pt;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
|
public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
|
||||||
exclusionSet.add("quilométricas");
|
|
||||||
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
|
||||||
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
|
PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "quilométricas", "quilométricas");
|
checkOneTermReuse(a, "quilométricas", "quilométricas");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ro;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
|
public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("absenţa"), false);
|
||||||
exclusionSet.add("absenţa");
|
|
||||||
Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT,
|
||||||
RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
|
RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "absenţa", "absenţa");
|
checkOneTermReuse(a, "absenţa", "absenţa");
|
||||||
|
|
|
@ -18,12 +18,10 @@ package org.apache.lucene.analysis.sv;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
|
public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -44,8 +42,7 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlarne"), false);
|
||||||
exclusionSet.add("jaktkarlarne");
|
|
||||||
Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT,
|
||||||
SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
|
SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
|
checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.tr;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
|
public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("ağacı"), false);
|
||||||
exclusionSet.add("ağacı");
|
|
||||||
Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT,
|
||||||
TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
|
TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "ağacı", "ağacı");
|
checkOneTermReuse(a, "ağacı", "ağacı");
|
||||||
|
|
|
@ -39,13 +39,13 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
||||||
this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
|
this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
|
||||||
}
|
}
|
||||||
|
|
||||||
public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) {
|
public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.segmenter = segmenter;
|
this.segmenter = segmenter;
|
||||||
this.stoptags = stoptags;
|
this.stoptags = stoptags;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
|
||||||
* outer class accesses the static final set the first time.
|
* outer class accesses the static final set the first time.
|
||||||
*/
|
*/
|
||||||
private static class DefaultSetHolder {
|
private static class DefaultSetHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
static final Set<String> DEFAULT_STOP_TAGS;
|
static final Set<String> DEFAULT_STOP_TAGS;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
|
|
|
@ -18,10 +18,7 @@
|
||||||
package org.apache.lucene.analysis.cn.smart;
|
package org.apache.lucene.analysis.cn.smart;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
@ -58,7 +55,7 @@ import org.apache.lucene.util.Version;
|
||||||
*/
|
*/
|
||||||
public final class SmartChineseAnalyzer extends Analyzer {
|
public final class SmartChineseAnalyzer extends Analyzer {
|
||||||
|
|
||||||
private final Set<?> stopWords;
|
private final CharArraySet stopWords;
|
||||||
|
|
||||||
private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
@ -120,7 +117,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
||||||
*/
|
*/
|
||||||
public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) {
|
public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) {
|
||||||
stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
|
stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET
|
||||||
: Collections.EMPTY_SET;
|
: CharArraySet.EMPTY_SET;
|
||||||
this.matchVersion = matchVersion;
|
this.matchVersion = matchVersion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -133,8 +130,8 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
||||||
* </p>
|
* </p>
|
||||||
* @param stopWords {@link Set} of stopwords to use.
|
* @param stopWords {@link Set} of stopwords to use.
|
||||||
*/
|
*/
|
||||||
public SmartChineseAnalyzer(Version matchVersion, Set stopWords) {
|
public SmartChineseAnalyzer(Version matchVersion, CharArraySet stopWords) {
|
||||||
this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords;
|
this.stopWords = stopWords==null?CharArraySet.EMPTY_SET:stopWords;
|
||||||
this.matchVersion = matchVersion;
|
this.matchVersion = matchVersion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -147,7 +144,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
||||||
// The porter stemming is too strict, this is not a bug, this is a feature:)
|
// The porter stemming is too strict, this is not a bug, this is a feature:)
|
||||||
result = new PorterStemFilter(result);
|
result = new PorterStemFilter(result);
|
||||||
if (!stopWords.isEmpty()) {
|
if (!stopWords.isEmpty()) {
|
||||||
result = new StopFilter(matchVersion, result, stopWords, false);
|
result = new StopFilter(matchVersion, result, stopWords);
|
||||||
}
|
}
|
||||||
return new TokenStreamComponents(tokenizer, result);
|
return new TokenStreamComponents(tokenizer, result);
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pl;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
@ -42,7 +41,7 @@ import org.egothor.stemmer.Trie;
|
||||||
* {@link Analyzer} for Polish.
|
* {@link Analyzer} for Polish.
|
||||||
*/
|
*/
|
||||||
public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||||
private final Set<?> stemExclusionSet;
|
private final CharArraySet stemExclusionSet;
|
||||||
private final Trie stemTable;
|
private final Trie stemTable;
|
||||||
|
|
||||||
/** File containing default Polish stopwords. */
|
/** File containing default Polish stopwords. */
|
||||||
|
@ -55,7 +54,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
*/
|
*/
|
||||||
public static Set<?> getDefaultStopSet(){
|
public static CharArraySet getDefaultStopSet(){
|
||||||
return DefaultsHolder.DEFAULT_STOP_SET;
|
return DefaultsHolder.DEFAULT_STOP_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -64,7 +63,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* accesses the static final set the first time.;
|
* accesses the static final set the first time.;
|
||||||
*/
|
*/
|
||||||
private static class DefaultsHolder {
|
private static class DefaultsHolder {
|
||||||
static final Set<?> DEFAULT_STOP_SET;
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
static final Trie DEFAULT_TABLE;
|
static final Trie DEFAULT_TABLE;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
|
@ -100,7 +99,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param matchVersion lucene compatibility version
|
* @param matchVersion lucene compatibility version
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
*/
|
*/
|
||||||
public PolishAnalyzer(Version matchVersion, Set<?> stopwords) {
|
public PolishAnalyzer(Version matchVersion, CharArraySet stopwords) {
|
||||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -113,7 +112,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||||
* @param stopwords a stopword set
|
* @param stopwords a stopword set
|
||||||
* @param stemExclusionSet a set of terms not to be stemmed
|
* @param stemExclusionSet a set of terms not to be stemmed
|
||||||
*/
|
*/
|
||||||
public PolishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
public PolishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
super(matchVersion, stopwords);
|
super(matchVersion, stopwords);
|
||||||
this.stemTable = DefaultsHolder.DEFAULT_TABLE;
|
this.stemTable = DefaultsHolder.DEFAULT_TABLE;
|
||||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
|
||||||
|
|
|
@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pl;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
|
public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** This test fails with NPE when the
|
/** This test fails with NPE when the
|
||||||
|
@ -43,8 +42,7 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
Set<String> exclusionSet = new HashSet<String>();
|
CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("studenta"), false);;
|
||||||
exclusionSet.add("studenta");
|
|
||||||
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT,
|
Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT,
|
||||||
PolishAnalyzer.getDefaultStopSet(), exclusionSet);
|
PolishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTermReuse(a, "studenta", "studenta");
|
checkOneTermReuse(a, "studenta", "studenta");
|
||||||
|
|
|
@ -93,15 +93,14 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements
|
||||||
.getTokenFilterFactories();
|
.getTokenFilterFactories();
|
||||||
for (TokenFilterFactory factory : filterFactories) {
|
for (TokenFilterFactory factory : filterFactories) {
|
||||||
if (factory instanceof StopFilterFactory) {
|
if (factory instanceof StopFilterFactory) {
|
||||||
// StopFilterFactory holds the stop words in a CharArraySet, but
|
// StopFilterFactory holds the stop words in a CharArraySet
|
||||||
// the getStopWords() method returns a Set<?>, so we need to cast.
|
|
||||||
solrStopWords.put(fieldName,
|
solrStopWords.put(fieldName,
|
||||||
(CharArraySet) ((StopFilterFactory) factory).getStopWords());
|
((StopFilterFactory) factory).getStopWords());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (factory instanceof CommonGramsFilterFactory) {
|
if (factory instanceof CommonGramsFilterFactory) {
|
||||||
solrStopWords.put(fieldName,
|
solrStopWords.put(fieldName,
|
||||||
(CharArraySet) ((CommonGramsFilterFactory) factory)
|
((CommonGramsFilterFactory) factory)
|
||||||
.getCommonWords());
|
.getCommonWords());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,6 @@
|
||||||
package org.apache.solr.analysis;
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||||
|
@ -71,12 +70,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
|
||||||
return ignoreCase;
|
return ignoreCase;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<?> getCommonWords() {
|
public CharArraySet getCommonWords() {
|
||||||
return commonWords;
|
return commonWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
public CommonGramsFilter create(TokenStream input) {
|
public CommonGramsFilter create(TokenStream input) {
|
||||||
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase);
|
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
|
||||||
return commonGrams;
|
return commonGrams;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.solr.analysis;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||||
|
@ -80,7 +79,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
|
||||||
return ignoreCase;
|
return ignoreCase;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<?> getCommonWords() {
|
public CharArraySet getCommonWords() {
|
||||||
return commonWords;
|
return commonWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -88,8 +87,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
|
||||||
* Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
|
* Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
|
||||||
*/
|
*/
|
||||||
public CommonGramsQueryFilter create(TokenStream input) {
|
public CommonGramsQueryFilter create(TokenStream input) {
|
||||||
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords,
|
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
|
||||||
ignoreCase);
|
|
||||||
CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
|
CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
|
||||||
commonGrams);
|
commonGrams);
|
||||||
return commonGramsQuery;
|
return commonGramsQuery;
|
||||||
|
|
|
@ -25,7 +25,6 @@ import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -81,13 +80,13 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc
|
||||||
return ignoreCase;
|
return ignoreCase;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<?> getStopWords() {
|
public CharArraySet getStopWords() {
|
||||||
return stopWords;
|
return stopWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase);
|
StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords);
|
||||||
stopFilter.setEnablePositionIncrements(enablePositionIncrements);
|
stopFilter.setEnablePositionIncrements(enablePositionIncrements);
|
||||||
return stopFilter;
|
return stopFilter;
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,11 +20,11 @@ package org.apache.solr.analysis;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
import org.apache.solr.core.SolrResourceLoader;
|
import org.apache.solr.core.SolrResourceLoader;
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
|
||||||
args.put("ignoreCase", "true");
|
args.put("ignoreCase", "true");
|
||||||
factory.init(args);
|
factory.init(args);
|
||||||
factory.inform(loader);
|
factory.inform(loader);
|
||||||
Set<?> words = factory.getCommonWords();
|
CharArraySet words = factory.getCommonWords();
|
||||||
assertTrue("words is null and it shouldn't be", words != null);
|
assertTrue("words is null and it shouldn't be", words != null);
|
||||||
assertTrue("words Size: " + words.size() + " is not: " + 2,
|
assertTrue("words Size: " + words.size() + " is not: " + 2,
|
||||||
words.size() == 2);
|
words.size() == 2);
|
||||||
|
@ -89,7 +89,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
|
||||||
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||||
factory.init(args);
|
factory.init(args);
|
||||||
factory.inform(loader);
|
factory.inform(loader);
|
||||||
Set<?> words = factory.getCommonWords();
|
CharArraySet words = factory.getCommonWords();
|
||||||
assertTrue("words is null and it shouldn't be", words != null);
|
assertTrue("words is null and it shouldn't be", words != null);
|
||||||
assertTrue(words.contains("the"));
|
assertTrue(words.contains("the"));
|
||||||
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
|
||||||
|
|
|
@ -19,11 +19,11 @@ package org.apache.solr.analysis;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
import org.apache.solr.core.SolrResourceLoader;
|
import org.apache.solr.core.SolrResourceLoader;
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
|
||||||
args.put("ignoreCase", "true");
|
args.put("ignoreCase", "true");
|
||||||
factory.init(args);
|
factory.init(args);
|
||||||
factory.inform(loader);
|
factory.inform(loader);
|
||||||
Set<?> words = factory.getCommonWords();
|
CharArraySet words = factory.getCommonWords();
|
||||||
assertTrue("words is null and it shouldn't be", words != null);
|
assertTrue("words is null and it shouldn't be", words != null);
|
||||||
assertTrue("words Size: " + words.size() + " is not: " + 2,
|
assertTrue("words Size: " + words.size() + " is not: " + 2,
|
||||||
words.size() == 2);
|
words.size() == 2);
|
||||||
|
@ -88,7 +88,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
|
||||||
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
|
||||||
factory.init(args);
|
factory.init(args);
|
||||||
factory.inform(loader);
|
factory.inform(loader);
|
||||||
Set<?> words = factory.getCommonWords();
|
CharArraySet words = factory.getCommonWords();
|
||||||
assertTrue("words is null and it shouldn't be", words != null);
|
assertTrue("words is null and it shouldn't be", words != null);
|
||||||
assertTrue(words.contains("the"));
|
assertTrue(words.contains("the"));
|
||||||
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
|
||||||
|
|
|
@ -16,10 +16,10 @@ package org.apache.solr.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
import org.apache.solr.core.SolrResourceLoader;
|
import org.apache.solr.core.SolrResourceLoader;
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ public class TestKeepFilterFactory extends BaseTokenTestCase{
|
||||||
args.put("ignoreCase", "true");
|
args.put("ignoreCase", "true");
|
||||||
factory.init(args);
|
factory.init(args);
|
||||||
factory.inform(loader);
|
factory.inform(loader);
|
||||||
Set<?> words = factory.getWords();
|
CharArraySet words = factory.getWords();
|
||||||
assertTrue("words is null and it shouldn't be", words != null);
|
assertTrue("words is null and it shouldn't be", words != null);
|
||||||
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
||||||
|
|
||||||
|
|
|
@ -17,10 +17,10 @@ package org.apache.solr.analysis;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
import org.apache.solr.core.SolrResourceLoader;
|
import org.apache.solr.core.SolrResourceLoader;
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ public class TestStopFilterFactory extends BaseTokenTestCase {
|
||||||
args.put("ignoreCase", "true");
|
args.put("ignoreCase", "true");
|
||||||
factory.init(args);
|
factory.init(args);
|
||||||
factory.inform(loader);
|
factory.inform(loader);
|
||||||
Set<?> words = factory.getStopWords();
|
CharArraySet words = factory.getStopWords();
|
||||||
assertTrue("words is null and it shouldn't be", words != null);
|
assertTrue("words is null and it shouldn't be", words != null);
|
||||||
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
|
||||||
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
|
assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
|
||||||
|
|
Loading…
Reference in New Issue