From 72ae3171beffb53da4b25462e49dd34684d41ba7 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 9 Feb 2012 19:59:50 +0000 Subject: [PATCH] LUCENE-3765: Trappy behavior with StopFilter/ignoreCase git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1242497 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 5 ++ lucene/contrib/CHANGES.txt | 4 ++ modules/analysis/CHANGES.txt | 5 ++ .../lucene/analysis/ar/ArabicAnalyzer.java | 11 ++-- .../lucene/analysis/bg/BulgarianAnalyzer.java | 10 ++-- .../lucene/analysis/br/BrazilianAnalyzer.java | 14 ++--- .../lucene/analysis/ca/CatalanAnalyzer.java | 11 ++-- .../lucene/analysis/cjk/CJKAnalyzer.java | 8 +-- .../commongrams/CommonGramsFilter.java | 31 +--------- .../compound/CompoundWordTokenFilterBase.java | 23 ++------ .../DictionaryCompoundWordTokenFilter.java | 12 +--- .../HyphenationCompoundWordTokenFilter.java | 17 ++---- .../lucene/analysis/core/StopAnalyzer.java | 5 +- .../lucene/analysis/core/StopFilter.java | 45 +++----------- .../lucene/analysis/cz/CzechAnalyzer.java | 13 ++-- .../lucene/analysis/da/DanishAnalyzer.java | 11 ++-- .../lucene/analysis/de/GermanAnalyzer.java | 13 ++-- .../lucene/analysis/el/GreekAnalyzer.java | 8 +-- .../lucene/analysis/en/EnglishAnalyzer.java | 11 ++-- .../lucene/analysis/es/SpanishAnalyzer.java | 11 ++-- .../lucene/analysis/eu/BasqueAnalyzer.java | 11 ++-- .../lucene/analysis/fa/PersianAnalyzer.java | 8 +-- .../lucene/analysis/fi/FinnishAnalyzer.java | 11 ++-- .../lucene/analysis/fr/ElisionFilter.java | 4 +- .../lucene/analysis/fr/FrenchAnalyzer.java | 15 +++-- .../lucene/analysis/gl/GalicianAnalyzer.java | 11 ++-- .../lucene/analysis/hi/HindiAnalyzer.java | 11 ++-- .../lucene/analysis/hu/HungarianAnalyzer.java | 11 ++-- .../lucene/analysis/hy/ArmenianAnalyzer.java | 11 ++-- .../analysis/id/IndonesianAnalyzer.java | 11 ++-- .../lucene/analysis/it/ItalianAnalyzer.java | 11 ++-- .../lucene/analysis/lv/LatvianAnalyzer.java | 12 ++-- .../miscellaneous/KeywordMarkerFilter.java | 20 +------ .../miscellaneous/PatternAnalyzer.java | 9 ++- .../miscellaneous/StemmerOverrideFilter.java | 6 +- .../lucene/analysis/nl/DutchAnalyzer.java | 59 ++++++++++++------- .../lucene/analysis/no/NorwegianAnalyzer.java | 11 ++-- .../analysis/pt/PortugueseAnalyzer.java | 11 ++-- .../query/QueryAutoStopWordAnalyzer.java | 4 +- .../lucene/analysis/ro/RomanianAnalyzer.java | 11 ++-- .../lucene/analysis/ru/RussianAnalyzer.java | 13 ++-- .../analysis/snowball/SnowballAnalyzer.java | 5 +- .../analysis/standard/ClassicAnalyzer.java | 15 +---- .../analysis/standard/StandardAnalyzer.java | 15 +---- .../lucene/analysis/sv/SwedishAnalyzer.java | 11 ++-- .../lucene/analysis/th/ThaiAnalyzer.java | 8 +-- .../lucene/analysis/tr/TurkishAnalyzer.java | 11 ++-- .../analysis/util/StopwordAnalyzerBase.java | 5 +- .../analysis/ar/TestArabicAnalyzer.java | 9 +-- .../analysis/bg/TestBulgarianAnalyzer.java | 4 +- .../analysis/br/TestBrazilianStemmer.java | 3 +- .../analysis/ca/TestCatalanAnalyzer.java | 6 +- .../analysis/core/TestStopAnalyzer.java | 11 +--- .../lucene/analysis/core/TestStopFilter.java | 20 +++---- .../analysis/da/TestDanishAnalyzer.java | 6 +- .../analysis/de/TestGermanAnalyzer.java | 4 +- .../analysis/en/TestEnglishAnalyzer.java | 6 +- .../analysis/es/TestSpanishAnalyzer.java | 6 +- .../analysis/eu/TestBasqueAnalyzer.java | 6 +- .../analysis/fa/TestPersianAnalyzer.java | 4 +- .../analysis/fi/TestFinnishAnalyzer.java | 6 +- .../lucene/analysis/fr/TestElision.java | 7 +-- .../analysis/gl/TestGalicianAnalyzer.java | 6 +- 
.../lucene/analysis/hi/TestHindiAnalyzer.java | 7 +-- .../analysis/hu/TestHungarianAnalyzer.java | 6 +- .../analysis/hy/TestArmenianAnalyzer.java | 6 +- .../analysis/id/TestIndonesianAnalyzer.java | 6 +- .../analysis/it/TestItalianAnalyzer.java | 4 +- .../analysis/lv/TestLatvianAnalyzer.java | 6 +- .../TestKeywordMarkerFilter.java | 14 ++--- .../TestStemmerOverrideFilter.java | 5 +- .../lucene/analysis/nl/TestDutchStemmer.java | 21 ++++++- .../analysis/no/TestNorwegianAnalyzer.java | 6 +- .../analysis/pt/TestPortugueseAnalyzer.java | 6 +- .../analysis/ro/TestRomanianAnalyzer.java | 6 +- .../analysis/sv/TestSwedishAnalyzer.java | 7 +-- .../analysis/tr/TestTurkishAnalyzer.java | 6 +- .../analysis/kuromoji/KuromojiAnalyzer.java | 6 +- .../cn/smart/SmartChineseAnalyzer.java | 13 ++-- .../lucene/analysis/pl/PolishAnalyzer.java | 11 ++-- .../analysis/pl/TestPolishAnalyzer.java | 6 +- ...olrStopwordsCarrot2LexicalDataFactory.java | 7 +-- .../analysis/CommonGramsFilterFactory.java | 5 +- .../CommonGramsQueryFilterFactory.java | 6 +- .../solr/analysis/StopFilterFactory.java | 5 +- .../CommonGramsFilterFactoryTest.java | 6 +- .../CommonGramsQueryFilterFactoryTest.java | 6 +- .../solr/analysis/TestKeepFilterFactory.java | 4 +- .../solr/analysis/TestStopFilterFactory.java | 4 +- 89 files changed, 363 insertions(+), 535 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index eee038472a8..ef52eef2780 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -779,6 +779,11 @@ API Changes to be merged. To mimic the old behaviour, just use IndexReader.directory() for choosing the provider by Directory. (Uwe Schindler) +* LUCENE-3765: Deprecated StopFilter ctor that took ignoreCase, because + in some cases (if the set is a CharArraySet), the argument is ignored. + Deprecated StandardAnalyzer and ClassicAnalyzer ctors that take File, + please use the Reader ctor instead. (Robert Muir) + New Features * LUCENE-3593: Added a FieldValueFilter that accepts all documents that either diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 0ec6dca13f9..ecd49143647 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -218,6 +218,10 @@ Bug Fixes * LUCENE-3719: FVH: slow performance on very large queries. (Igor Motov via Koji Sekiguchi) + * LUCENE-3765: As of Version.LUCENE_36, DutchAnalyzer's two ctors + that take stopwords and stem exclusion tables also initialize + the default stem overrides (e.g. kind/kinder, fiets). (Robert Muir) + Documentation * LUCENE-3599: Javadocs for DistanceUtils.haversine() were incorrectly diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt index 7d4012f2926..9d46f1361d9 100644 --- a/modules/analysis/CHANGES.txt +++ b/modules/analysis/CHANGES.txt @@ -29,6 +29,11 @@ API Changes since they prevent reuse. Both Analyzers should be configured at instantiation. (Chris Male) + * LUCENE-3765: Stopset ctors that previously took Set or Map now take + CharArraySet and CharArrayMap respectively. Previously the behavior was confusing, + and sometimes different depending on the type of set, and ultimately a CharArraySet + or CharArrayMap was always used anyway. 
(Robert Muir) + New Features * LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java index 4549a17e023..a96c1ef814c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ar; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -63,7 +62,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -72,7 +71,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -85,7 +84,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase { } } - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. @@ -102,7 +101,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public ArabicAnalyzer(Version matchVersion, Set stopwords){ + public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){ this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -118,7 +117,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase { * @param stemExclusionSet * a set of terms not to be stemmed */ - public ArabicAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet){ + public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){ super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java index 6f0419ec65f..a6dbf705a9e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java @@ -56,7 +56,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase { * * @return an unmodifiable instance of the default stop-words set. 
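The CHANGES entries above describe the shape of the API change, and the ArabicAnalyzer hunk is representative: stopword and stem-exclusion sets are now typed as CharArraySet instead of raw Set, so case sensitivity is decided once, when the set is built. A minimal caller-side sketch of the new signatures; the word lists, the Version.LUCENE_36 matchVersion and the demo class name are illustrative assumptions, not part of the patch:

```java
import java.util.Arrays;

import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ArabicAnalyzerUsage {
  public static void main(String[] args) {
    Version matchVersion = Version.LUCENE_36;

    // Case sensitivity is a property of the set itself, not a flag on a filter.
    CharArraySet stopwords = new CharArraySet(
        matchVersion, Arrays.asList("foo", "bar"), /* ignoreCase */ true);
    CharArraySet stemExclusions = new CharArraySet(
        matchVersion, Arrays.asList("baz"), true);

    // The new ctors take CharArraySet directly; getDefaultStopSet() now returns one too.
    ArabicAnalyzer custom = new ArabicAnalyzer(matchVersion, stopwords, stemExclusions);
    ArabicAnalyzer defaults = new ArabicAnalyzer(matchVersion, ArabicAnalyzer.getDefaultStopSet());

    custom.close();
    defaults.close();
  }
}
```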
*/ - public static Set getDefaultStopSet() { + public static CharArraySet getDefaultStopSet() { return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -65,7 +65,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase { * class accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -78,7 +78,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase { } } - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** * Builds an analyzer with the default stop words: @@ -91,7 +91,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase { /** * Builds an analyzer with the given stop words. */ - public BulgarianAnalyzer(Version matchVersion, Set stopwords) { + public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -100,7 +100,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase { * If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter} * before {@link BulgarianStemFilter}. */ - public BulgarianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 23ed34b04f0..3e2e23b44b8 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -19,8 +19,6 @@ package org.apache.lucene.analysis.br; import java.io.IOException; import java.io.Reader; -import java.util.Collections; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -56,12 +54,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -79,7 +77,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase { /** * Contains words that should be indexed but not stemmed. */ - private Set excltable = Collections.emptySet(); + private CharArraySet excltable = CharArraySet.EMPTY_SET; /** * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}). 
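Two related details recur throughout the patch and are visible in the Bulgarian and Brazilian hunks above: the stem-exclusion default moves from Collections.emptySet() to CharArraySet.EMPTY_SET, and caller-supplied sets are snapshotted via CharArraySet.copy(...) wrapped in CharArraySet.unmodifiableSet(...). A small sketch of that pattern in isolation, with an assumed matchVersion and toy words (the Dutch examples echo the contrib CHANGES entry but are only illustrative):

```java
import java.util.Arrays;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class StemExclusionSnapshot {
  public static void main(String[] args) {
    Version matchVersion = Version.LUCENE_36;

    // A caller-supplied set; case-insensitive because the set says so.
    CharArraySet supplied = new CharArraySet(
        matchVersion, Arrays.asList("fiets", "kinder"), true);

    // The analyzers copy and then wrap, so later mutation of 'supplied'
    // cannot leak into the analyzer's internal state.
    CharArraySet snapshot = CharArraySet.unmodifiableSet(
        CharArraySet.copy(matchVersion, supplied));

    // Neutral default when no exclusions are wanted.
    CharArraySet none = CharArraySet.EMPTY_SET;

    System.out.println(snapshot.contains("FIETS") + " " + none.isEmpty());
  }
}
```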
@@ -96,7 +94,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public BrazilianAnalyzer(Version matchVersion, Set stopwords) { + public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) { super(matchVersion, stopwords); } @@ -108,8 +106,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public BrazilianAnalyzer(Version matchVersion, Set stopwords, - Set stemExclusionSet) { + public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords, + CharArraySet stemExclusionSet) { this(matchVersion, stopwords); excltable = CharArraySet.unmodifiableSet(CharArraySet .copy(matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java index 4f95cad55f7..294f18ebb40 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ca; import java.io.IOException; import java.io.Reader; import java.util.Arrays; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -49,7 +48,7 @@ import org.tartarus.snowball.ext.CatalanStemmer; * */ public final class CatalanAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Catalan stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -64,7 +63,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -73,7 +72,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -100,7 +99,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public CatalanAnalyzer(Version matchVersion, Set stopwords) { + public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -113,7 +112,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public CatalanAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java index 0e64b5d29e5..87aa5c82ea4 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cjk; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -27,6 +26,7 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; @@ -49,12 +49,12 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -82,7 +82,7 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public CJKAnalyzer(Version matchVersion, Set stopwords){ + public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){ super(matchVersion, stopwords); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java index 283940627a0..8232b88c2bf 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java @@ -10,7 +10,6 @@ package org.apache.lucene.analysis.commongrams; import java.io.IOException; -import java.util.Set; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -69,35 +68,9 @@ public final class CommonGramsFilter extends TokenFilter { * @param input TokenStream input in filter chain * @param commonWords The set of common words. */ - public CommonGramsFilter(Version matchVersion, TokenStream input, Set commonWords) { - this(matchVersion, input, commonWords, false); - } - - /** - * Construct a token stream filtering the given input using a Set of common - * words to create bigrams, case-sensitive if ignoreCase is false (unless Set - * is CharArraySet). If commonWords is an instance of - * {@link CharArraySet} (true if makeCommonSet() was used to - * construct the set) it will be directly used and ignoreCase - * will be ignored since CharArraySet directly controls case - * sensitivity. - *
- * If commonWords is not an instance of {@link CharArraySet}, a - * new CharArraySet will be constructed and ignoreCase will be - * used to specify the case sensitivity of that set. - * - * @param input TokenStream input in filter chain. - * @param commonWords The set of common words. - * @param ignoreCase -Ignore case when constructing bigrams for common words. - */ - public CommonGramsFilter(Version matchVersion, TokenStream input, Set commonWords, boolean ignoreCase) { + public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) { super(input); - if (commonWords instanceof CharArraySet) { - this.commonWords = (CharArraySet) commonWords; - } else { - this.commonWords = new CharArraySet(matchVersion, commonWords.size(), ignoreCase); - this.commonWords.addAll(commonWords); - } + this.commonWords = commonWords; } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java index 756221baf9f..3b3fae9ca76 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -18,10 +18,7 @@ package org.apache.lucene.analysis.compound; */ import java.io.IOException; -import java.util.Arrays; -import java.util.Collection; import java.util.LinkedList; -import java.util.Locale; import java.util.Set; import org.apache.lucene.analysis.TokenFilter; @@ -43,13 +40,6 @@ import org.apache.lucene.util.Version; * supplementary characters in strings and char arrays provided as compound word * dictionaries. * - *
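With the ignoreCase ctor removed, CommonGramsFilter now uses the supplied CharArraySet as-is, so any case folding has to be baked into the set (or handled by a LowerCaseFilter earlier in the chain). A hedged consumption sketch; the tokenizer choice, common-word list and sample text are assumptions for illustration:

```java
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CommonGramsSketch {
  public static void main(String[] args) throws IOException {
    Version matchVersion = Version.LUCENE_36;
    CharArraySet commonWords = new CharArraySet(
        matchVersion, Arrays.asList("the", "of"), /* ignoreCase */ true);

    TokenStream ts = new WhitespaceTokenizer(matchVersion,
        new StringReader("the quick fox of nowhere"));
    ts = new CommonGramsFilter(matchVersion, ts, commonWords);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Emits the original unigrams plus bigrams such as "the_quick" and "fox_of".
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}
```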
If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary, - * it should be case-insensitive unless it contains only lowercased entries and you - * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain. - * For optional performance (as this filter does lots of lookups to the dictionary, - * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary - * {@link Set Sets} to the ctors, they will be automatically - * transformed to case-insensitive! */ public abstract class CompoundWordTokenFilterBase extends TokenFilter { /** @@ -80,15 +70,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { private AttributeSource.State current; - protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, boolean onlyLongestMatch) { + protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) { this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); } - protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary) { + protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary) { this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); } - protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { + protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { super(input); this.tokens=new LinkedList(); @@ -96,12 +86,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { this.minSubwordSize=minSubwordSize; this.maxSubwordSize=maxSubwordSize; this.onlyLongestMatch=onlyLongestMatch; - - if (dictionary==null || dictionary instanceof CharArraySet) { - this.dictionary = (CharArraySet) dictionary; - } else { - this.dictionary = new CharArraySet(matchVersion, dictionary, true); - } + this.dictionary = dictionary; } @Override diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index da85b1baa9e..4d914f57eea 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -22,6 +22,7 @@ import java.util.Set; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; /** @@ -38,13 +39,6 @@ import org.apache.lucene.util.Version; * supplementary characters in strings and char arrays provided as compound word * dictionaries. * - *
If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary, - * it should be case-insensitive unless it contains only lowercased entries and you - * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain. - * For optional performance (as this filter does lots of lookups to the dictionary, - * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary - * {@link Set Sets} to the ctors, they will be automatically - * transformed to case-insensitive! */ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase { @@ -61,7 +55,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa * @param dictionary * the word dictionary to match against. */ - public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) { + public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) { super(matchVersion, input, dictionary); } @@ -86,7 +80,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa * @param onlyLongestMatch * Add only the longest matching subword to the stream */ - public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary, + public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java index 234dc338dc2..935c607c3de 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java @@ -18,12 +18,12 @@ package org.apache.lucene.analysis.compound; */ import java.io.File; -import java.util.Set; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.compound.hyphenation.Hyphenation; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; import org.xml.sax.InputSource; @@ -41,13 +41,6 @@ import org.xml.sax.InputSource; * supplementary characters in strings and char arrays provided as compound word * dictionaries. * - *
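Because CompoundWordTokenFilterBase no longer converts arbitrary Sets into a lowercased CharArraySet behind the caller's back, a compound-word dictionary should now be built case-insensitive (or pre-lowercased) explicitly. A sketch against DictionaryCompoundWordTokenFilter with a toy dictionary and sample token; the concrete words, tokenizer and matchVersion are assumptions:

```java
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CompoundSplitSketch {
  public static void main(String[] args) throws IOException {
    Version matchVersion = Version.LUCENE_36;

    // ignoreCase=true now has to be stated here; the filter no longer rewraps the set.
    CharArraySet dictionary = new CharArraySet(
        matchVersion, Arrays.asList("fuss", "ball", "sport"), true);

    TokenStream ts = new WhitespaceTokenizer(matchVersion, new StringReader("Fussballsport"));
    ts = new LowerCaseFilter(matchVersion, ts);
    ts = new DictionaryCompoundWordTokenFilter(matchVersion, ts, dictionary);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // The original token is kept and dictionary subwords are added at the same position.
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}
```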
If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary, - * it should be case-insensitive unless it contains only lowercased entries and you - * have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain. - * For optional performance (as this filter does lots of lookups to the dictionary, - * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary - * {@link Set Sets} to the ctors, they will be automatically - * transformed to case-insensitive! */ public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase { @@ -69,7 +62,7 @@ public class HyphenationCompoundWordTokenFilter extends * the word dictionary to match against. */ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, - HyphenationTree hyphenator, Set dictionary) { + HyphenationTree hyphenator, CharArraySet dictionary) { this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false); } @@ -98,7 +91,7 @@ public class HyphenationCompoundWordTokenFilter extends * Add only the longest matching subword to the stream */ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, - HyphenationTree hyphenator, Set dictionary, int minWordSize, + HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); @@ -109,14 +102,14 @@ public class HyphenationCompoundWordTokenFilter extends /** * Create a HyphenationCompoundWordTokenFilter with no dictionary. *
- * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean) + * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean) * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, * null, minWordSize, minSubwordSize, maxSubwordSize } */ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input, HyphenationTree hyphenator, int minWordSize, int minSubwordSize, int maxSubwordSize) { - this(matchVersion, input, hyphenator, (Set) null, minWordSize, minSubwordSize, + this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize, maxSubwordSize, false); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java index d0fedb2e21e..c4f36aad070 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java @@ -21,7 +21,6 @@ import java.io.File; import java.io.IOException; import java.io.Reader; import java.util.Arrays; -import java.util.Set; import java.util.List; import org.apache.lucene.analysis.Tokenizer; @@ -46,7 +45,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { /** An unmodifiable set containing some common English words that are not usually useful for searching.*/ - public static final Set ENGLISH_STOP_WORDS_SET; + public static final CharArraySet ENGLISH_STOP_WORDS_SET; static { final List stopWords = Arrays.asList( @@ -72,7 +71,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { /** Builds an analyzer with the stop words from the given set. * @param matchVersion See above * @param stopWords Set of stop words */ - public StopAnalyzer(Version matchVersion, Set stopWords) { + public StopAnalyzer(Version matchVersion, CharArraySet stopWords) { super(matchVersion, stopWords); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java index c36dcbe4c63..f27fc010619 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core; import java.io.IOException; import java.util.Arrays; import java.util.List; -import java.util.Set; import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -44,34 +43,6 @@ public final class StopFilter extends FilteringTokenFilter { private final CharArraySet stopWords; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - - /** - * Construct a token stream filtering the given input. If - * stopWords is an instance of {@link CharArraySet} (true if - * makeStopSet() was used to construct the set) it will be - * directly used and ignoreCase will be ignored since - * CharArraySet directly controls case sensitivity. - *
- * If stopWords is not an instance of {@link CharArraySet}, a new - * CharArraySet will be constructed and ignoreCase will be used - * to specify the case sensitivity of that set. - * - * @param matchVersion - * Lucene version to enable correct Unicode 4.0 behavior in the stop - * set if Version > 3.0. See above for details. - * @param input - * Input TokenStream - * @param stopWords - * A Set of Strings or char[] or any other toString()-able set - * representing the stopwords - * @param ignoreCase - * if true, all words are lower cased first - */ - public StopFilter(Version matchVersion, TokenStream input, Set stopWords, boolean ignoreCase) - { - super(true, input); - this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase); - } /** * Constructs a filter which removes words from the input TokenStream that are @@ -83,12 +54,12 @@ public final class StopFilter extends FilteringTokenFilter { * @param in * Input stream * @param stopWords - * A Set of Strings or char[] or any other toString()-able set - * representing the stopwords + * A {@link CharArraySet} representing the stopwords. * @see #makeStopSet(Version, java.lang.String...) */ - public StopFilter(Version matchVersion, TokenStream in, Set stopWords) { - this(matchVersion, in, stopWords, false); + public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) { + super(true, in); + this.stopWords = stopWords; } /** @@ -101,7 +72,7 @@ public final class StopFilter extends FilteringTokenFilter { * @param stopWords An array of stopwords * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase */ - public static Set makeStopSet(Version matchVersion, String... stopWords) { + public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) { return makeStopSet(matchVersion, stopWords, false); } @@ -116,7 +87,7 @@ public final class StopFilter extends FilteringTokenFilter { * @return A Set ({@link CharArraySet}) containing the words * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase */ - public static Set makeStopSet(Version matchVersion, List stopWords) { + public static CharArraySet makeStopSet(Version matchVersion, List stopWords) { return makeStopSet(matchVersion, stopWords, false); } @@ -128,7 +99,7 @@ public final class StopFilter extends FilteringTokenFilter { * @param ignoreCase If true, all words are lower cased first. 
* @return a Set containing the words */ - public static Set makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) { + public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) { CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase); stopSet.addAll(Arrays.asList(stopWords)); return stopSet; @@ -141,7 +112,7 @@ public final class StopFilter extends FilteringTokenFilter { * @param ignoreCase if true, all words are lower cased first * @return A Set ({@link CharArraySet}) containing the words */ - public static Set makeStopSet(Version matchVersion, List stopWords, boolean ignoreCase){ + public static CharArraySet makeStopSet(Version matchVersion, List stopWords, boolean ignoreCase){ CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase); stopSet.addAll(stopWords); return stopSet; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java index ba845ff1609..b0d9c8039c9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java @@ -32,7 +32,6 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.*; -import java.util.Set; /** * {@link Analyzer} for Czech language. @@ -62,12 +61,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { * * @return a set of default Czech-stopwords */ - public static final Set getDefaultStopSet(){ + public static final CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_SET; } private static class DefaultSetHolder { - private static final Set DEFAULT_SET; + private static final CharArraySet DEFAULT_SET; static { try { @@ -82,7 +81,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { } - private final Set stemExclusionTable; + private final CharArraySet stemExclusionTable; /** * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}). @@ -101,7 +100,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { * {@link above} * @param stopwords a stopword set */ - public CzechAnalyzer(Version matchVersion, Set stopwords) { + public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -114,7 +113,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionTable a stemming exclusion set */ - public CzechAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionTable) { + public CzechAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) { super(matchVersion, stopwords); this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); } @@ -129,7 +128,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} * , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If * a version is >= LUCENE_31 and a stem exclusion set is provided via - * {@link #CzechAnalyzer(Version, Set, Set)} a + * {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a * {@link KeywordMarkerFilter} is added before * {@link CzechStemFilter}. 
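The StopFilter hunks above are the heart of LUCENE-3765: the ctor that took a boolean ignoreCase is gone (the flag was silently ignored whenever the set was already a CharArraySet), and makeStopSet(...) now returns CharArraySet. A migration sketch, assuming Version.LUCENE_36 and an arbitrary tokenizer and word list:

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class StopFilterMigration {
  public static void main(String[] args) throws IOException {
    Version matchVersion = Version.LUCENE_36;

    // Before: new StopFilter(matchVersion, input, stopWords, true)
    // After: case handling lives in the set, built up front.
    CharArraySet stopWords = StopFilter.makeStopSet(
        matchVersion, new String[] {"the", "a", "an"}, /* ignoreCase */ true);

    // StopAnalyzer.ENGLISH_STOP_WORDS_SET is now typed as CharArraySet as well,
    // so it can be passed straight through without any conversion.
    System.out.println("english defaults: " + StopAnalyzer.ENGLISH_STOP_WORDS_SET.size());

    TokenStream ts = new WhitespaceTokenizer(matchVersion, new StringReader("The quick fox"));
    ts = new StopFilter(matchVersion, ts, stopWords);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // "quick", "fox" -- "The" matches case-insensitively
    }
    ts.end();
    ts.close();
  }
}
```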
*/ diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java index c94676a5196..0006e58f1bc 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.da; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.DanishStemmer; * {@link Analyzer} for Danish. */ public final class DanishAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Danish stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt"; @@ -50,7 +49,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -59,7 +58,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -86,7 +85,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public DanishAnalyzer(Version matchVersion, Set stopwords) { + public DanishAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -99,7 +98,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public DanishAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 3d2dd4d9ef5..d64c93ba44d 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -21,7 +21,6 @@ package org.apache.lucene.analysis.de; import java.io.IOException; import java.io.Reader; import java.util.Arrays; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -90,16 +89,16 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase { * Returns a set of default German-stopwords * @return a set of default German-stopwords */ - public static final Set getDefaultStopSet(){ + public static final CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_SET; } private static class DefaultSetHolder { /** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */ @Deprecated - 
private static final Set DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet( + private static final CharArraySet DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet( Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false)); - private static final Set DEFAULT_SET; + private static final CharArraySet DEFAULT_SET; static { try { DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, @@ -119,7 +118,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase { /** * Contains words that should be indexed but not stemmed. */ - private final Set exclusionSet; + private final CharArraySet exclusionSet; /** * Builds an analyzer with the default stop words: @@ -139,7 +138,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public GermanAnalyzer(Version matchVersion, Set stopwords) { + public GermanAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -153,7 +152,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase { * @param stemExclusionSet * a stemming exclusion set */ - public GermanAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet)); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java index 8cbd82931e0..3b1b5acf965 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -27,6 +26,7 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; @@ -58,12 +58,12 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase { * Returns a set of default Greek-stopwords * @return a set of default Greek-stopwords */ - public static final Set getDefaultStopSet(){ + public static final CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_SET; } private static class DefaultSetHolder { - private static final Set DEFAULT_SET; + private static final CharArraySet DEFAULT_SET; static { try { @@ -95,7 +95,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase { * See above * @param stopwords a stopword set */ - public GreekAnalyzer(Version matchVersion, Set stopwords) { + public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) { super(matchVersion, stopwords); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java index 6e71e40af86..2d94e9fbec2 100644 --- 
a/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.en; */ import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -37,13 +36,13 @@ import org.apache.lucene.util.Version; * {@link Analyzer} for English. */ public final class EnglishAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -52,7 +51,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET; + static final CharArraySet DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET; } /** @@ -68,7 +67,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public EnglishAnalyzer(Version matchVersion, Set stopwords) { + public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -81,7 +80,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public EnglishAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java index 42f10e5d44c..b4eb2e0127d 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.es; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.SpanishStemmer; * */ public final class SpanishAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Spanish stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt"; @@ -57,7 +56,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -66,7 +65,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -93,7 +92,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public SpanishAnalyzer(Version matchVersion, Set stopwords) { + public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -106,7 +105,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public SpanishAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java index 9ed380823fa..4fde234a7b1 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.eu; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.BasqueStemmer; * {@link Analyzer} for Basque. */ public final class BasqueAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Basque stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -48,7 +47,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -57,7 +56,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -84,7 +83,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public BasqueAnalyzer(Version matchVersion, Set stopwords) { + public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -97,7 +96,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public BasqueAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java index efdbd2e1010..2b47f68cfc9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fa; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharReader; @@ -30,6 +29,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; @@ -63,7 +63,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -72,7 +72,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -101,7 +101,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public PersianAnalyzer(Version matchVersion, Set stopwords){ + public PersianAnalyzer(Version matchVersion, CharArraySet stopwords){ super(matchVersion, stopwords); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java index caf59278a3f..3cd17773219 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fi; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.FinnishStemmer; * {@link Analyzer} for Finnish. */ public final class FinnishAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Italian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt"; @@ -50,7 +49,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -59,7 +58,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -86,7 +85,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public FinnishAnalyzer(Version matchVersion, Set stopwords) { + public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -99,7 +98,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public FinnishAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java index 507a114336a..99c53c6d879 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java @@ -19,11 +19,9 @@ package org.apache.lucene.analysis.fr; import java.io.IOException; import java.util.Arrays; -import java.util.Set; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; @@ -56,7 +54,7 @@ public final class ElisionFilter extends TokenFilter { * @param input the source {@link TokenStream} * @param articles a set of stopword articles */ - public ElisionFilter(Version matchVersion, TokenStream input, Set articles) { + public ElisionFilter(Version matchVersion, TokenStream input, CharArraySet articles) { super(input); this.articles = CharArraySet.unmodifiableSet( new CharArraySet(matchVersion, articles, true)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java index 2c89881f785..1f8b47b3a85 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java @@ -36,7 +36,6 @@ import org.apache.lucene.util.Version; import java.io.IOException; import java.io.Reader; import java.util.Arrays; -import java.util.Set; /** * {@link Analyzer} for French language. @@ -101,23 +100,23 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { /** * Contains words that should be indexed but not stemmed. */ - private final Set excltable; + private final CharArraySet excltable; /** * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. 
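ElisionFilter's ctor also moves to CharArraySet for its article set; as the hunk above shows, it still wraps the argument in a case-insensitive unmodifiable copy internally. A short sketch in which the French article list, input text and tokenizer are placeholder assumptions:

```java
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ElisionSketch {
  public static void main(String[] args) throws IOException {
    Version matchVersion = Version.LUCENE_36;
    CharArraySet articles = new CharArraySet(
        matchVersion, Arrays.asList("l", "d", "j"), true);

    TokenStream ts = new WhitespaceTokenizer(matchVersion, new StringReader("l'avion d'été"));
    ts = new ElisionFilter(matchVersion, ts, articles);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // "avion", "été" -- the elided articles are stripped
    }
    ts.end();
    ts.close();
  }
}
```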
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } private static class DefaultSetHolder { /** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */ @Deprecated - static final Set DEFAULT_STOP_SET_30 = CharArraySet + static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS), false)); - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, @@ -147,7 +146,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public FrenchAnalyzer(Version matchVersion, Set stopwords){ + public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords){ this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -161,8 +160,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { * @param stemExclutionSet * a stemming exclusion set */ - public FrenchAnalyzer(Version matchVersion, Set stopwords, - Set stemExclutionSet) { + public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords, + CharArraySet stemExclutionSet) { super(matchVersion, stopwords); this.excltable = CharArraySet.unmodifiableSet(CharArraySet .copy(matchVersion, stemExclutionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java index 7ce43f1bf15..c8c80828389 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.gl; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -39,7 +38,7 @@ import org.apache.lucene.util.Version; * {@link Analyzer} for Galician. */ public final class GalicianAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Galician stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -48,7 +47,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -57,7 +56,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -84,7 +83,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public GalicianAnalyzer(Version matchVersion, Set stopwords) { + public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -97,7 +96,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public GalicianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java index 0d3d72ee645..a4d616cdf78 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hi; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; @@ -44,7 +43,7 @@ import org.apache.lucene.util.Version; * */ public final class HindiAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** * File containing default Hindi stopwords. @@ -59,7 +58,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -68,7 +67,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -88,7 +87,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a stemming exclusion set */ - public HindiAnalyzer(Version version, Set stopwords, Set stemExclusionSet) { + public HindiAnalyzer(Version version, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(version, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet( CharArraySet.copy(matchVersion, stemExclusionSet)); @@ -100,7 +99,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase { * @param version lucene compatibility version * @param stopwords a stopword set */ - public HindiAnalyzer(Version version, Set stopwords) { + public HindiAnalyzer(Version version, CharArraySet stopwords) { this(version, stopwords, CharArraySet.EMPTY_SET); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java index a9270097d17..da9686e4459 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hu; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.HungarianStemmer; * {@link Analyzer} for Hungarian. */ public final class HungarianAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Hungarian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt"; @@ -50,7 +49,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -59,7 +58,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -86,7 +85,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public HungarianAnalyzer(Version matchVersion, Set stopwords) { + public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -99,7 +98,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public HungarianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java index 76983deeba5..917712267de 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.hy; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.ArmenianStemmer; * {@link Analyzer} for Armenian. */ public final class ArmenianAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Armenian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -48,7 +47,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -57,7 +56,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -84,7 +83,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public ArmenianAnalyzer(Version matchVersion, Set stopwords) { + public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -97,7 +96,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public ArmenianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java index dfea4042b4b..b3861d24b53 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.id; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; @@ -43,7 +42,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -52,7 +51,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -65,7 +64,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase { } } - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. 
@@ -82,7 +81,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase { * @param stopwords * a stopword set */ - public IndonesianAnalyzer(Version matchVersion, Set stopwords){ + public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords){ this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -98,7 +97,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase { * @param stemExclusionSet * a set of terms not to be stemmed */ - public IndonesianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet){ + public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){ super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java index 5e94049f595..f5d2ef9f5e2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.it; import java.io.IOException; import java.io.Reader; import java.util.Arrays; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -52,7 +51,7 @@ import org.tartarus.snowball.ext.ItalianStemmer; * */ public final class ItalianAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Italian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt"; @@ -68,7 +67,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
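The paired two- and three-argument constructors repeated across these hunks share one convention: the extra CharArraySet is a stem exclusion set whose entries are marked as keywords and skipped by the stemmer. A sketch under the assumption that ItalianAnalyzer keeps the wiring shown here; the sample term is illustrative:

import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class StemExclusionSketch {
  public static void main(String[] args) {
    // Terms in the exclusion set keep their surface form; everything else
    // still goes through the stemmer. The default stopword set is reused.
    CharArraySet noStem = new CharArraySet(Version.LUCENE_36, 1, false);
    noStem.add("tecnologia");
    ItalianAnalyzer analyzer = new ItalianAnalyzer(Version.LUCENE_36,
        ItalianAnalyzer.getDefaultStopSet(), noStem);
  }
}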
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -77,7 +76,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -104,7 +103,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public ItalianAnalyzer(Version matchVersion, Set stopwords) { + public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -117,7 +116,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public ItalianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java index 370e706bd5a..1c2ab24352c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.lv; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -27,7 +26,6 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; @@ -40,7 +38,7 @@ import org.apache.lucene.util.Version; * {@link Analyzer} for Latvian. */ public final class LatvianAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Latvian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -49,7 +47,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -58,7 +56,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -85,7 +83,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public LatvianAnalyzer(Version matchVersion, Set stopwords) { + public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -98,7 +96,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public LatvianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java index 7a55e32c53f..96be25238a2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java @@ -18,14 +18,12 @@ package org.apache.lucene.analysis.miscellaneous; */ import java.io.IOException; -import java.util.Set; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.util.Version; /** * Marks terms as keywords via the {@link KeywordAttribute}. Each token @@ -50,27 +48,11 @@ public final class KeywordMarkerFilter extends TokenFilter { * @param keywordSet * the keywords set to lookup the current termbuffer */ - public KeywordMarkerFilter(final TokenStream in, - final CharArraySet keywordSet) { + public KeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) { super(in); this.keywordSet = keywordSet; } - /** - * Create a new KeywordMarkerFilter, that marks the current token as a - * keyword if the tokens term buffer is contained in the given set via the - * {@link KeywordAttribute}. - * - * @param in - * TokenStream to filter - * @param keywordSet - * the keywords set to lookup the current termbuffer - */ - public KeywordMarkerFilter(final TokenStream in, final Set keywordSet) { - this(in, keywordSet instanceof CharArraySet ? 
(CharArraySet) keywordSet - : CharArraySet.copy(Version.LUCENE_31, keywordSet)); - } - @Override public final boolean incrementToken() throws IOException { if (input.incrementToken()) { diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java index 6c78e844166..a3d2a11694e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java @@ -22,7 +22,6 @@ import java.io.Reader; import java.io.StringReader; import java.util.Arrays; import java.util.Locale; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -139,7 +138,7 @@ public final class PatternAnalyzer extends Analyzer { private final Pattern pattern; private final boolean toLowerCase; - private final Set stopWords; + private final CharArraySet stopWords; private final Version matchVersion; @@ -162,7 +161,7 @@ public final class PatternAnalyzer extends Analyzer { * or other stop words * lists . */ - public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set stopWords) { + public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, CharArraySet stopWords) { if (pattern == null) throw new IllegalArgumentException("pattern must not be null"); @@ -404,12 +403,12 @@ public final class PatternAnalyzer extends Analyzer { private int pos; private final boolean isLetter; private final boolean toLowerCase; - private final Set stopWords; + private final CharArraySet stopWords; private static final Locale locale = Locale.getDefault(); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set stopWords) { + public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, CharArraySet stopWords) { super(input); this.str = str; this.isLetter = isLetter; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java index 65c0e160552..11250365bdf 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous; */ import java.io.IOException; -import java.util.Map; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -46,10 +45,9 @@ public final class StemmerOverrideFilter extends TokenFilter { *

*/ public StemmerOverrideFilter(Version matchVersion, TokenStream input, - Map dictionary) { + CharArrayMap dictionary) { super(input); - this.dictionary = dictionary instanceof CharArrayMap ? - (CharArrayMap) dictionary : CharArrayMap.copy(matchVersion, dictionary); + this.dictionary = CharArrayMap.copy(matchVersion, dictionary); } @Override diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java index 312242f196a..1780c545914 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java @@ -28,18 +28,14 @@ import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc +import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; -import java.io.File; import java.io.IOException; import java.io.Reader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Set; -import java.util.Map; /** * {@link Analyzer} for Dutch language. @@ -56,6 +52,9 @@ import java.util.Map; *

You must specify the required {@link Version} * compatibility when creating DutchAnalyzer: *

    + *
  • As of 3.6, {@link #DutchAnalyzer(Version, CharArraySet)} and + * {@link #DutchAnalyzer(Version, CharArraySet, CharArraySet)} also populate + * the default entries for the stem override dictionary *
  • As of 3.1, Snowball stemming is done with SnowballFilter, * LowerCaseFilter is used prior to StopFilter, and Snowball * stopwords are used by default. @@ -75,13 +74,13 @@ public final class DutchAnalyzer extends Analyzer { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; - + static final CharArraySet DEFAULT_STOP_SET; + static final CharArrayMap DEFAULT_STEM_DICT; static { try { DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, @@ -91,6 +90,12 @@ public final class DutchAnalyzer extends Analyzer { // distribution (JAR) throw new RuntimeException("Unable to load default stopword set"); } + + DEFAULT_STEM_DICT = new CharArrayMap(Version.LUCENE_CURRENT, 4, false); + DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet + DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet + DEFAULT_STEM_DICT.put("ei", "eier"); + DEFAULT_STEM_DICT.put("kind", "kinder"); } } @@ -98,14 +103,14 @@ /** * Contains the stopwords used with the StopFilter. */ - private final Set stoptable; + private final CharArraySet stoptable; /** * Contains words that should be indexed but not stemmed. */ - private Set excltable = Collections.emptySet(); + private CharArraySet excltable = CharArraySet.EMPTY_SET; - private final Map stemdict = new HashMap(); + private final CharArrayMap stemdict; private final Version matchVersion; /** * @@ -114,21 +119,33 @@ */ public DutchAnalyzer(Version matchVersion) { - this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); - stemdict.put("fiets", "fiets"); //otherwise fiet - stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet - stemdict.put("ei", "eier"); - stemdict.put("kind", "kinder"); + // historically, only this ctor populated the stem dict!!!!! + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT); } - public DutchAnalyzer(Version matchVersion, Set stopwords){ - this(matchVersion, stopwords, CharArraySet.EMPTY_SET); + public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){ + // historically, this ctor never populated the stem dict!!!!! + // so we populate it only for >= 3.6 + this(matchVersion, stopwords, CharArraySet.EMPTY_SET, + matchVersion.onOrAfter(Version.LUCENE_36) + ? DefaultSetHolder.DEFAULT_STEM_DICT + : CharArrayMap.emptyMap()); } - public DutchAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionTable){ - stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); - excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); + public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){ + // historically, this ctor never populated the stem dict!!!!! + // so we populate it only for >= 3.6 + this(matchVersion, stopwords, stemExclusionTable, + matchVersion.onOrAfter(Version.LUCENE_36) + ?
DefaultSetHolder.DEFAULT_STEM_DICT + : CharArrayMap.emptyMap()); + } + + public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap stemOverrideDict) { this.matchVersion = matchVersion; + this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); + this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); + this.stemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java index 00403f1f720..e08173571a6 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.no; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.NorwegianStemmer; * {@link Analyzer} for Norwegian. */ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Norwegian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt"; @@ -50,7 +49,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -59,7 +58,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -86,7 +85,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public NorwegianAnalyzer(Version matchVersion, Set stopwords) { + public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -99,7 +98,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public NorwegianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java index fd20bf93647..554d2272b43 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pt; import java.io.IOException; import 
java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -48,7 +47,7 @@ import org.tartarus.snowball.ext.PortugueseStemmer; *
*/ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Portuguese stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt"; @@ -57,7 +56,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -66,7 +65,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -93,7 +92,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public PortugueseAnalyzer(Version matchVersion, Set stopwords) { + public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -106,7 +105,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public PortugueseAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java index 7f4464f65ec..8b70d128906 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java @@ -22,6 +22,7 @@ import java.util.*; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; @@ -179,7 +180,8 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper { if (stopWords == null) { return components; } - StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(), stopWords); + StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(), + new CharArraySet(matchVersion, stopWords, false)); return new TokenStreamComponents(components.getTokenizer(), stopFilter); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java index 1d3c40d6dbe..36c7e3d29da 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ro; import java.io.IOException; import java.io.Reader; 
-import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -39,7 +38,7 @@ import org.tartarus.snowball.ext.RomanianStemmer; * {@link Analyzer} for Romanian. */ public final class RomanianAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Romanian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -53,7 +52,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -62,7 +61,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -89,7 +88,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public RomanianAnalyzer(Version matchVersion, Set stopwords) { + public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -102,7 +101,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public RomanianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java index 247bdf636e9..0aa13092bd1 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ru; import java.io.IOException; import java.io.Reader; import java.util.Arrays; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.snowball.SnowballFilter; @@ -78,10 +77,10 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase private static class DefaultSetHolder { /** @deprecated (3.1) remove this for Lucene 5.0 */ @Deprecated - static final Set DEFAULT_STOP_SET_30 = CharArraySet + static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(RUSSIAN_STOP_WORDS_30), false)); - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -95,14 +94,14 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase } } - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** * Returns an unmodifiable instance of the default stop-words set. * * @return an unmodifiable instance of the default stop-words set. 
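Returning to the DutchAnalyzer hunk above: the stem override dictionary is now a CharArrayMap, and callers can supply their own through the new four-argument constructor, while the older constructors pick up the default overrides only for 3.6 and later. A sketch with illustrative override entries; nothing here beyond the constructor signature comes from the patch:

import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class DutchStemOverrideSketch {
  public static void main(String[] args) {
    // Map a surface form to the stem that should be emitted instead of the
    // Snowball output; the boolean selects case-sensitive key matching.
    CharArrayMap<String> overrides =
        new CharArrayMap<String>(Version.LUCENE_36, 2, false);
    overrides.put("fiets", "fiets");
    overrides.put("kind", "kinder");

    DutchAnalyzer analyzer = new DutchAnalyzer(Version.LUCENE_36,
        DutchAnalyzer.getDefaultStopSet(), CharArraySet.EMPTY_SET, overrides);
  }
}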
*/ - public static Set getDefaultStopSet() { + public static CharArraySet getDefaultStopSet() { return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -120,7 +119,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase * @param stopwords * a stopword set */ - public RussianAnalyzer(Version matchVersion, Set stopwords){ + public RussianAnalyzer(Version matchVersion, CharArraySet stopwords){ this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -133,7 +132,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase * a stopword set * @param stemExclusionSet a set of words not to be stemmed */ - public RussianAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet){ + public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){ super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet)); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java index 7a6c710d849..d9c624d123a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java @@ -27,7 +27,6 @@ import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; import java.io.Reader; -import java.util.Set; /** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link * LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}. @@ -48,7 +47,7 @@ import java.util.Set; @Deprecated public final class SnowballAnalyzer extends Analyzer { private String name; - private Set stopSet; + private CharArraySet stopSet; private final Version matchVersion; /** Builds the named analyzer with no stop words. */ @@ -58,7 +57,7 @@ public final class SnowballAnalyzer extends Analyzer { } /** Builds the named analyzer with the given stop words. 
*/ - public SnowballAnalyzer(Version matchVersion, String name, Set stopWords) { + public SnowballAnalyzer(Version matchVersion, String name, CharArraySet stopWords) { this(matchVersion, name); stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopWords)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java index 5803a862529..5115a5e2e97 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.util.Version; @@ -28,7 +29,6 @@ import org.apache.lucene.util.Version; import java.io.File; import java.io.IOException; import java.io.Reader; -import java.util.Set; /** * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link @@ -60,13 +60,13 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { /** An unmodifiable set containing some common English words that are usually not useful for searching. */ - public static final Set STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; /** Builds an analyzer with the given stop words. * @param matchVersion Lucene version to match See {@link * above} * @param stopWords stop words */ - public ClassicAnalyzer(Version matchVersion, Set stopWords) { + public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) { super(matchVersion, stopWords); } @@ -79,15 +79,6 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { this(matchVersion, STOP_WORDS_SET); } - /** Builds an analyzer with the stop words from the given file. - * @see WordlistLoader#getWordSet(Reader, Version) - * @param matchVersion Lucene version to match See {@link - * above} - * @param stopwords File to read stop words from */ - public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, loadStopwordSet(stopwords, matchVersion)); - } - /** Builds an analyzer with the stop words from the given reader. 
* @see WordlistLoader#getWordSet(Reader, Version) * @param matchVersion Lucene version to match See {@link diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index 807c0f8c980..13d15f1faaf 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.util.Version; @@ -28,7 +29,6 @@ import org.apache.lucene.util.Version; import java.io.File; import java.io.IOException; import java.io.Reader; -import java.util.Set; /** * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link @@ -61,13 +61,13 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { /** An unmodifiable set containing some common English words that are usually not useful for searching. */ - public static final Set STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; /** Builds an analyzer with the given stop words. * @param matchVersion Lucene version to match See {@link * above} * @param stopWords stop words */ - public StandardAnalyzer(Version matchVersion, Set stopWords) { + public StandardAnalyzer(Version matchVersion, CharArraySet stopWords) { super(matchVersion, stopWords); } @@ -80,15 +80,6 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { this(matchVersion, STOP_WORDS_SET); } - /** Builds an analyzer with the stop words from the given file. - * @see WordlistLoader#getWordSet(Reader, Version) - * @param matchVersion Lucene version to match See {@link - * above} - * @param stopwords File to read stop words from */ - public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, loadStopwordSet(stopwords, matchVersion)); - } - /** Builds an analyzer with the stop words from the given reader. * @see WordlistLoader#getWordSet(Reader, Version) * @param matchVersion Lucene version to match See {@link diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java index b1f9442b642..c1a6305aa61 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.sv; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -41,7 +40,7 @@ import org.tartarus.snowball.ext.SwedishStemmer; * {@link Analyzer} for Swedish. */ public final class SwedishAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Swedish stopwords. 
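With the File-based constructors of ClassicAnalyzer and StandardAnalyzer removed in the hunks above, the caller opens the stopword file itself and hands the analyzer a Reader. A sketch only; the file name and charset below are assumptions, not part of the patch:

import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class StopwordsFromReaderSketch {
  public static void main(String[] args) throws Exception {
    // The caller now decides how the file is opened and decoded; the analyzer
    // only ever sees a Reader, matching the remaining Reader constructor.
    Reader stopwords = new InputStreamReader(
        new FileInputStream("stopwords.txt"), "UTF-8");
    try {
      StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40, stopwords);
    } finally {
      stopwords.close();
    }
  }
}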
*/ public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt"; @@ -50,7 +49,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -59,7 +58,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -86,7 +85,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public SwedishAnalyzer(Version matchVersion, Set stopwords) { + public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -99,7 +98,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public SwedishAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java index ca2e39d92eb..7dd505cba6d 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.th; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; @@ -55,7 +55,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -64,7 +64,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -93,7 +93,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public ThaiAnalyzer(Version matchVersion, Set stopwords) { + public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) { super(matchVersion, stopwords); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java index e74732494ea..4523897f80c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.tr; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.StopFilter; @@ -38,7 +37,7 @@ import org.tartarus.snowball.ext.TurkishStemmer; * {@link Analyzer} for Turkish. */ public final class TurkishAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; /** File containing default Turkish stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -52,7 +51,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -61,7 +60,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -88,7 +87,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public TurkishAnalyzer(Version matchVersion, Set stopwords) { + public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -101,7 +100,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public TurkishAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java index ba85a499740..e371beea751 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.util; import java.io.File; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.util.IOUtils; @@ -46,7 +45,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer { * @return the analyzer's stopword set or an empty set if the analyzer has no * stopwords */ - public Set getStopwordSet() { + public CharArraySet getStopwordSet() { return stopwords; } @@ -58,7 +57,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer { * @param stopwords * the analyzer's stopword set */ - protected StopwordAnalyzerBase(final Version version, final Set stopwords) { + protected StopwordAnalyzerBase(final Version version, final CharArraySet stopwords) { matchVersion = version; // analyzers should use char array set for stopwords! this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java index d365cba19e5..66fe05b0f7d 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java @@ -18,9 +18,6 @@ package org.apache.lucene.analysis.ar; */ import java.io.IOException; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.util.CharArraySet; @@ -79,16 +76,14 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase { * Test that custom stopwords work, and are not case-sensitive. 
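The test hunks in this region, together with the TestStopFilter changes further below, express case handling the same way: the decision moves onto the CharArraySet itself, because the old StopFilter ignoreCase argument was silently ignored whenever the supplied set was already a CharArraySet. A minimal sketch of case-insensitive stop filtering after this change; the tokenizer choice and input text are illustrative:

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class IgnoreCaseStopwordsSketch {
  public static void main(String[] args) throws Exception {
    // Case-insensitivity is now a property of the set (third argument),
    // not a flag on StopFilter.
    CharArraySet stopWords =
        new CharArraySet(Version.LUCENE_36, Arrays.asList("is", "the", "time"), true);
    TokenStream stream = new StopFilter(Version.LUCENE_36,
        new WhitespaceTokenizer(Version.LUCENE_36, new StringReader("Now is The Time")),
        stopWords);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // only "Now" survives
    }
    stream.end();
    stream.close();
  }
}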
*/ public void testCustomStopwords() throws Exception { - Set set = new HashSet(); - Collections.addAll(set, "the", "and", "a"); + CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false); ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set); assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick", "brown", "fox" }); } public void testWithStemExclusionSet() throws IOException { - Set set = new HashSet(); - set.add("ساهدهات"); + CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, asSet("ساهدهات"), false); ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set); assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" }); assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" }); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java index 2832b1697d8..9296e317389 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.bg; */ import java.io.IOException; -import java.util.Collections; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -43,8 +42,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase { } public void testCustomStopwords() throws IOException { - Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, Collections - .emptySet()); + Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET); assertAnalyzesTo(a, "Как се казваш?", new String[] {"как", "се", "казваш"}); } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java index 80f6ab1fe3a..5d3b42e0704 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java @@ -136,7 +136,8 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase { } public void testStemExclusionTable() throws Exception { - BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("quintessência")); + BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT, + CharArraySet.EMPTY_SET, new CharArraySet(TEST_VERSION_CURRENT, asSet("quintessência"), false)); checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged. 
} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java index d2b26cf7c9c..2680819a9e4 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ca/TestCatalanAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ca; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestCatalanAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -50,8 +49,7 @@ public class TestCatalanAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("llengües"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("llengües"), false); Analyzer a = new CatalanAnalyzer(TEST_VERSION_CURRENT, CatalanAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "llengües", "llengües"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java index d56f9f162b8..eac56fdb2ce 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; import java.io.StringReader; @@ -58,10 +59,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase { } public void testStopList() throws IOException { - Set stopWordsSet = new HashSet(); - stopWordsSet.add("good"); - stopWordsSet.add("test"); - stopWordsSet.add("analyzer"); + CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false); StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet); StringReader reader = new StringReader("This is a good test of the english stop analyzer"); TokenStream stream = newStop.tokenStream("test", reader); @@ -75,10 +73,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase { } public void testStopListPositions() throws IOException { - Set stopWordsSet = new HashSet(); - stopWordsSet.add("good"); - stopWordsSet.add("test"); - stopWordsSet.add("analyzer"); + CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false); StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet); StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions"); int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1}; diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java index 2f13bc405dc..fe14521e918 100644 --- 
a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.English; import org.apache.lucene.util.Version; @@ -36,22 +37,15 @@ public class TestStopFilter extends BaseTokenStreamTestCase { public void testExactCase() throws IOException { StringReader reader = new StringReader("Now is The Time"); - Set stopWords = asSet("is", "the", "Time"); - TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, false); + CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, asSet("is", "the", "Time"), false); + TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords); assertTokenStreamContents(stream, new String[] { "Now", "The" }); } - public void testIgnoreCase() throws IOException { - StringReader reader = new StringReader("Now is The Time"); - Set stopWords = asSet( "is", "the", "Time" ); - TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords, true); - assertTokenStreamContents(stream, new String[] { "Now" }); - } - public void testStopFilt() throws IOException { StringReader reader = new StringReader("Now is The Time"); String[] stopWords = new String[] { "is", "the", "Time" }; - Set stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords); + CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords); TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); assertTokenStreamContents(stream, new String[] { "Now", "The" }); } @@ -70,7 +64,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase { log(sb.toString()); String stopWords[] = a.toArray(new String[0]); for (int i=0; i stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords); + CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords); // with increments StringReader reader = new StringReader(sb.toString()); StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); @@ -93,8 +87,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase { for (int i=0; i stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0); - Set stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1); + CharArraySet stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0); + CharArraySet stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1); reader = new StringReader(sb.toString()); StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set stpf0.setEnablePositionIncrements(true); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java index e7863b0503b..d39409a0ba7 100644 --- 
a/modules/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.da; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestDanishAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("undersøgelse"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("undersøgelse"), false); Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT, DanishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "undersøgelse", "undersøgelse"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java index c3bc23f3483..0565e7ba501 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.de; import java.io.IOException; import java.io.StringReader; -import java.util.Collections; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -46,7 +45,8 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase { } public void testStemExclusionTable() throws Exception { - GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, Collections.emptySet(), asSet("tischen")); + GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, + new CharArraySet(TEST_VERSION_CURRENT, asSet("tischen"), false)); checkOneTermReuse(a, "tischen", "tischen"); } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java index b10ae03f268..34e4fb95bf0 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.en; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestEnglishAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -45,8 +44,7 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("books"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("books"), false); Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT, EnglishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "books", "books"); diff --git 
a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java index a41c8efca6c..e4fd9f63a82 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.es; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestSpanishAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("chicano"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chicano"), false); Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT, SpanishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "chicana", "chican"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java index 5c11deb02e9..8461d545665 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/eu/TestBasqueAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.eu; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestBasqueAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestBasqueAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("zaldiak"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("zaldiak"), false); Analyzer a = new BasqueAnalyzer(TEST_VERSION_CURRENT, BasqueAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "zaldiak", "zaldiak"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java index 5934bad40d9..9e4022ad6f1 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java @@ -19,6 +19,7 @@ package org.apache.lucene.analysis.fa; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; /** * Test the Persian Analyzer @@ -215,7 +216,8 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase { * Test that custom stopwords work, and are not case-sensitive. 
*/ public void testCustomStopwords() throws Exception { - PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, asSet("the", "and", "a")); + PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, + new CharArraySet(TEST_VERSION_CURRENT, asSet("the", "and", "a"), false)); assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick", "brown", "fox" }); } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java index 35b67a278aa..0c24d8be361 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.fi; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestFinnishAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("edeltäjistään"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false); Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT, FinnishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "edeltäjiinsä", "edeltäj"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java index 7a73685198f..1223e014132 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElision.java @@ -20,15 +20,14 @@ package org.apache.lucene.analysis.fr; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharArraySet; /** * @@ -38,9 +37,7 @@ public class TestElision extends BaseTokenStreamTestCase { public void testElision() throws Exception { String test = "Plop, juste pour voir l'embrouille avec O'brian. 
M'enfin."; Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test)); - Set articles = new HashSet(); - articles.add("l"); - articles.add("M"); + CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false); TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles); List tas = filter(filter); assertEquals("embrouille", tas.get(4)); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java index 0264427c444..8f9ea8e4da1 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.gl; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestGalicianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestGalicianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("correspondente"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("correspondente"), false); Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT, GalicianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "correspondente", "correspondente"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java index 343a52b8fdd..393226db6b4 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java @@ -1,10 +1,8 @@ package org.apache.lucene.analysis.hi; -import java.util.HashSet; -import java.util.Set; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -41,8 +39,7 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase { } public void testExclusionSet() throws Exception { - Set exclusionSet = new HashSet(); - exclusionSet.add("हिंदी"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("हिंदी"), false); Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT, HindiAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "हिंदी", "हिंदी"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java index b2ada3be0d6..5f7e22376e5 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hu; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import 
org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestHungarianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("babakocsi"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false); Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT, HungarianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "babakocsi", "babakocsi"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java index 68caf5c2c39..7bb72c701b3 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hy/TestArmenianAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.hy; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestArmenianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestArmenianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("արծիվներ"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("արծիվներ"), false); Analyzer a = new ArmenianAnalyzer(TEST_VERSION_CURRENT, ArmenianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "արծիվներ", "արծիվներ"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java index 3002e62bb99..0967ed6c600 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.id; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestIndonesianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("peledakan"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("peledakan"), false); Analyzer a = new IndonesianAnalyzer(TEST_VERSION_CURRENT, IndonesianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "peledakan", "peledakan"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java index 
56f64f19074..079ce8f0883 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java @@ -23,6 +23,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; public class TestItalianAnalyzer extends BaseTokenStreamTestCase { @@ -44,8 +45,7 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("abbandonata"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("abbandonata"), false); Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT, ItalianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "abbandonata", "abbandonata"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java index 724584582c4..2f7ff13946b 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.lv; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestLatvianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestLatvianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("tirgiem"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("tirgiem"), false); Analyzer a = new LatvianAnalyzer(TEST_VERSION_CURRENT, LatvianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "tirgiem", "tirgiem"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java index 33085e7b047..141d0bd8116 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java @@ -2,10 +2,7 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.io.StringReader; -import java.util.Arrays; -import java.util.HashSet; import java.util.Locale; -import java.util.Set; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; @@ -47,12 +44,11 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase { assertTokenStreamContents(new LowerCaseFilterMock( new KeywordMarkerFilter(new MockTokenizer(new StringReader( "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output); - Set jdkSet = new HashSet(); - jdkSet.add("LuceneFox"); + CharArraySet mixedCaseSet = new CharArraySet(TEST_VERSION_CURRENT, 
asSet("LuceneFox"), false); assertTokenStreamContents(new LowerCaseFilterMock( new KeywordMarkerFilter(new MockTokenizer(new StringReader( - "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), jdkSet)), output); - Set set2 = set; + "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), mixedCaseSet)), output); + CharArraySet set2 = set; assertTokenStreamContents(new LowerCaseFilterMock( new KeywordMarkerFilter(new MockTokenizer(new StringReader( "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output); @@ -64,8 +60,8 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase { new KeywordMarkerFilter( new KeywordMarkerFilter( new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false), - new HashSet(Arrays.asList("Birds", "Houses"))), - new HashSet(Arrays.asList("Dogs", "Trees")))); + new CharArraySet(TEST_VERSION_CURRENT, asSet("Birds", "Houses"), false)), + new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false))); assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" }); } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java index 463faf403a1..145568945b3 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java @@ -2,12 +2,11 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -33,7 +32,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase { // lets make booked stem to books // the override filter will convert "booked" to "books", // but also mark it with KeywordAttribute so Porter will not change it. - Map dictionary = new HashMap(); + CharArrayMap dictionary = new CharArrayMap(TEST_VERSION_CURRENT, 1, false); dictionary.put("booked", "books"); Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked")); TokenStream stream = new PorterStemFilter( diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java index 5bdaa3e8d3e..cd91cbc3e35 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.nl; * limitations under the License. */ -import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -150,6 +149,26 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase { } + /** + * check that the default stem overrides are used + * even if you use a non-default ctor. 
+ */ + public void testStemOverrides() throws IOException { + DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET); + checkOneTerm(a, "fiets", "fiets"); + } + + /** + * prior to 3.6, this confusingly did not happen if + * you specified your own stoplist!!!! + * @deprecated (3.6) Remove this test in Lucene 5.0 + */ + @Deprecated + public void testBuggyStemOverrides() throws IOException { + DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_35, CharArraySet.EMPTY_SET); + checkOneTerm(a, "fiets", "fiet"); + } + /** * Prior to 3.1, this analyzer had no lowercase filter. * stopwords were case sensitive. Preserve this for back compat. diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java index 9990fdac95c..acf6e0b9691 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.no; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("havnedistriktene"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("havnedistriktene"), false); Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT, NorwegianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "havnedistriktene", "havnedistriktene"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java index a0e263fd8f3..8c96b2bbc7c 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pt; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("quilométricas"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false); Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT, PortugueseAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "quilométricas", "quilométricas"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java 
index ee6f94b0404..10bfa438660 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.ro; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestRomanianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("absenţa"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("absenţa"), false); Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT, RomanianAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "absenţa", "absenţa"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java index 493da3abca9..15a18412b74 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java @@ -18,12 +18,10 @@ package org.apache.lucene.analysis.sv; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.hu.HungarianAnalyzer; +import org.apache.lucene.analysis.util.CharArraySet; public class TestSwedishAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -44,8 +42,7 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("jaktkarlarne"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlarne"), false); Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT, SwedishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne"); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java index 4b9587a3810..fce2e6f84e9 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.tr; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestTurkishAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("ağacı"); + CharArraySet 
exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("ağacı"), false); Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT, TurkishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "ağacı", "ağacı"); diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java index 257f9971c1a..324136d04f1 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java @@ -39,13 +39,13 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase { this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS); } - public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set stopwords, Set stoptags) { + public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set stoptags) { super(matchVersion, stopwords); this.segmenter = segmenter; this.stoptags = stoptags; } - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -58,7 +58,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase { * outer class accesses the static final set the first time. */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static final Set DEFAULT_STOP_TAGS; static { diff --git a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java index 7710209b863..b3ca2c915b9 100644 --- a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java +++ b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java @@ -18,10 +18,7 @@ package org.apache.lucene.analysis.cn.smart; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.Reader; -import java.util.Collections; import java.util.Set; import org.apache.lucene.analysis.Analyzer; @@ -58,7 +55,7 @@ import org.apache.lucene.util.Version; */ public final class SmartChineseAnalyzer extends Analyzer { - private final Set stopWords; + private final CharArraySet stopWords; private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; @@ -120,7 +117,7 @@ public final class SmartChineseAnalyzer extends Analyzer { */ public SmartChineseAnalyzer(Version matchVersion, boolean useDefaultStopWords) { stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET - : Collections.EMPTY_SET; + : CharArraySet.EMPTY_SET; this.matchVersion = matchVersion; } @@ -133,8 +130,8 @@ public final class SmartChineseAnalyzer extends Analyzer { *

* @param stopWords {@link Set} of stopwords to use. */ - public SmartChineseAnalyzer(Version matchVersion, Set stopWords) { - this.stopWords = stopWords==null?Collections.EMPTY_SET:stopWords; + public SmartChineseAnalyzer(Version matchVersion, CharArraySet stopWords) { + this.stopWords = stopWords==null?CharArraySet.EMPTY_SET:stopWords; this.matchVersion = matchVersion; } @@ -147,7 +144,7 @@ public final class SmartChineseAnalyzer extends Analyzer { // The porter stemming is too strict, this is not a bug, this is a feature:) result = new PorterStemFilter(result); if (!stopWords.isEmpty()) { - result = new StopFilter(matchVersion, result, stopWords, false); + result = new StopFilter(matchVersion, result, stopWords); } return new TokenStreamComponents(tokenizer, result); } diff --git a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java index 59c8fd9889a..34b03d674d2 100644 --- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java +++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.pl; import java.io.IOException; import java.io.Reader; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; @@ -42,7 +41,7 @@ import org.egothor.stemmer.Trie; * {@link Analyzer} for Polish. */ public final class PolishAnalyzer extends StopwordAnalyzerBase { - private final Set stemExclusionSet; + private final CharArraySet stemExclusionSet; private final Trie stemTable; /** File containing default Polish stopwords. */ @@ -55,7 +54,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. 
*/ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultsHolder.DEFAULT_STOP_SET; } @@ -64,7 +63,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { * accesses the static final set the first time.; */ private static class DefaultsHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static final Trie DEFAULT_TABLE; static { @@ -100,7 +99,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { * @param matchVersion lucene compatibility version * @param stopwords a stopword set */ - public PolishAnalyzer(Version matchVersion, Set stopwords) { + public PolishAnalyzer(Version matchVersion, CharArraySet stopwords) { this(matchVersion, stopwords, CharArraySet.EMPTY_SET); } @@ -113,7 +112,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { * @param stopwords a stopword set * @param stemExclusionSet a set of terms not to be stemmed */ - public PolishAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) { + public PolishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemTable = DefaultsHolder.DEFAULT_TABLE; this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( diff --git a/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java b/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java index 6cb7c7de3bd..21c55238d93 100644 --- a/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java +++ b/modules/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java @@ -18,11 +18,10 @@ package org.apache.lucene.analysis.pl; */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; public class TestPolishAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -43,8 +42,7 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { - Set exclusionSet = new HashSet(); - exclusionSet.add("studenta"); + CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("studenta"), false);; Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT, PolishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTermReuse(a, "studenta", "studenta"); diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java index 20ebe22d47b..2159d4265c3 100644 --- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java +++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java @@ -93,15 +93,14 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements .getTokenFilterFactories(); for (TokenFilterFactory factory : filterFactories) { if (factory instanceof StopFilterFactory) { - // StopFilterFactory holds the stop words in a CharArraySet, but - // the getStopWords() method returns a Set, so we need to cast. 
+ // StopFilterFactory holds the stop words in a CharArraySet solrStopWords.put(fieldName, - (CharArraySet) ((StopFilterFactory) factory).getStopWords()); + ((StopFilterFactory) factory).getStopWords()); } if (factory instanceof CommonGramsFilterFactory) { solrStopWords.put(fieldName, - (CharArraySet) ((CommonGramsFilterFactory) factory) + ((CommonGramsFilterFactory) factory) .getCommonWords()); } } diff --git a/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java index 87b2a308bab..1cd17bbfbc7 100644 --- a/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java @@ -17,7 +17,6 @@ package org.apache.solr.analysis; import java.io.IOException; -import java.util.Set; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; @@ -71,12 +70,12 @@ public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements return ignoreCase; } - public Set getCommonWords() { + public CharArraySet getCommonWords() { return commonWords; } public CommonGramsFilter create(TokenStream input) { - CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, ignoreCase); + CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords); return commonGrams; } } diff --git a/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java index f70dfac9fb2..3dad726aa2b 100644 --- a/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java @@ -18,7 +18,6 @@ package org.apache.solr.analysis; import java.io.IOException; import java.util.Map; -import java.util.Set; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; @@ -80,7 +79,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory return ignoreCase; } - public Set getCommonWords() { + public CharArraySet getCommonWords() { return commonWords; } @@ -88,8 +87,7 @@ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter */ public CommonGramsQueryFilter create(TokenStream input) { - CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords, - ignoreCase); + CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords); CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter( commonGrams); return commonGramsQuery; diff --git a/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java index 143137e377a..29b59b006bd 100644 --- a/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java @@ -25,7 +25,6 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.CharArraySet; import java.util.Map; -import java.util.Set; import java.io.IOException; /** @@ -81,13 +80,13 @@ public class StopFilterFactory extends BaseTokenFilterFactory implements Resourc return ignoreCase; } - public Set getStopWords() { + public CharArraySet 
getStopWords() { return stopWords; } @Override public TokenStream create(TokenStream input) { - StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords,ignoreCase); + StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords); stopFilter.setEnablePositionIncrements(enablePositionIncrements); return stopFilter; } diff --git a/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java b/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java index 4f18f071b9e..2e4874bc385 100644 --- a/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java @@ -20,11 +20,11 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.solr.common.ResourceLoader; import org.apache.solr.core.SolrResourceLoader; import java.io.StringReader; -import java.util.Set; import java.util.Map; import java.util.HashMap; @@ -44,7 +44,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase { args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + CharArraySet words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); @@ -89,7 +89,7 @@ public class CommonGramsFilterFactoryTest extends BaseTokenTestCase { Map args = new HashMap(DEFAULT_VERSION_PARAM); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + CharArraySet words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue(words.contains("the")); Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); diff --git a/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java b/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java index bf0afffdab0..2c96f8b4f77 100644 --- a/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java @@ -19,11 +19,11 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.solr.common.ResourceLoader; import org.apache.solr.core.SolrResourceLoader; import java.io.StringReader; -import java.util.Set; import java.util.Map; import java.util.HashMap; @@ -43,7 +43,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase { args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + CharArraySet words = factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); @@ -88,7 +88,7 @@ public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase { Map args = new HashMap(DEFAULT_VERSION_PARAM); factory.init(args); factory.inform(loader); - Set words = factory.getCommonWords(); + CharArraySet words = 
factory.getCommonWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue(words.contains("the")); Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); diff --git a/solr/core/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java index 2a9aba5b115..9afaa07de08 100644 --- a/solr/core/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java +++ b/solr/core/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java @@ -16,10 +16,10 @@ package org.apache.solr.analysis; * limitations under the License. */ +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.solr.common.ResourceLoader; import org.apache.solr.core.SolrResourceLoader; -import java.util.Set; import java.util.Map; import java.util.HashMap; @@ -38,7 +38,7 @@ public class TestKeepFilterFactory extends BaseTokenTestCase{ args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getWords(); + CharArraySet words = factory.getWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); diff --git a/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java index dae4551bf4a..6c3c33392c4 100644 --- a/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java +++ b/solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java @@ -17,10 +17,10 @@ package org.apache.solr.analysis; */ +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.solr.common.ResourceLoader; import org.apache.solr.core.SolrResourceLoader; -import java.util.Set; import java.util.Map; import java.util.HashMap; @@ -39,7 +39,7 @@ public class TestStopFilterFactory extends BaseTokenTestCase { args.put("ignoreCase", "true"); factory.init(args); factory.inform(loader); - Set words = factory.getStopWords(); + CharArraySet words = factory.getStopWords(); assertTrue("words is null and it shouldn't be", words != null); assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);
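
Usage note (editorial, not part of the patch): across these hunks, plain java.util Set/HashSet stop sets are replaced by CharArraySet, and call sites stop passing a separate ignoreCase flag to StopFilter; case folding is now configured on the CharArraySet itself. Below is a minimal sketch of the resulting calling pattern, assuming the post-LUCENE-3765 trunk API shown in this diff (org.apache.lucene.analysis.util.CharArraySet, org.apache.lucene.analysis.core.StopFilter). The WhitespaceTokenizer and the token-consuming loop are illustrative plumbing only, and the class name is hypothetical.

    import java.io.StringReader;
    import java.util.Arrays;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.StopFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.util.Version;

    public class StopSetMigrationSketch {
      public static void main(String[] args) throws Exception {
        // Case-insensitivity is a property of the CharArraySet (third argument),
        // not of the StopFilter -- the point of deprecating the old ctor.
        CharArraySet stopSet = new CharArraySet(Version.LUCENE_40,
            Arrays.asList("the", "and", "a"), true /* ignoreCase */);

        // Convenience alternative (case-sensitive): after this change
        // makeStopSet returns a CharArraySet rather than a plain Set.
        CharArraySet stopSet2 = StopFilter.makeStopSet(Version.LUCENE_40, "the", "and", "a");

        TokenStream ts = new StopFilter(Version.LUCENE_40,
            new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("The quick brown fox")),
            stopSet);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // prints: quick, brown, fox
        }
        ts.end();
        ts.close();
      }
    }

Because StopFilter.makeStopSet and the Solr factories exercised above (e.g. StopFilterFactory.getStopWords, CommonGramsFilterFactory.getCommonWords) now return CharArraySet directly, the casts previously needed in SolrStopwordsCarrot2LexicalDataFactory are no longer necessary, as the corresponding hunks show.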
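
A companion sketch for the stemmer-override change: the dictionary handed to StemmerOverrideFilter is now a CharArrayMap rather than a java.util Map, as in the revised TestStemmerOverrideFilter above. The StemmerOverrideFilter constructor arguments used below (matchVersion, input, dictionary) are assumed from the 3.x-era API and are not visible in this hunk, so treat that wiring as illustrative.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.KeywordTokenizer;
    import org.apache.lucene.analysis.en.PorterStemFilter;
    import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArrayMap;
    import org.apache.lucene.util.Version;

    public class StemmerOverrideSketch {
      public static void main(String[] args) throws Exception {
        // The override dictionary is a CharArrayMap; the boolean controls
        // case-insensitive key lookup, mirroring CharArraySet.
        CharArrayMap<String> dictionary = new CharArrayMap<String>(Version.LUCENE_40, 1, false);
        dictionary.put("booked", "books");

        Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
        // Assumed constructor shape; the filter rewrites "booked" to "books" and
        // keyword-marks it so PorterStemFilter leaves the override untouched.
        TokenStream stream = new PorterStemFilter(
            new StemmerOverrideFilter(Version.LUCENE_40, tokenizer, dictionary));

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
          System.out.println(term.toString()); // expected: books
        }
        stream.end();
        stream.close();
      }
    }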