LUCENE-5859: Literally add back dead code to please a bunch of fucking babies

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1614852 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-07-31 10:44:39 +00:00
parent 0f8f76ce6a
commit e6d29d223b
261 changed files with 1840 additions and 1248 deletions
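In substance, this change restores the Version matchVersion constructor arguments across the analysis module. A minimal usage sketch of the restored API (not part of the commit itself; WhitespaceAnalyzer and Version.LUCENE_CURRENT appear in the diff below, the wrapper class name is illustrative):

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.util.Version;

public class MatchVersionUsage {
  public static void main(String[] args) {
    // After this commit, analyzers again take an explicit Version argument
    // that pins their analysis behavior to a Lucene compatibility version.
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
    analyzer.close();
  }
}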

View File

@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Arabic.
@@ -88,18 +89,20 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public ArabicAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public ArabicAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public ArabicAnalyzer(CharArraySet stopwords){
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -107,14 +110,17 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * {@link ArabicStemFilter}.
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclusionSet
    *          a set of terms not to be stemmed
    */
-  public ArabicAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet){
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -130,10 +136,10 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new LowerCaseFilter(source);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
     // the order here is important: the stopword list is not normalized!
-    result = new StopFilter(result, stopwords);
+    result = new StopFilter( matchVersion, result, stopwords);
     // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
     result = new ArabicNormalizationFilter(result);
     if(!stemExclusionSet.isEmpty()) {

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.bg;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
@@ -30,6 +31,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Bulgarian.
@@ -40,7 +42,6 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
  * <p>
  */
 public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
-
   /**
    * File containing default Bulgarian stopwords.
    *
@@ -83,15 +84,15 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public BulgarianAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public BulgarianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    */
-  public BulgarianAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -99,10 +100,10 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
    * If a stem exclusion set is provided this analyzer will add a {@link SetKeywordMarkerFilter}
    * before {@link BulgarianStemFilter}.
    */
-  public BulgarianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
    * Creates a
@@ -118,10 +119,10 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   public TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new BulgarianStemFilter(result);

View File

@@ -65,7 +65,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#");
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -83,29 +83,35 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
    */
-  public BrazilianAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public BrazilianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public BrazilianAnalyzer(CharArraySet stopwords) {
-    super(stopwords);
+  public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    super(matchVersion, stopwords);
   }
 
   /**
    * Builds an analyzer with the given stop words and stemming exclusion words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public BrazilianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    this(stopwords);
-    excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords,
+      CharArraySet stemExclusionSet) {
+    this(matchVersion, stopwords);
+    excltable = CharArraySet.unmodifiableSet(CharArraySet
+        .copy(matchVersion, stemExclusionSet));
   }
 
   /**
@@ -120,10 +126,10 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer source = new StandardTokenizer();
-    TokenStream result = new LowerCaseFilter(source);
-    result = new StandardFilter(result);
-    result = new StopFilter(result, stopwords);
+    Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    result = new StandardFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(excltable != null && !excltable.isEmpty())
       result = new SetKeywordMarkerFilter(result, excltable);
     return new TokenStreamComponents(source, new BrazilianStemFilter(result));

View File

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.CatalanStemmer;
 
 /**
@@ -45,7 +46,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
   private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
-      new CharArraySet(
+      new CharArraySet(Version.LUCENE_CURRENT,
           Arrays.asList(
           "d", "l", "m", "n", "s", "t"
           ), true));
@@ -80,17 +81,18 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public CatalanAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public CatalanAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public CatalanAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -98,12 +100,14 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public CatalanAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -120,11 +124,11 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new CatalanStemmer());

View File

@@ -26,6 +26,7 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.lucene.util.Version;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -29840,7 +29841,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     upperCaseVariantsAccepted.put("amp", "AMP");
   }
   private static final CharArrayMap<Character> entityValues
-      = new CharArrayMap<>(253, false);
+      = new CharArrayMap<>(Version.LUCENE_CURRENT, 253, false);
   static {
     String[] entities = {
       "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
@@ -29979,7 +29980,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       escapeSTYLE = true;
     } else {
       if (null == this.escapedTags) {
-        this.escapedTags = new CharArraySet(16, true);
+        this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
       }
       this.escapedTags.add(tag);
     }

View File

@@ -24,6 +24,7 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.lucene.util.Version;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -194,7 +195,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
       escapeSTYLE = true;
     } else {
       if (null == this.escapedTags) {
-        this.escapedTags = new CharArraySet(16, true);
+        this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
       }
       this.escapedTags.add(tag);
     }

View File

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
@@ -27,6 +28,7 @@ import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 
 /**
  * An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
@@ -35,7 +37,6 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
  * and filters stopwords with {@link StopFilter}
  */
 public final class CJKAnalyzer extends StopwordAnalyzerBase {
-
   /**
    * File containing default CJK stopwords.
    * <p/>
@@ -69,27 +70,29 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer which removes words in {@link #getDefaultStopSet()}.
    */
-  public CJKAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public CJKAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public CJKAnalyzer(CharArraySet stopwords){
-    super(stopwords);
+  public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){
+    super(matchVersion, stopwords);
   }
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
+    final Tokenizer source = new StandardTokenizer(matchVersion);
     // run the widthfilter first before bigramming, it sometimes combines characters.
     TokenStream result = new CJKWidthFilter(source);
-    result = new LowerCaseFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
     result = new CJKBigramFilter(result);
-    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
   }
 }

View File

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Sorani Kurdish.
@@ -61,7 +62,7 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -73,17 +74,18 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public SoraniAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public SoraniAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public SoraniAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -91,12 +93,14 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public SoraniAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -114,11 +118,11 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
   */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
     result = new SoraniNormalizationFilter(result);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SoraniStemFilter(result);

View File

@@ -78,7 +78,7 @@ public final class CommonGramsFilter extends TokenFilter {
    * @param input TokenStream input in filter chain
    * @param commonWords The set of common words.
    */
-  public CommonGramsFilter(TokenStream input, CharArraySet commonWords) {
+  public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
     super(input);
     this.commonWords = commonWords;
   }

View File

@@ -76,7 +76,7 @@ public class CommonGramsFilterFactory extends TokenFilterFactory implements Reso
 
   @Override
   public TokenFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords);
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
     return commonGrams;
   }
 }

View File

@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */
 
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Analyzer;
 
 /**

View File

@@ -18,11 +18,13 @@ package org.apache.lucene.analysis.core;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Emits the entire input as a single token.

View File

@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.core;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.util.AttributeFactory;
 
+import java.io.Reader;
 import java.util.Map;
 
 /**

View File

@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.core;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.Version;
 
 /**
  * A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
@@ -29,25 +30,41 @@ import org.apache.lucene.util.AttributeFactory;
  * Note: this does a decent job for most European languages, but does a terrible
  * job for some Asian languages, where words are not separated by spaces.
  * </p>
+ * <p>
+ * <a name="version"/>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LetterTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * </p>
  */
 public class LetterTokenizer extends CharTokenizer {
 
   /**
    * Construct a new LetterTokenizer.
+   *
+   * @param matchVersion
+   *          Lucene version to match See {@link <a href="#version">above</a>}
   */
-  public LetterTokenizer() {
+  public LetterTokenizer(Version matchVersion) {
+    super(matchVersion);
   }
 
   /**
    * Construct a new LetterTokenizer using a given
    * {@link org.apache.lucene.util.AttributeFactory}.
    *
+   * @param matchVersion
+   *          Lucene version to match See {@link <a href="#version">above</a>}
   * @param factory
    *          the attribute factory to use for this {@link Tokenizer}
    */
-  public LetterTokenizer(AttributeFactory factory) {
-    super(factory);
+  public LetterTokenizer(Version matchVersion, AttributeFactory factory) {
+    super(matchVersion, factory);
   }
 
   /** Collects only characters which satisfy

View File

@@ -36,6 +36,7 @@ public class LetterTokenizerFactory extends TokenizerFactory {
   /** Creates a new LetterTokenizerFactory */
   public LetterTokenizerFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -43,6 +44,6 @@ public class LetterTokenizerFactory extends TokenizerFactory {
 
   @Override
   public LetterTokenizer create(AttributeFactory factory) {
-    return new LetterTokenizer(factory);
+    return new LetterTokenizer(luceneMatchVersion, factory);
   }
 }

View File

@@ -23,21 +23,30 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * Normalizes token text to lower case.
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating LowerCaseFilter:
+ * <ul>
+ *   <li> As of 3.1, supplementary characters are properly lowercased.
+ * </ul>
  */
 public final class LowerCaseFilter extends TokenFilter {
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
+  private final CharacterUtils charUtils;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   /**
    * Create a new LowerCaseFilter, that normalizes token text to lower case.
    *
+   * @param matchVersion See <a href="#version">above</a>
   * @param in TokenStream to filter
    */
-  public LowerCaseFilter(TokenStream in) {
+  public LowerCaseFilter(Version matchVersion, TokenStream in) {
     super(in);
+    charUtils = CharacterUtils.getInstance(matchVersion);
   }
 
   @Override

View File

@@ -40,6 +40,7 @@ public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiT
   /** Creates a new LowerCaseFilterFactory */
   public LowerCaseFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -47,7 +48,7 @@ public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiT
 
   @Override
   public LowerCaseFilter create(TokenStream input) {
-    return new LowerCaseFilter(input);
+    return new LowerCaseFilter(luceneMatchVersion,input);
   }
 
   @Override

View File

@@ -17,8 +17,13 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */
 
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 
 /**
  * LowerCaseTokenizer performs the function of LetterTokenizer
@@ -30,24 +35,41 @@ import org.apache.lucene.util.AttributeFactory;
  * Note: this does a decent job for most European languages, but does a terrible
  * job for some Asian languages, where words are not separated by spaces.
  * </p>
+ * <p>
+ * <a name="version"/>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LowerCaseTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * </p>
  */
 public final class LowerCaseTokenizer extends LetterTokenizer {
 
   /**
    * Construct a new LowerCaseTokenizer.
+   *
+   * @param matchVersion
+   *          Lucene version to match See {@link <a href="#version">above</a>}
+   *
   */
-  public LowerCaseTokenizer() {
+  public LowerCaseTokenizer(Version matchVersion) {
+    super(matchVersion);
   }
 
   /**
    * Construct a new LowerCaseTokenizer using a given
    * {@link org.apache.lucene.util.AttributeFactory}.
    *
+   * @param matchVersion
+   *          Lucene version to match See {@link <a href="#version">above</a>}
   * @param factory
    *          the attribute factory to use for this {@link Tokenizer}
    */
-  public LowerCaseTokenizer(AttributeFactory factory) {
-    super(factory);
+  public LowerCaseTokenizer(Version matchVersion, AttributeFactory factory) {
+    super(matchVersion, factory);
   }
 
   /** Converts char to lower case

View File

@@ -39,6 +39,7 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi
   /** Creates a new LowerCaseTokenizerFactory */
   public LowerCaseTokenizerFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -46,7 +47,7 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi
 
   @Override
   public LowerCaseTokenizer create(AttributeFactory factory) {
-    return new LowerCaseTokenizer(factory);
+    return new LowerCaseTokenizer(luceneMatchVersion, factory);
   }
 
   @Override

View File

@@ -17,22 +17,38 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */
 
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.Version;
 
 /** An {@link Analyzer} that filters {@link LetterTokenizer}
  *  with {@link LowerCaseFilter}
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link CharTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link LowerCaseTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * <p>
 **/
 public final class SimpleAnalyzer extends Analyzer {
+
+  private final Version matchVersion;
 
   /**
    * Creates a new {@link SimpleAnalyzer}
+   * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
   */
-  public SimpleAnalyzer() {
+  public SimpleAnalyzer(Version matchVersion) {
+    this.matchVersion = matchVersion;
   }
 
   @Override
   protected TokenStreamComponents createComponents(final String fieldName) {
-    return new TokenStreamComponents(new LowerCaseTokenizer());
+    return new TokenStreamComponents(new LowerCaseTokenizer(matchVersion));
   }
 }

View File

@@ -27,10 +27,20 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.Version;
 
-/**
- * Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
- */
+/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
+ *
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating StopAnalyzer:
+ * <ul>
+ *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
+ *        supplementary characters in stopwords
+ *   <li> As of 2.9, position increments are preserved
+ * </ul>
+ */
 public final class StopAnalyzer extends StopwordAnalyzerBase {
 
   /** An unmodifiable set containing some common English words that are not usually useful
@@ -45,35 +55,40 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
       "that", "the", "their", "then", "there", "these",
       "they", "this", "to", "was", "will", "with"
     );
-    final CharArraySet stopSet = new CharArraySet(stopWords, false);
+    final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
+        stopWords, false);
     ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
   }
 
   /** Builds an analyzer which removes words in
    *  {@link #ENGLISH_STOP_WORDS_SET}.
+   * @param matchVersion See <a href="#version">above</a>
   */
-  public StopAnalyzer() {
-    this(ENGLISH_STOP_WORDS_SET);
+  public StopAnalyzer(Version matchVersion) {
+    this(matchVersion, ENGLISH_STOP_WORDS_SET);
   }
 
   /** Builds an analyzer with the stop words from the given set.
+   * @param matchVersion See <a href="#version">above</a>
   * @param stopWords Set of stop words */
-  public StopAnalyzer(CharArraySet stopWords) {
-    super(stopWords);
+  public StopAnalyzer(Version matchVersion, CharArraySet stopWords) {
+    super(matchVersion, stopWords);
   }
 
   /** Builds an analyzer with the stop words from the given file.
-   * @see WordlistLoader#getWordSet(Reader)
+   * @see WordlistLoader#getWordSet(Reader, Version)
+   * @param matchVersion See <a href="#version">above</a>
   * @param stopwordsFile File to load stop words from */
-  public StopAnalyzer(File stopwordsFile) throws IOException {
-    this(loadStopwordSet(stopwordsFile));
+  public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
+    this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion));
   }
 
   /** Builds an analyzer with the stop words from the given reader.
-   * @see WordlistLoader#getWordSet(Reader)
+   * @see WordlistLoader#getWordSet(Reader, Version)
+   * @param matchVersion See <a href="#version">above</a>
   * @param stopwords Reader to load stop words from */
-  public StopAnalyzer(Reader stopwords) throws IOException {
-    this(loadStopwordSet(stopwords));
+  public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
 
   /**
@@ -87,8 +102,9 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
   */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new LowerCaseTokenizer();
-    return new TokenStreamComponents(source, new StopFilter(source, stopwords));
+    final Tokenizer source = new LowerCaseTokenizer(matchVersion);
+    return new TokenStreamComponents(source, new StopFilter(matchVersion,
+          source, stopwords));
   }
 }

View File

@@ -24,9 +24,19 @@ import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * Removes stop words from a token stream.
+ *
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating StopFilter:
+ * <ul>
+ *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
+ *        supplementary characters in stopwords and position
+ *        increments are preserved
+ * </ul>
 */
 public final class StopFilter extends FilteringTokenFilter {
 
@@ -37,14 +47,17 @@ public final class StopFilter extends FilteringTokenFilter {
    * Constructs a filter which removes words from the input TokenStream that are
    * named in the Set.
    *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
+   *          set if Version > 3.0. See <a href="#version">above</a> for details.
   * @param in
    *          Input stream
    * @param stopWords
    *          A {@link CharArraySet} representing the stopwords.
-   * @see #makeStopSet(java.lang.String...)
+   * @see #makeStopSet(Version, java.lang.String...)
   */
-  public StopFilter(TokenStream in, CharArraySet stopWords) {
-    super(in);
+  public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
+    super(matchVersion, in);
     this.stopWords = stopWords;
   }
 
@@ -54,11 +67,12 @@ public final class StopFilter extends FilteringTokenFilter {
    * This permits this stopWords construction to be cached once when
    * an Analyzer is constructed.
    *
+   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
-   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
-  public static CharArraySet makeStopSet(String... stopWords) {
-    return makeStopSet(stopWords, false);
+  public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) {
+    return makeStopSet(matchVersion, stopWords, false);
   }
 
   /**
@@ -67,35 +81,38 @@ public final class StopFilter extends FilteringTokenFilter {
    * This permits this stopWords construction to be cached once when
    * an Analyzer is constructed.
    *
+   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
    * @return A Set ({@link CharArraySet}) containing the words
-   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
+   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
-  public static CharArraySet makeStopSet(List<?> stopWords) {
-    return makeStopSet(stopWords, false);
+  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords) {
+    return makeStopSet(matchVersion, stopWords, false);
   }
 
   /**
    * Creates a stopword set from the given stopword array.
    *
+   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
    * @param ignoreCase If true, all words are lower cased first.
    * @return a Set containing the words
    */
-  public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
-    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
+  public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
+    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
     stopSet.addAll(Arrays.asList(stopWords));
     return stopSet;
   }
 
   /**
    * Creates a stopword set from the given stopword list.
+   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
    * @param ignoreCase if true, all words are lower cased first
    * @return A Set ({@link CharArraySet}) containing the words
    */
-  public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase){
-    CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
+  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
+    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
     stopSet.addAll(stopWords);
     return stopSet;
   }

View File

@@ -81,6 +81,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
   /** Creates a new StopFilterFactory */
   public StopFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
    stopWordFiles = get(args, "words");
     format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
     ignoreCase = getBoolean(args, "ignoreCase", false);
@@ -103,7 +104,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
       if (null != format) {
         throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
       }
-      stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
+      stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
     }
   }
@@ -117,7 +118,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
 
   @Override
   public TokenStream create(TokenStream input) {
-    StopFilter stopFilter = new StopFilter(input,stopWords);
+    StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords);
     return stopFilter;
   }
 }

View File

@@ -22,6 +22,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.util.Version;
 
 /**
  * Removes tokens whose types appear in a set of blocked types from a token stream.
@@ -34,13 +35,14 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
 
   /**
    * Create a new {@link TypeTokenFilter}.
+   * @param version      the Lucene match version
   * @param input        the {@link TokenStream} to consume
    * @param stopTypes    the types to filter
    * @param useWhiteList if true, then tokens whose type is in stopTypes will
    *                     be kept, otherwise they will be filtered out
    */
-  public TypeTokenFilter(TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
-    super(input);
+  public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
+    super(version, input);
     this.stopTypes = stopTypes;
     this.useWhiteList = useWhiteList;
   }
@@ -48,9 +50,10 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
   /**
    * Create a new {@link TypeTokenFilter} that filters tokens out
    * (useWhiteList=false).
+   * @see #TypeTokenFilter(Version, TokenStream, Set, boolean)
   */
-  public TypeTokenFilter(TokenStream input, Set<String> stopTypes) {
-    this(input, stopTypes, false);
+  public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) {
+    this(version, input, stopTypes, false);
   }
 
   /**

View File

@@ -72,7 +72,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
 
   @Override
   public TokenStream create(TokenStream input) {
-    final TokenStream filter = new TypeTokenFilter(input, stopTypes, useWhitelist);
+    final TokenStream filter = new TypeTokenFilter(luceneMatchVersion, input, stopTypes, useWhitelist);
     return filter;
   }
 }

View File

@@ -23,9 +23,13 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * Normalizes token text to UPPER CASE.
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating UpperCaseFilter
  *
  * <p><b>NOTE:</b> In Unicode, this transformation may lose information when the
  * upper case character represents more than one lower case character. Use this filter
@@ -33,16 +37,18 @@ import org.apache.lucene.analysis.util.CharacterUtils;
  * general search matching
 */
 public final class UpperCaseFilter extends TokenFilter {
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
+  private final CharacterUtils charUtils;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   /**
    * Create a new UpperCaseFilter, that normalizes token text to upper case.
    *
+   * @param matchVersion See <a href="#version">above</a>
   * @param in TokenStream to filter
    */
-  public UpperCaseFilter(TokenStream in) {
+  public UpperCaseFilter(Version matchVersion, TokenStream in) {
     super(in);
+    charUtils = CharacterUtils.getInstance(matchVersion);
   }
 
   @Override

View File

@@ -45,6 +45,7 @@ public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiT
   /** Creates a new UpperCaseFilterFactory */
   public UpperCaseFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -52,7 +53,7 @@ public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiT
 
   @Override
   public UpperCaseFilter create(TokenStream input) {
-    return new UpperCaseFilter(input);
+    return new UpperCaseFilter(luceneMatchVersion,input);
   }
 
   @Override

View File

@@ -17,21 +17,38 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */
 
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.util.Version;
 
 /**
  * An Analyzer that uses {@link WhitespaceTokenizer}.
+ * <p>
+ * <a name="version">You must specify the required {@link Version} compatibility
+ * when creating {@link CharTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link WhitespaceTokenizer} uses an int based API to normalize and
+ * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ * <p>
 **/
 public final class WhitespaceAnalyzer extends Analyzer {
+
+  private final Version matchVersion;
 
   /**
    * Creates a new {@link WhitespaceAnalyzer}
+   * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
   */
-  public WhitespaceAnalyzer() {
+  public WhitespaceAnalyzer(Version matchVersion) {
+    this.matchVersion = matchVersion;
   }
 
   @Override
   protected TokenStreamComponents createComponents(final String fieldName) {
-    return new TokenStreamComponents(new WhitespaceTokenizer());
+    return new TokenStreamComponents(new WhitespaceTokenizer(matchVersion));
   }
 }

View File

@@ -17,31 +17,50 @@ package org.apache.lucene.analysis.core;
  * limitations under the License.
  */
 
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.CharTokenizer;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 
 /**
  * A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
- * Adjacent sequences of non-Whitespace characters form tokens.
+ * Adjacent sequences of non-Whitespace characters form tokens. <a
+ * name="version"/>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link WhitespaceTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
 */
 public final class WhitespaceTokenizer extends CharTokenizer {
 
   /**
-   * Construct a new WhitespaceTokenizer.
+   * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version
+   * to match See {@link <a href="#version">above</a>}
+   *
   */
-  public WhitespaceTokenizer() {
+  public WhitespaceTokenizer(Version matchVersion) {
+    super(matchVersion);
   }
 
   /**
    * Construct a new WhitespaceTokenizer using a given
    * {@link org.apache.lucene.util.AttributeFactory}.
    *
+   * @param
+   *          matchVersion Lucene version to match See
+   *          {@link <a href="#version">above</a>}
   * @param factory
    *          the attribute factory to use for this {@link Tokenizer}
    */
-  public WhitespaceTokenizer(AttributeFactory factory) {
-    super(factory);
+  public WhitespaceTokenizer(Version matchVersion, AttributeFactory factory) {
+    super(matchVersion, factory);
   }
 
   /** Collects only characters which do not satisfy


@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.core;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.util.AttributeFactory;
 
+import java.io.Reader;
 import java.util.Map;
 
 /**
@@ -36,6 +37,7 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
   /** Creates a new WhitespaceTokenizerFactory */
   public WhitespaceTokenizerFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -43,6 +45,6 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
   @Override
   public WhitespaceTokenizer create(AttributeFactory factory) {
-    return new WhitespaceTokenizer(factory);
+    return new WhitespaceTokenizer(luceneMatchVersion, factory);
   }
 }
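
Factory wiring sketch: with assureMatchVersion() restored, a factory built directly from an args map must carry a luceneMatchVersion entry or the constructor throws. The "4.9" version string is illustrative, not taken from the commit:

    Map<String,String> args = new HashMap<>();
    args.put("luceneMatchVersion", "4.9"); // required again; illustrative value
    WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(args);
    Tokenizer tok = factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);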


@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 
 import java.io.*;
 import java.nio.charset.StandardCharsets;
@@ -60,7 +61,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#");
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -74,30 +75,34 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
+   *
+   * @param matchVersion Lucene version to match
    */
-  public CzechAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_SET);
+  public CzechAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion Lucene version to match
    * @param stopwords a stopword set
    */
-  public CzechAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words and a set of words to be
    * excluded from the {@link CzechStemFilter}.
    *
+   * @param matchVersion Lucene version to match
    * @param stopwords a stopword set
    * @param stemExclusionTable a stemming exclusion set
    */
-  public CzechAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable) {
-    super(stopwords);
-    this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
+  public CzechAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
+    super(matchVersion, stopwords);
+    this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
   }
 
   /**
@@ -110,16 +115,16 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
    * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
    * , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
    * a stem exclusion set is provided via
-   * {@link #CzechAnalyzer(CharArraySet, CharArraySet)} a
+   * {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
    * {@link SetKeywordMarkerFilter} is added before
    * {@link CzechStemFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!this.stemExclusionTable.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionTable);
     result = new CzechStemFilter(result);
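
Sketch of loading a custom stopword file through the version-aware WordlistLoader overload used for DEFAULT_SET above ("stop.txt" is an illustrative path; "#" marks comment lines, as in the default set):

    Reader reader = IOUtils.getDecodingReader(
        new FileInputStream("stop.txt"), StandardCharsets.UTF_8);
    CharArraySet stopwords = WordlistLoader.getWordSet(reader, "#", Version.LUCENE_CURRENT);
    Analyzer czech = new CzechAnalyzer(Version.LUCENE_CURRENT, stopwords);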


@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.DanishStemmer;
 
 /**
@@ -63,7 +64,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -75,17 +76,18 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public DanishAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public DanishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public DanishAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public DanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -93,12 +95,14 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public DanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -115,10 +119,10 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new DanishStemmer());


@@ -69,7 +69,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -91,31 +91,35 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words:
    * {@link #getDefaultStopSet()}.
    */
-  public GermanAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_SET);
+  public GermanAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public GermanAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public GermanAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclusionSet
    *          a stemming exclusion set
    */
-  public GermanAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
   }
 
   /**
@@ -131,10 +135,10 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     result = new SetKeywordMarkerFilter(result, exclusionSet);
     result = new GermanNormalizationFilter(result);
     result = new GermanLightStemFilter(result);


@@ -69,9 +69,10 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words.
+   * @param matchVersion Lucene compatibility version
    */
-  public GreekAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_SET);
+  public GreekAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
   }
 
   /**
@@ -80,10 +81,11 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
    * <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
    * {@link GreekLowerCaseFilter} for best results.
    *
+   * @param matchVersion Lucene compatibility version
    * @param stopwords a stopword set
    */
-  public GreekAnalyzer(CharArraySet stopwords) {
-    super(stopwords);
+  public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -98,10 +100,10 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new GreekLowerCaseFilter(source);
-    result = new StandardFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
+    result = new StandardFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     result = new GreekStemFilter(result);
     return new TokenStreamComponents(source, result);
   }


@@ -22,22 +22,32 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * Normalizes token text to lower case, removes some Greek diacritics,
  * and standardizes final sigma to sigma.
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating GreekLowerCaseFilter:
+ * <ul>
+ *   <li> As of 3.1, supplementary characters are properly lowercased.
+ * </ul>
  */
 public final class GreekLowerCaseFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
+  private final CharacterUtils charUtils;
 
   /**
    * Create a GreekLowerCaseFilter that normalizes Greek token text.
    *
+   * @param matchVersion Lucene compatibility version,
+   *   See <a href="#version">above</a>
    * @param in TokenStream to filter
    */
-  public GreekLowerCaseFilter(TokenStream in) {
+  public GreekLowerCaseFilter(Version matchVersion, TokenStream in) {
     super(in);
+    this.charUtils = CharacterUtils.getInstance(matchVersion);
   }
 
   @Override


@@ -40,6 +40,7 @@ public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements M
   /** Creates a new GreekLowerCaseFilterFactory */
   public GreekLowerCaseFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -47,7 +48,7 @@ public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements M
   @Override
   public GreekLowerCaseFilter create(TokenStream in) {
-    return new GreekLowerCaseFilter(in);
+    return new GreekLowerCaseFilter(luceneMatchVersion, in);
   }
 
   @Override


@@ -1,6 +1,7 @@
 package org.apache.lucene.analysis.el;
 
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 import java.util.Arrays;
 
@@ -204,7 +205,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc4 = new CharArraySet(
+  private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
       false);
 
@@ -230,7 +231,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc6 = new CharArraySet(
+  private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
           "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
           "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
@@ -255,7 +256,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc7 = new CharArraySet(
+  private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
           "πεθ", "πικρ", "ποτ", "σιχ", "χ"),
       false);
 
@@ -282,11 +283,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc8a = new CharArraySet(
+  private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("τρ", "τσ"),
       false);
 
-  private static final CharArraySet exc8b = new CharArraySet(
+  private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
           "καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
           "π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
@@ -345,7 +346,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc9 = new CharArraySet(
+  private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
           "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
           "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),
@@ -433,11 +434,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc12a = new CharArraySet(
+  private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
      false);
 
-  private static final CharArraySet exc12b = new CharArraySet(
+  private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
       false);
 
@@ -457,7 +458,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc13 = new CharArraySet(
+  private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
       false);
 
@@ -491,7 +492,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc14 = new CharArraySet(
+  private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
           "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
           "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
@@ -529,7 +530,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc15a = new CharArraySet(
+  private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
           "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
           "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
@@ -538,7 +539,7 @@ public class GreekStemmer {
           "ουλαμ", "ουρ", "π", "τρ", "μ"),
       false);
 
-  private static final CharArraySet exc15b = new CharArraySet(
+  private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ψοφ", "ναυλοχ"),
       false);
 
@@ -575,7 +576,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc16 = new CharArraySet(
+  private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
       false);
 
@@ -595,7 +596,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc17 = new CharArraySet(
+  private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
       false);
 
@@ -609,7 +610,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc18 = new CharArraySet(
+  private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
       false);
 
@@ -633,7 +634,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc19 = new CharArraySet(
+  private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
       false);
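
All of the exc* sets above use the reinstated version-aware CharArraySet constructor; a standalone sketch of that constructor (the entries are illustrative):

    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT,
        Arrays.asList("αλ", "αρ"), false); // ignoreCase = false
    boolean found = set.contains("αλ");    // true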


@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for English.
@@ -56,17 +57,18 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
    */
-  public EnglishAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public EnglishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public EnglishAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -74,12 +76,14 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public EnglishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -97,11 +101,11 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new EnglishPossessiveFilter(result);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new EnglishPossessiveFilter(matchVersion, result);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new PorterStemFilter(result);


@@ -22,6 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
 
 /**
  * TokenFilter that removes possessives (trailing 's) from words.
@@ -29,7 +30,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 public final class EnglishPossessiveFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
-  public EnglishPossessiveFilter(TokenStream input) {
+  // NOTE: version now unused
+  public EnglishPossessiveFilter(Version version, TokenStream input) {
     super(input);
   }


@@ -39,6 +39,7 @@ public class EnglishPossessiveFilterFactory extends TokenFilterFactory {
   /** Creates a new EnglishPossessiveFilterFactory */
   public EnglishPossessiveFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -46,6 +47,6 @@ public class EnglishPossessiveFilterFactory extends TokenFilterFactory {
   @Override
   public TokenStream create(TokenStream input) {
-    return new EnglishPossessiveFilter(input);
+    return new EnglishPossessiveFilter(luceneMatchVersion, input);
   }
 }


@@ -64,6 +64,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
  * <p>Copyright: Copyright 2008, Lucid Imagination, Inc. </p>
  * <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu) </p>
  */
+import org.apache.lucene.util.Version;
 
 /**
  * This class implements the Kstem algorithm
@@ -279,7 +280,7 @@ public class KStemmer {
     DictEntry defaultEntry;
     DictEntry entry;
 
-    CharArrayMap<DictEntry> d = new CharArrayMap<>(1000, false);
+    CharArrayMap<DictEntry> d = new CharArrayMap<>(Version.LUCENE_CURRENT, 1000, false);
     for (int i = 0; i < exceptionWords.length; i++) {
       if (!d.containsKey(exceptionWords[i])) {
         entry = new DictEntry(exceptionWords[i], true);
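
The KStemmer dictionary uses the matching version-aware CharArrayMap constructor; a small sketch (the entry is illustrative):

    CharArrayMap<Integer> map = new CharArrayMap<>(Version.LUCENE_CURRENT, 16, false);
    map.put("walked", 1);           // ignoreCase = false above
    Integer id = map.get("walked"); // 1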


@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Spanish.
@@ -62,7 +63,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
     static {
      try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -74,17 +75,18 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public SpanishAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public SpanishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public SpanishAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -92,12 +94,14 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public SpanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -114,10 +118,10 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SpanishLightStemFilter(result);


@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.BasqueStemmer;
 
 /**
@@ -72,17 +73,18 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public BasqueAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public BasqueAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public BasqueAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -90,12 +92,14 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public BasqueAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -112,10 +116,10 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new BasqueStemmer());


@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Persian.
@@ -86,18 +87,20 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public PersianAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public PersianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public PersianAnalyzer(CharArraySet stopwords){
-    super(stopwords);
+  public PersianAnalyzer(Version matchVersion, CharArraySet stopwords){
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -112,8 +115,8 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new LowerCaseFilter(source);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);
@@ -121,7 +124,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
   }
 
   /**


@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.FinnishStemmer;
 
 /**
@@ -63,7 +64,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -75,17 +76,18 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public FinnishAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public FinnishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public FinnishAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -93,12 +95,14 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public FinnishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -115,10 +119,10 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new FinnishStemmer());


@@ -59,7 +59,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   /** Default set of articles for ElisionFilter */
   public static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
-      new CharArraySet(Arrays.asList(
+      new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
           "l", "m", "t", "qu", "n", "s", "j", "d", "c", "jusqu", "quoiqu", "lorsqu", "puisqu"), true));
 
   /**
@@ -80,7 +80,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -92,33 +92,37 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
    */
-  public FrenchAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public FrenchAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public FrenchAnalyzer(CharArraySet stopwords){
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords){
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclutionSet
    *          a stemming exclusion set
    */
-  public FrenchAnalyzer(CharArraySet stopwords,
+  public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords,
       CharArraySet stemExclutionSet) {
-    super(stopwords);
+    super(matchVersion, stopwords);
     this.excltable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(stemExclutionSet));
+        .copy(matchVersion, stemExclutionSet));
   }
 
   /**
@@ -135,11 +139,11 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!excltable.isEmpty())
       result = new SetKeywordMarkerFilter(result, excltable);
     result = new FrenchLightStemFilter(result);


@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.IrishStemmer;
 
 /**
@@ -44,7 +45,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
   private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
-      new CharArraySet(
+      new CharArraySet(Version.LUCENE_CURRENT,
           Arrays.asList(
               "d", "m", "b"
           ), true));
@@ -55,7 +56,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
    * with phrase queries versus tAthair (which would not have a gap).
    */
   private static final CharArraySet HYPHENATIONS = CharArraySet.unmodifiableSet(
-      new CharArraySet(
+      new CharArraySet(Version.LUCENE_CURRENT,
          Arrays.asList(
              "h", "n", "t"
          ), true));
@@ -90,17 +91,18 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public IrishAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public IrishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public IrishAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public IrishAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -108,12 +110,14 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public IrishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public IrishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -130,12 +134,12 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new StopFilter(result, HYPHENATIONS);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new StopFilter(matchVersion, result, HYPHENATIONS);
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
     result = new IrishLowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new IrishStemmer());


@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Galician.
@@ -61,7 +62,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -73,17 +74,18 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public GalicianAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public GalicianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public GalicianAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -91,12 +93,14 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public GalicianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
 
   /**
@@ -113,10 +117,10 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new GalicianStemFilter(result);
@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter; import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.util.Version;
/** /**
* Analyzer for Hindi. * Analyzer for Hindi.
@ -74,29 +75,32 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the given stop words * Builds an analyzer with the given stop words
* *
* @param version lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a stemming exclusion set * @param stemExclusionSet a stemming exclusion set
*/ */
public HindiAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public HindiAnalyzer(Version version, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(version, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(
CharArraySet.copy(matchVersion, stemExclusionSet));
} }
/** /**
* Builds an analyzer with the given stop words * Builds an analyzer with the given stop words
* *
* @param version lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public HindiAnalyzer(CharArraySet stopwords) { public HindiAnalyzer(Version version, CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET); this(version, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
* Builds an analyzer with the default stop words: * Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}. * {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public HindiAnalyzer() { public HindiAnalyzer(Version version) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(version, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
@ -113,13 +117,13 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new LowerCaseFilter(source); TokenStream result = new LowerCaseFilter(matchVersion, source);
if (!stemExclusionSet.isEmpty()) if (!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new IndicNormalizationFilter(result); result = new IndicNormalizationFilter(result);
result = new HindiNormalizationFilter(result); result = new HindiNormalizationFilter(result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
result = new HindiStemFilter(result); result = new HindiStemFilter(result);
return new TokenStreamComponents(source, result); return new TokenStreamComponents(source, result);
} }
@ -34,6 +34,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.HungarianStemmer; import org.tartarus.snowball.ext.HungarianStemmer;
/** /**
@ -63,7 +64,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) { } catch (IOException ex) {
// default set should always be present as it is part of the // default set should always be present as it is part of the
// distribution (JAR) // distribution (JAR)
@ -75,17 +76,18 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public HungarianAnalyzer() { public HungarianAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public HungarianAnalyzer(CharArraySet stopwords) { public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
@ -93,12 +95,14 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming. * stemming.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public HungarianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
} }
/** /**
@ -115,10 +119,10 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty()) if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new HungarianStemmer()); result = new SnowballFilter(result, new HungarianStemmer());
@ -215,7 +215,7 @@ final class Stemmer {
if (stems.size() < 2) { if (stems.size() < 2) {
return stems; return stems;
} }
CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase); CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase);
List<CharsRef> deduped = new ArrayList<>(); List<CharsRef> deduped = new ArrayList<>();
for (CharsRef s : stems) { for (CharsRef s : stems) {
if (!terms.contains(s)) { if (!terms.contains(s)) {
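The dedup here only needs set membership; a standalone sketch of the same CharArraySet idiom (the class name and strings are illustrative):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class DedupSketch {
  public static void main(String[] args) {
    List<String> stems = Arrays.asList("fiets", "fiet", "fiets");
    // fixed version, sized like the Stemmer's set; false = case-sensitive
    CharArraySet seen = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
    List<String> deduped = new ArrayList<>();
    for (String s : stems) {
      if (!seen.contains(s)) {
        deduped.add(s);
        seen.add(s);
      }
    }
    System.out.println(deduped); // prints [fiets, fiet]
  }
}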
@ -31,6 +31,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.ArmenianStemmer; import org.tartarus.snowball.ext.ArmenianStemmer;
/** /**
@ -72,17 +73,18 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public ArmenianAnalyzer() { public ArmenianAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public ArmenianAnalyzer(CharArraySet stopwords) { public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
@ -90,12 +92,14 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming. * stemming.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public ArmenianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
} }
/** /**
@ -112,10 +116,10 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty()) if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new ArmenianStemmer()); result = new SnowballFilter(result, new ArmenianStemmer());
@ -29,6 +29,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
/** /**
* Analyzer for Indonesian (Bahasa) * Analyzer for Indonesian (Bahasa)
@ -68,18 +69,20 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public IndonesianAnalyzer() { public IndonesianAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
* Builds an analyzer with the given stop words * Builds an analyzer with the given stop words
* *
* @param matchVersion
* lucene compatibility version
* @param stopwords * @param stopwords
* a stopword set * a stopword set
*/ */
public IndonesianAnalyzer(CharArraySet stopwords){ public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords){
this(stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
@ -87,14 +90,17 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* {@link IndonesianStemFilter}. * {@link IndonesianStemFilter}.
* *
* @param matchVersion
* lucene compatibility version
* @param stopwords * @param stopwords
* a stopword set * a stopword set
* @param stemExclusionSet * @param stemExclusionSet
* a set of terms not to be stemmed * a set of terms not to be stemmed
*/ */
public IndonesianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet){ public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
super(stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
} }
/** /**
@ -110,10 +116,10 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) { if (!stemExclusionSet.isEmpty()) {
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
} }
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/** /**
* {@link Analyzer} for Italian. * {@link Analyzer} for Italian.
@ -47,7 +48,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt"; public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet( private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
new CharArraySet( new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList( Arrays.asList(
"c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell",
"gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d" "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
@ -71,7 +72,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) { } catch (IOException ex) {
// default set should always be present as it is part of the // default set should always be present as it is part of the
// distribution (JAR) // distribution (JAR)
@ -83,17 +84,18 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public ItalianAnalyzer() { public ItalianAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public ItalianAnalyzer(CharArraySet stopwords) { public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
@ -101,12 +103,14 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming. * stemming.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public ItalianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
} }
/** /**
@ -123,11 +127,11 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new ElisionFilter(result, DEFAULT_ARTICLES); result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty()) if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new ItalianLightStemFilter(result); result = new ItalianLightStemFilter(result);
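The elision step runs before lowercasing and strips the article set pinned above, so for example l'amico indexes as amico. A hedged sketch of the same pattern with a custom, purely illustrative article set:

import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.util.Version;

public class ElisionDemo {
  static final CharArraySet ARTICLES = CharArraySet.unmodifiableSet(
      new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("l", "un"), true));

  static final Analyzer ANALYZER = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new StandardTokenizer(Version.LUCENE_CURRENT);
      // strips a leading article plus apostrophe: l'amico -> amico
      TokenStream result = new ElisionFilter(source, ARTICLES);
      return new TokenStreamComponents(source, result);
    }
  };
}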
@ -33,6 +33,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/** /**
* {@link Analyzer} for Latvian. * {@link Analyzer} for Latvian.
@ -61,7 +62,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class, DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) { } catch (IOException ex) {
// default set should always be present as it is part of the // default set should always be present as it is part of the
// distribution (JAR) // distribution (JAR)
@ -73,17 +74,18 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public LatvianAnalyzer() { public LatvianAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public LatvianAnalyzer(CharArraySet stopwords) { public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
@ -91,12 +93,14 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming. * stemming.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public LatvianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
} }
/** /**
@ -113,10 +117,10 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty()) if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new LatvianStemFilter(result); result = new LatvianStemFilter(result);
@ -82,7 +82,7 @@ public class CapitalizationFilterFactory extends TokenFilterFactory {
boolean ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false); boolean ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);
Set<String> k = getSet(args, KEEP); Set<String> k = getSet(args, KEEP);
if (k != null) { if (k != null) {
keep = new CharArraySet(10, ignoreCase); keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
keep.addAll(k); keep.addAll(k);
} }
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/** /**
* Removes words that are too long or too short from the stream. * Removes words that are too long or too short from the stream.
@ -38,12 +39,13 @@ public final class CodepointCountFilter extends FilteringTokenFilter {
* Create a new {@link CodepointCountFilter}. This will filter out tokens whose * Create a new {@link CodepointCountFilter}. This will filter out tokens whose
* {@link CharTermAttribute} is either too short ({@link Character#codePointCount(char[], int, int)} * {@link CharTermAttribute} is either too short ({@link Character#codePointCount(char[], int, int)}
* &lt; min) or too long ({@link Character#codePointCount(char[], int, int)} &gt; max). * &lt; min) or too long ({@link Character#codePointCount(char[], int, int)} &gt; max).
* @param version the Lucene match version
* @param in the {@link TokenStream} to consume * @param in the {@link TokenStream} to consume
* @param min the minimum length * @param min the minimum length
* @param max the maximum length * @param max the maximum length
*/ */
public CodepointCountFilter(TokenStream in, int min, int max) { public CodepointCountFilter(Version version, TokenStream in, int min, int max) {
super(in); super(version, in);
if (min < 0) { if (min < 0) {
throw new IllegalArgumentException("minimum length must be greater than or equal to zero"); throw new IllegalArgumentException("minimum length must be greater than or equal to zero");
} }
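Unlike LengthFilter further down, the bounds here count Unicode code points, so a supplementary character counts once rather than twice. A wiring sketch with illustrative bounds:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public class CodepointCountDemo {
  static final Analyzer ANALYZER = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new StandardTokenizer(Version.LUCENE_CURRENT);
      // keep only tokens of 2..40 code points, dropping the rest
      TokenStream result = new CodepointCountFilter(Version.LUCENE_CURRENT, source, 2, 40);
      return new TokenStreamComponents(source, result);
    }
  };
}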
@ -50,6 +50,6 @@ public class CodepointCountFilterFactory extends TokenFilterFactory {
@Override @Override
public CodepointCountFilter create(TokenStream input) { public CodepointCountFilter create(TokenStream input) {
return new CodepointCountFilter(input, min, max); return new CodepointCountFilter(luceneMatchVersion, input, min, max);
} }
} }
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/** /**
* A TokenFilter that only keeps tokens with text contained in the * A TokenFilter that only keeps tokens with text contained in the
@ -36,11 +37,12 @@ public final class KeepWordFilter extends FilteringTokenFilter {
* Create a new {@link KeepWordFilter}. * Create a new {@link KeepWordFilter}.
* <p><b>NOTE</b>: The words set passed to this constructor will be directly * <p><b>NOTE</b>: The words set passed to this constructor will be directly
* used by this filter and should not be modified. * used by this filter and should not be modified.
* @param version the Lucene match version
* @param in the {@link TokenStream} to consume * @param in the {@link TokenStream} to consume
* @param words the words to keep * @param words the words to keep
*/ */
public KeepWordFilter(TokenStream in, CharArraySet words) { public KeepWordFilter(Version version, TokenStream in, CharArraySet words) {
super(in); super(version, in);
this.words = words; this.words = words;
} }
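A sketch of the keep list in use; per the note above, the set is used directly by the filter, so it must not be mutated afterwards (the words are illustrative):

import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class KeepWordDemo {
  static final CharArraySet KEEP = new CharArraySet(
      Version.LUCENE_CURRENT, Arrays.asList("lucene", "search"), true); // true = ignore case

  static final Analyzer ANALYZER = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new StandardTokenizer(Version.LUCENE_CURRENT);
      // drops every token whose text is not in KEEP
      TokenStream result = new KeepWordFilter(Version.LUCENE_CURRENT, source, KEEP);
      return new TokenStreamComponents(source, result);
    }
  };
}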
@ -44,6 +44,7 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
/** Creates a new KeepWordFilterFactory */ /** Creates a new KeepWordFilterFactory */
public KeepWordFilterFactory(Map<String,String> args) { public KeepWordFilterFactory(Map<String,String> args) {
super(args); super(args);
assureMatchVersion();
wordFiles = get(args, "words"); wordFiles = get(args, "words");
ignoreCase = getBoolean(args, "ignoreCase", false); ignoreCase = getBoolean(args, "ignoreCase", false);
if (!args.isEmpty()) { if (!args.isEmpty()) {
@ -72,7 +73,7 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
if (words == null) { if (words == null) {
return input; return input;
} else { } else {
final TokenStream filter = new KeepWordFilter(input, words); final TokenStream filter = new KeepWordFilter(luceneMatchVersion, input, words);
return filter; return filter;
} }
} }
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/** /**
* Removes words that are too long or too short from the stream. * Removes words that are too long or too short from the stream.
@ -38,12 +39,13 @@ public final class LengthFilter extends FilteringTokenFilter {
* Create a new {@link LengthFilter}. This will filter out tokens whose * Create a new {@link LengthFilter}. This will filter out tokens whose
* {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()} * {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()}
* &lt; min) or too long ({@link CharTermAttribute#length()} &gt; max). * &lt; min) or too long ({@link CharTermAttribute#length()} &gt; max).
* @param version the Lucene match version
* @param in the {@link TokenStream} to consume * @param in the {@link TokenStream} to consume
* @param min the minimum length * @param min the minimum length
* @param max the maximum length * @param max the maximum length
*/ */
public LengthFilter(TokenStream in, int min, int max) { public LengthFilter(Version version, TokenStream in, int min, int max) {
super(in); super(version, in);
if (min < 0) { if (min < 0) {
throw new IllegalArgumentException("minimum length must be greater than or equal to zero"); throw new IllegalArgumentException("minimum length must be greater than or equal to zero");
} }
@ -50,7 +50,7 @@ public class LengthFilterFactory extends TokenFilterFactory {
@Override @Override
public LengthFilter create(TokenStream input) { public LengthFilter create(TokenStream input) {
final LengthFilter filter = new LengthFilter(input,min,max); final LengthFilter filter = new LengthFilter(luceneMatchVersion, input,min,max);
return filter; return filter;
} }
} }
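Factories pick the version up from the luceneMatchVersion argument instead (that is what the assureMatchVersion() call added to KeepWordFilterFactory above enforces). A hedged sketch of configuring this factory programmatically; in Solr the same args come from the schema:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LengthFilterFactory;

public class LengthFilterFactoryDemo {
  public static TokenStream wrap(TokenStream input) {
    Map<String, String> args = new HashMap<>();
    args.put("luceneMatchVersion", "LUCENE_CURRENT"); // parsed leniently into a Version
    args.put("min", "3");   // tokens shorter than 3 chars are dropped
    args.put("max", "20");  // tokens longer than 20 chars are dropped
    // any arg the factory does not recognize makes the constructor throw
    return new LengthFilterFactory(args).create(input);
  }
}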
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
@ -33,7 +34,8 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final CharArraySet previous = new CharArraySet(8, false); // use a fixed version, as we don't care about case sensitivity.
private final CharArraySet previous = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
/** /**
* Creates a new RemoveDuplicatesTokenFilter * Creates a new RemoveDuplicatesTokenFilter
@ -20,11 +20,15 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
/** /**
* Trims leading and trailing whitespace from Tokens in the stream. * Trims leading and trailing whitespace from Tokens in the stream.
* <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
* as it can lead to broken token streams.
*/ */
public final class TrimFilter extends TokenFilter { public final class TrimFilter extends TokenFilter {
@ -32,9 +36,10 @@ public final class TrimFilter extends TokenFilter {
/** /**
* Create a new {@link TrimFilter}. * Create a new {@link TrimFilter}.
* @param version the Lucene match version
* @param in the stream to consume * @param in the stream to consume
*/ */
public TrimFilter(TokenStream in) { public TrimFilter(Version version, TokenStream in) {
super(in); super(in);
} }
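Note the restored version parameter is accepted but never used (super(in) ignores it); this is exactly the kind of dead argument the commit adds back for API compatibility. A wiring sketch:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public class TrimDemo {
  static final Analyzer ANALYZER = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new StandardTokenizer(Version.LUCENE_CURRENT);
      // trims leading/trailing whitespace from each token; the version
      // argument is ignored by the filter itself
      TokenStream result = new TrimFilter(Version.LUCENE_CURRENT, source);
      return new TokenStreamComponents(source, result);
    }
  };
}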
@ -47,7 +47,7 @@ public class TrimFilterFactory extends TokenFilterFactory {
@Override @Override
public TrimFilter create(TokenStream input) { public TrimFilter create(TokenStream input) {
final TrimFilter filter = new TrimFilter(input); final TrimFilter filter = new TrimFilter(luceneMatchVersion, input);
return filter; return filter;
} }
} }
@ -80,7 +80,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} }
this.charUtils = version.onOrAfter(Version.LUCENE_4_4) this.charUtils = version.onOrAfter(Version.LUCENE_4_4)
? CharacterUtils.getInstance() ? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance(); : CharacterUtils.getJava4Instance();
this.minGram = minGram; this.minGram = minGram;
this.maxGram = maxGram; this.maxGram = maxGram;
@ -81,10 +81,10 @@ public final class NGramTokenFilter extends TokenFilter {
* @param maxGram the largest n-gram to generate * @param maxGram the largest n-gram to generate
*/ */
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) { public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE)); super(new CodepointCountFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version; this.version = version;
this.charUtils = version.onOrAfter(Version.LUCENE_4_4) this.charUtils = version.onOrAfter(Version.LUCENE_4_4)
? CharacterUtils.getInstance() ? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance(); : CharacterUtils.getJava4Instance();
if (minGram < 1) { if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero"); throw new IllegalArgumentException("minGram must be greater than zero");
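The version gates behavior here: 4.4 and later get code-point-aware CharacterUtils, older versions keep the Java-4 code-unit behavior. A wiring sketch with illustrative gram sizes:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public class NGramDemo {
  static final Analyzer ANALYZER = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new StandardTokenizer(Version.LUCENE_CURRENT);
      // emits all 2- and 3-grams of each token; tokens shorter than minGram
      // were already removed by the CodepointCountFilter wrapped in super() above
      TokenStream result = new NGramTokenFilter(Version.LUCENE_CURRENT, source, 2, 3);
      return new TokenStreamComponents(source, result);
    }
  };
}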
@ -121,7 +121,7 @@ public class NGramTokenizer extends Tokenizer {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer"); throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
} }
charUtils = version.onOrAfter(Version.LUCENE_4_4) charUtils = version.onOrAfter(Version.LUCENE_4_4)
? CharacterUtils.getInstance() ? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance(); : CharacterUtils.getJava4Instance();
if (minGram < 1) { if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero"); throw new IllegalArgumentException("minGram must be greater than zero");
@ -28,11 +28,13 @@ import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
@ -48,8 +50,10 @@ import java.nio.charset.StandardCharsets;
* A default set of stopwords is used unless an alternative list is specified, but the * A default set of stopwords is used unless an alternative list is specified, but the
* exclusion list is empty by default. * exclusion list is empty by default.
* </p> * </p>
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/ */
// TODO: extend StopwordAnalyzerBase
public final class DutchAnalyzer extends Analyzer { public final class DutchAnalyzer extends Analyzer {
/** File containing default Dutch stopwords. */ /** File containing default Dutch stopwords. */
@ -69,14 +73,14 @@ public final class DutchAnalyzer extends Analyzer {
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) { } catch (IOException ex) {
// default set should always be present as it is part of the // default set should always be present as it is part of the
// distribution (JAR) // distribution (JAR)
throw new RuntimeException("Unable to load default stopword set"); throw new RuntimeException("Unable to load default stopword set");
} }
DEFAULT_STEM_DICT = new CharArrayMap<>(4, false); DEFAULT_STEM_DICT = new CharArrayMap<>(Version.LUCENE_CURRENT, 4, false);
DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet
DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
DEFAULT_STEM_DICT.put("ei", "eier"); DEFAULT_STEM_DICT.put("ei", "eier");
@ -96,27 +100,29 @@ public final class DutchAnalyzer extends Analyzer {
private CharArraySet excltable = CharArraySet.EMPTY_SET; private CharArraySet excltable = CharArraySet.EMPTY_SET;
private final StemmerOverrideMap stemdict; private final StemmerOverrideMap stemdict;
private final Version matchVersion;
/** /**
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}) * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()})
* and a few default entries for the stem exclusion table. * and a few default entries for the stem exclusion table.
* *
*/ */
public DutchAnalyzer() { public DutchAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
} }
public DutchAnalyzer(CharArraySet stopwords){ public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
this(stopwords, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT); this(matchVersion, stopwords, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
} }
public DutchAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable){ public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
this(stopwords, stemExclusionTable, DefaultSetHolder.DEFAULT_STEM_DICT); this(matchVersion, stopwords, stemExclusionTable, DefaultSetHolder.DEFAULT_STEM_DICT);
} }
public DutchAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) { public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords)); this.matchVersion = matchVersion;
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable)); this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
if (stemOverrideDict.isEmpty()) { if (stemOverrideDict.isEmpty()) {
this.stemdict = null; this.stemdict = null;
} else { } else {
@ -148,10 +154,10 @@ public final class DutchAnalyzer extends Analyzer {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stoptable); result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty()) if (!excltable.isEmpty())
result = new SetKeywordMarkerFilter(result, excltable); result = new SetKeywordMarkerFilter(result, excltable);
if (stemdict != null) if (stemdict != null)
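A sketch of the four-argument constructor with a custom stem-override map; the entry mirrors the fiets default above and is only illustrative:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class DutchDemo {
  public static Analyzer build() {
    CharArrayMap<String> overrides = new CharArrayMap<>(Version.LUCENE_CURRENT, 1, false);
    overrides.put("fiets", "fiets"); // pin the stem so it is not reduced to "fiet"
    return new DutchAnalyzer(Version.LUCENE_CURRENT,
        DutchAnalyzer.getDefaultStopSet(), CharArraySet.EMPTY_SET, overrides);
  }
}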
@ -34,6 +34,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.NorwegianStemmer; import org.tartarus.snowball.ext.NorwegianStemmer;
/** /**
@ -63,7 +64,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) { } catch (IOException ex) {
// default set should always be present as it is part of the // default set should always be present as it is part of the
// distribution (JAR) // distribution (JAR)
@ -75,17 +76,18 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public NorwegianAnalyzer() { public NorwegianAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public NorwegianAnalyzer(CharArraySet stopwords) { public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
@ -93,12 +95,14 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming. * stemming.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public NorwegianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
} }
/** /**
@ -115,10 +119,10 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty()) if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new NorwegianStemmer()); result = new SnowballFilter(result, new NorwegianStemmer());
@ -34,6 +34,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/** /**
* {@link Analyzer} for Portuguese. * {@link Analyzer} for Portuguese.
@ -62,7 +63,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
static { static {
try { try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) { } catch (IOException ex) {
// default set should always be present as it is part of the // default set should always be present as it is part of the
// distribution (JAR) // distribution (JAR)
@ -74,17 +75,18 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
/** /**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/ */
public PortugueseAnalyzer() { public PortugueseAnalyzer(Version matchVersion) {
this(DefaultSetHolder.DEFAULT_STOP_SET); this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
} }
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
*/ */
public PortugueseAnalyzer(CharArraySet stopwords) { public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET); this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
} }
/** /**
@ -92,12 +94,14 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming. * stemming.
* *
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set * @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public PortugueseAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
} }
/** /**
@ -114,10 +118,10 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(result); result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty()) if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new PortugueseLightStemFilter(result); result = new PortugueseLightStemFilter(result);
@ -31,6 +31,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import static org.apache.lucene.analysis.util.StemmerUtil.*; import static org.apache.lucene.analysis.util.StemmerUtil.*;
@ -134,7 +135,8 @@ public abstract class RSLPStemmerBase {
if (!exceptions[i].endsWith(suffix)) if (!exceptions[i].endsWith(suffix))
throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'"); throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
} }
this.exceptions = new CharArraySet(Arrays.asList(exceptions), false); this.exceptions = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(exceptions), false);
} }
@Override @Override
@@ -31,6 +31,7 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.Version;
 /**
  * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
@@ -49,20 +50,23 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
   //The default maximum percentage (40%) of index documents which
   //can contain a term, after which the term is considered to be a stop word.
   public static final float defaultMaxDocFreqPercent = 0.4f;
+  private final Version matchVersion;
   /**
    * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
    * indexed fields from terms with a document frequency percentage greater than
    * {@link #defaultMaxDocFreqPercent}
    *
+   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader) throws IOException {
-    this(delegate, indexReader, defaultMaxDocFreqPercent);
+    this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
   }
   /**
@@ -70,16 +74,18 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * indexed fields from terms with a document frequency greater than the given
    * maxDocFreq
    *
+   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @param maxDocFreq Document frequency terms should be above in order to be stopwords
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader,
       int maxDocFreq) throws IOException {
-    this(delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxDocFreq);
+    this(matchVersion, delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxDocFreq);
   }
   /**
@@ -87,6 +93,7 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * indexed fields from terms with a document frequency percentage greater than
    * the given maxPercentDocs
    *
+   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
@@ -94,10 +101,11 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader,
       float maxPercentDocs) throws IOException {
-    this(delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxPercentDocs);
+    this(matchVersion, delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxPercentDocs);
   }
   /**
@@ -105,6 +113,7 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * given selection of fields from terms with a document frequency percentage
    * greater than the given maxPercentDocs
    *
+   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @param fields Selection of fields to calculate stopwords for
@@ -113,11 +122,12 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader,
       Collection<String> fields,
       float maxPercentDocs) throws IOException {
-    this(delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
+    this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
   }
   /**
@@ -125,6 +135,7 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * given selection of fields from terms with a document frequency greater than
    * the given maxDocFreq
    *
+   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @param fields Selection of fields to calculate stopwords for
@@ -132,11 +143,13 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
+      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader,
       Collection<String> fields,
       int maxDocFreq) throws IOException {
     super(delegate.getReuseStrategy());
+    this.matchVersion = matchVersion;
     this.delegate = delegate;
     for (String field : fields) {
@@ -168,8 +181,8 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
     if (stopWords == null) {
       return components;
     }
-    StopFilter stopFilter = new StopFilter(components.getTokenStream(),
-        new CharArraySet(stopWords, false));
+    StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(),
+        new CharArraySet(matchVersion, stopWords, false));
     return new TokenStreamComponents(components.getTokenizer(), stopFilter);
   }
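A sketch of query-time usage under the restored signatures; the index path and the WhitespaceAnalyzer delegate are assumptions for illustration, not part of the patch:

import java.io.File;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class QueryAutoStopWordSketch {
  public static void main(String[] args) throws Exception {
    // Assumed path to an existing index, purely for illustration.
    Directory dir = FSDirectory.open(new File("/tmp/index"));
    IndexReader reader = DirectoryReader.open(dir);
    try {
      // matchVersion is forwarded to the StopFilter built around the delegate;
      // terms in more than 40% of docs (the default) become stopwords.
      QueryAutoStopWordAnalyzer analyzer = new QueryAutoStopWordAnalyzer(
          Version.LUCENE_CURRENT,
          new WhitespaceAnalyzer(Version.LUCENE_CURRENT),
          reader);
      // ... hand analyzer to the query parser ...
    } finally {
      reader.close();
      dir.close();
    }
  }
}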
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.reverse;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
 import java.io.IOException;
@@ -35,6 +36,7 @@ public final class ReverseStringFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final char marker;
+  private final Version matchVersion;
   private static final char NOMARKER = '\uFFFF';
   /**
@@ -64,10 +66,11 @@ public final class ReverseStringFilter extends TokenFilter {
    * The reversed tokens will not be marked.
    * </p>
    *
+   * @param matchVersion Lucene compatibility version
    * @param in {@link TokenStream} to filter
    */
-  public ReverseStringFilter(TokenStream in) {
-    this(in, NOMARKER);
+  public ReverseStringFilter(Version matchVersion, TokenStream in) {
+    this(matchVersion, in, NOMARKER);
   }
   /**
@@ -78,11 +81,13 @@ public final class ReverseStringFilter extends TokenFilter {
    * character.
    * </p>
    *
+   * @param matchVersion compatibility version
    * @param in {@link TokenStream} to filter
    * @param marker A character used to mark reversed tokens
    */
-  public ReverseStringFilter(TokenStream in, char marker) {
+  public ReverseStringFilter(Version matchVersion, TokenStream in, char marker) {
     super(in);
+    this.matchVersion = matchVersion;
     this.marker = marker;
   }
@@ -95,7 +100,7 @@ public final class ReverseStringFilter extends TokenFilter {
         termAtt.resizeBuffer(len);
         termAtt.buffer()[len - 1] = marker;
       }
-      reverse( termAtt.buffer(), 0, len );
+      reverse( matchVersion, termAtt.buffer(), 0, len );
       termAtt.setLength(len);
       return true;
     } else {
@@ -106,43 +111,48 @@ public final class ReverseStringFilter extends TokenFilter {
   /**
    * Reverses the given input string
    *
+   * @param matchVersion compatibility version
    * @param input the string to reverse
    * @return the given input string in reversed order
    */
-  public static String reverse( final String input ){
+  public static String reverse( Version matchVersion, final String input ){
     final char[] charInput = input.toCharArray();
-    reverse( charInput, 0, charInput.length );
+    reverse( matchVersion, charInput, 0, charInput.length );
     return new String( charInput );
   }
   /**
    * Reverses the given input buffer in-place
+   * @param matchVersion compatibility version
    * @param buffer the input char array to reverse
    */
-  public static void reverse(final char[] buffer) {
-    reverse(buffer, 0, buffer.length);
+  public static void reverse(Version matchVersion, final char[] buffer) {
+    reverse(matchVersion, buffer, 0, buffer.length);
   }
   /**
    * Partially reverses the given input buffer in-place from offset 0
    * up to the given length.
+   * @param matchVersion compatibility version
    * @param buffer the input char array to reverse
    * @param len the length in the buffer up to where the
    *        buffer should be reversed
    */
-  public static void reverse(final char[] buffer, final int len) {
-    reverse( buffer, 0, len );
+  public static void reverse(Version matchVersion, final char[] buffer,
+      final int len) {
+    reverse( matchVersion, buffer, 0, len );
   }
   /**
    * Partially reverses the given input buffer in-place from the given offset
    * up to the given length.
+   * @param matchVersion compatibility version
    * @param buffer the input char array to reverse
    * @param start the offset from where to reverse the buffer
    * @param len the length in the buffer up to where the
    *        buffer should be reversed
    */
-  public static void reverse(final char[] buffer,
-      final int start, final int len) {
+  public static void reverse(Version matchVersion, final char[] buffer,
+      final int start, final int len) {
     /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
     if (len < 2)
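The static reverse overloads now lead with the version as well; a small sketch of the String variant:

import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.util.Version;

public class ReverseSketch {
  public static void main(String[] args) {
    // With matchVersion >= 3.1 reversal keeps surrogate pairs intact;
    // older versions reproduce the old char-by-char behavior.
    String reversed = ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "lucene");
    System.out.println(reversed); // prints "enecul"
  }
}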
@@ -40,6 +40,7 @@ public class ReverseStringFilterFactory extends TokenFilterFactory {
   /** Creates a new ReverseStringFilterFactory */
   public ReverseStringFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -47,7 +48,7 @@ public class ReverseStringFilterFactory extends TokenFilterFactory {
   @Override
   public ReverseStringFilter create(TokenStream in) {
-    return new ReverseStringFilter(in);
+    return new ReverseStringFilter(luceneMatchVersion, in);
   }
 }
@@ -78,17 +78,18 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public RomanianAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public RomanianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public RomanianAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
   /**
@@ -96,12 +97,14 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public RomanianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
   /**
@@ -118,10 +121,10 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new RomanianStemmer());
@@ -54,7 +54,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -74,30 +74,34 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
     return DefaultSetHolder.DEFAULT_STOP_SET;
   }
-  public RussianAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public RussianAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public RussianAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public RussianAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
   /**
    * Builds an analyzer with the given stop words
    *
+   * @param matchVersion
+   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclusionSet a set of words not to be stemmed
    */
-  public RussianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
   }
   /**
@@ -113,10 +117,10 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if (!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
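A construction sketch with explicit stopword and stem-exclusion sets; both word lists are invented for illustration:

import java.util.Arrays;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class RussianAnalyzerSketch {
  public static void main(String[] args) {
    Version v = Version.LUCENE_CURRENT;
    CharArraySet stopwords = new CharArraySet(v, Arrays.asList("и", "в"), true);
    CharArraySet noStem = new CharArraySet(v, Arrays.asList("лукоморье"), false);
    // The analyzer defensively copies noStem with the same matchVersion.
    RussianAnalyzer analyzer = new RussianAnalyzer(v, stopwords, noStem);
    // ... analyzer.tokenStream("body", "...") as usual ...
    analyzer.close();
  }
}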
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.shingle;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.AnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
 /**
  * A ShingleAnalyzerWrapper wraps a {@link ShingleFilter} around another {@link Analyzer}.
@@ -100,15 +101,15 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
   /**
    * Wraps {@link StandardAnalyzer}.
    */
-  public ShingleAnalyzerWrapper() {
-    this(ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
+  public ShingleAnalyzerWrapper(Version matchVersion) {
+    this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
   }
   /**
    * Wraps {@link StandardAnalyzer}.
    */
-  public ShingleAnalyzerWrapper(int minShingleSize, int maxShingleSize) {
-    this(new StandardAnalyzer(), minShingleSize, maxShingleSize);
+  public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) {
+    this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize);
   }
   /**
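Usage sketch: the version now flows into the wrapped StandardAnalyzer; the shingle sizes are illustrative:

import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.util.Version;

public class ShingleWrapperSketch {
  public static void main(String[] args) {
    // Wraps a version-aware StandardAnalyzer; emits 2- and 3-word shingles.
    ShingleAnalyzerWrapper shingler =
        new ShingleAnalyzerWrapper(Version.LUCENE_CURRENT, 2, 3);
    // ... shingler.tokenStream("field", "please divide this sentence") ...
    shingler.close();
  }
}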
@@ -17,14 +17,16 @@ package org.apache.lucene.analysis.standard;
  * limitations under the License.
  */
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.Version;
+import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
@@ -32,6 +34,18 @@ import java.io.Reader;
  * Filters {@link ClassicTokenizer} with {@link ClassicFilter}, {@link
  * LowerCaseFilter} and {@link StopFilter}, using a list of
  * English stop words.
+ *
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ClassicAnalyzer:
+ * <ul>
+ *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
+ *        supplementary characters in stopwords
+ *   <li> As of 2.9, StopFilter preserves position
+ *        increments
+ *   <li> As of 2.4, Tokens incorrectly identified as acronyms
+ *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
+ * </ul>
  *
  * ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
  * As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,
@@ -49,23 +63,29 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
   public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
   /** Builds an analyzer with the given stop words.
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
    * @param stopWords stop words */
-  public ClassicAnalyzer(CharArraySet stopWords) {
-    super(stopWords);
+  public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) {
+    super(matchVersion, stopWords);
   }
   /** Builds an analyzer with the default stop words ({@link
    * #STOP_WORDS_SET}).
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
    */
-  public ClassicAnalyzer() {
-    this(STOP_WORDS_SET);
+  public ClassicAnalyzer(Version matchVersion) {
+    this(matchVersion, STOP_WORDS_SET);
   }
   /** Builds an analyzer with the stop words from the given reader.
-   * @see WordlistLoader#getWordSet(Reader)
+   * @see WordlistLoader#getWordSet(Reader, Version)
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
    * @param stopwords Reader to read stop words from */
-  public ClassicAnalyzer(Reader stopwords) throws IOException {
-    this(loadStopwordSet(stopwords));
+  public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
   /**
@@ -87,11 +107,11 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(final String fieldName) {
-    final ClassicTokenizer src = new ClassicTokenizer();
+    final ClassicTokenizer src = new ClassicTokenizer(matchVersion);
     src.setMaxTokenLength(maxTokenLength);
     TokenStream tok = new ClassicFilter(src);
-    tok = new LowerCaseFilter(tok);
-    tok = new StopFilter(tok, stopwords);
+    tok = new LowerCaseFilter(matchVersion, tok);
+    tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) throws IOException {
@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis.standard;
 import java.io.IOException;
+import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -25,6 +26,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 /** A grammar-based tokenizer constructed with JFlex
  *
@@ -99,19 +102,19 @@ public final class ClassicTokenizer extends Tokenizer {
   *
   * See http://issues.apache.org/jira/browse/LUCENE-1068
   */
-  public ClassicTokenizer() {
-    init();
+  public ClassicTokenizer(Version matchVersion) {
+    init(matchVersion);
   }
   /**
   * Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
   */
-  public ClassicTokenizer(AttributeFactory factory) {
+  public ClassicTokenizer(Version matchVersion, AttributeFactory factory) {
     super(factory);
-    init();
+    init(matchVersion);
   }
-  private void init() {
+  private void init(Version matchVersion) {
     this.scanner = new ClassicTokenizerImpl(input);
   }
@@ -37,6 +37,7 @@ public class ClassicTokenizerFactory extends TokenizerFactory {
   /** Creates a new ClassicTokenizerFactory */
   public ClassicTokenizerFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     maxTokenLength = getInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -45,7 +46,7 @@ public class ClassicTokenizerFactory extends TokenizerFactory {
   @Override
   public ClassicTokenizer create(AttributeFactory factory) {
-    ClassicTokenizer tokenizer = new ClassicTokenizer(factory);
+    ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, factory);
     tokenizer.setMaxTokenLength(maxTokenLength);
     return tokenizer;
   }
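Since assureMatchVersion() is back, factory args must carry a luceneMatchVersion entry or construction fails. A sketch; the version string and maxTokenLength value are illustrative, so use whatever your build's Version.parseLeniently accepts:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class ClassicTokenizerFactorySketch {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<>();
    params.put("luceneMatchVersion", "4.9"); // required again by assureMatchVersion()
    params.put("maxTokenLength", "255");
    ClassicTokenizerFactory factory = new ClassicTokenizerFactory(params);
    ClassicTokenizer tokenizer =
        factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    // ... tokenizer.setReader(...), then reset()/incrementToken() to consume ...
  }
}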
@@ -17,14 +17,16 @@ package org.apache.lucene.analysis.standard;
  * limitations under the License.
  */
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.Version;
+import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
@@ -32,9 +34,26 @@ import java.io.Reader;
  * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
  * LowerCaseFilter} and {@link StopFilter}, using a list of
  * English stop words.
+ *
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating StandardAnalyzer:
+ * <ul>
+ *   <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
+ *        from their combining characters. If you use a previous version number,
+ *        you get the exact broken behavior for backwards compatibility.
+ *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation,
+ *        and StopFilter correctly handles Unicode 4.0 supplementary characters
+ *        in stopwords. {@link ClassicTokenizer} and {@link ClassicAnalyzer}
+ *        are the pre-3.1 implementations of StandardTokenizer and
+ *        StandardAnalyzer.
+ *   <li> As of 2.9, StopFilter preserves position increments
+ *   <li> As of 2.4, Tokens incorrectly identified as acronyms
+ *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
+ * </ul>
  */
 public final class StandardAnalyzer extends StopwordAnalyzerBase {
   /** Default maximum allowed token length */
   public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
@@ -45,22 +64,29 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
   public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
   /** Builds an analyzer with the given stop words.
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
    * @param stopWords stop words */
-  public StandardAnalyzer(CharArraySet stopWords) {
-    super(stopWords);
+  public StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
+    super(matchVersion, stopWords);
   }
-  /** Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
+  /** Builds an analyzer with the default stop words ({@link
+   * #STOP_WORDS_SET}).
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
   */
-  public StandardAnalyzer() {
-    this(STOP_WORDS_SET);
+  public StandardAnalyzer(Version matchVersion) {
+    this(matchVersion, STOP_WORDS_SET);
   }
   /** Builds an analyzer with the stop words from the given reader.
-   * @see WordlistLoader#getWordSet(Reader)
+   * @see WordlistLoader#getWordSet(Reader, Version)
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
    * @param stopwords Reader to read stop words from */
-  public StandardAnalyzer(Reader stopwords) throws IOException {
-    this(loadStopwordSet(stopwords));
+  public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
   /**
@@ -82,11 +108,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(final String fieldName) {
-    final StandardTokenizer src = new StandardTokenizer();
+    final StandardTokenizer src = new StandardTokenizer(matchVersion);
     src.setMaxTokenLength(maxTokenLength);
-    TokenStream tok = new StandardFilter(src);
-    tok = new LowerCaseFilter(tok);
-    tok = new StopFilter(tok, stopwords);
+    TokenStream tok = new StandardFilter(matchVersion, src);
+    tok = new LowerCaseFilter(matchVersion, tok);
+    tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) throws IOException {
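End-to-end sketch of the restored version-aware chain; the field name and sample text are arbitrary:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class StandardAnalyzerSketch {
  public static void main(String[] args) throws IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream("body", "The Quick brown fox");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // quick, brown, fox ("the" is a stopword)
    }
    ts.end();
    ts.close();
    analyzer.close();
  }
}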
@@ -21,13 +21,14 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
 /**
  * Normalizes tokens extracted with {@link StandardTokenizer}.
  */
 public class StandardFilter extends TokenFilter {
-  public StandardFilter(TokenStream in) {
+  public StandardFilter(Version matchVersion, TokenStream in) {
     super(in);
   }
@@ -38,6 +38,7 @@ public class StandardFilterFactory extends TokenFilterFactory {
   /** Creates a new StandardFilterFactory */
   public StandardFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -45,6 +46,6 @@ public class StandardFilterFactory extends TokenFilterFactory {
   @Override
   public StandardFilter create(TokenStream input) {
-    return new StandardFilter(input);
+    return new StandardFilter(luceneMatchVersion, input);
   }
 }
@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis.standard;
 import java.io.IOException;
+import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -25,6 +26,8 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 /** A grammar-based tokenizer constructed with JFlex.
  * <p>
@@ -113,19 +116,19 @@ public final class StandardTokenizer extends Tokenizer {
   * See http://issues.apache.org/jira/browse/LUCENE-1068
   */
-  public StandardTokenizer() {
-    init();
+  public StandardTokenizer(Version matchVersion) {
+    init(matchVersion);
   }
   /**
   * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
   */
-  public StandardTokenizer(AttributeFactory factory) {
+  public StandardTokenizer(Version matchVersion, AttributeFactory factory) {
     super(factory);
-    init();
+    init(matchVersion);
   }
-  private void init() {
+  private void init(Version matchVersion) {
     this.scanner = new StandardTokenizerImpl(input);
   }
@@ -37,6 +37,7 @@ public class StandardTokenizerFactory extends TokenizerFactory {
   /** Creates a new StandardTokenizerFactory */
   public StandardTokenizerFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     maxTokenLength = getInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -45,7 +46,7 @@ public class StandardTokenizerFactory extends TokenizerFactory {
   @Override
   public StandardTokenizer create(AttributeFactory factory) {
-    StandardTokenizer tokenizer = new StandardTokenizer(factory);
+    StandardTokenizer tokenizer = new StandardTokenizer(luceneMatchVersion, factory);
     tokenizer.setMaxTokenLength(maxTokenLength);
     return tokenizer;
   }
@@ -34,9 +34,15 @@ import java.io.Reader;
  * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and
  * {@link org.apache.lucene.analysis.core.StopFilter}, using a list of
  * English stop words.
+ *
+ * <a name="version"/>
+ * <p>
+ * You must specify the required {@link org.apache.lucene.util.Version}
+ * compatibility when creating UAX29URLEmailAnalyzer
+ * </p>
  */
 public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
   /** Default maximum allowed token length */
   public static final int DEFAULT_MAX_TOKEN_LENGTH = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
@@ -47,23 +53,29 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
   public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
   /** Builds an analyzer with the given stop words.
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
    * @param stopWords stop words */
-  public UAX29URLEmailAnalyzer(CharArraySet stopWords) {
-    super(stopWords);
+  public UAX29URLEmailAnalyzer(Version matchVersion, CharArraySet stopWords) {
+    super(matchVersion, stopWords);
   }
   /** Builds an analyzer with the default stop words ({@link
    * #STOP_WORDS_SET}).
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
   */
-  public UAX29URLEmailAnalyzer() {
-    this(STOP_WORDS_SET);
+  public UAX29URLEmailAnalyzer(Version matchVersion) {
+    this(matchVersion, STOP_WORDS_SET);
   }
   /** Builds an analyzer with the stop words from the given reader.
-   * @see org.apache.lucene.analysis.util.WordlistLoader#getWordSet(java.io.Reader)
+   * @see org.apache.lucene.analysis.util.WordlistLoader#getWordSet(java.io.Reader, org.apache.lucene.util.Version)
+   * @param matchVersion Lucene version to match See {@link
+   * <a href="#version">above</a>}
    * @param stopwords Reader to read stop words from */
-  public UAX29URLEmailAnalyzer(Reader stopwords) throws IOException {
-    this(loadStopwordSet(stopwords));
+  public UAX29URLEmailAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
   /**
@@ -85,11 +97,11 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(final String fieldName) {
-    final UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer();
+    final UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(matchVersion);
     src.setMaxTokenLength(maxTokenLength);
-    TokenStream tok = new StandardFilter(src);
-    tok = new LowerCaseFilter(tok);
-    tok = new StopFilter(tok, stopwords);
+    TokenStream tok = new StandardFilter(matchVersion, src);
+    tok = new LowerCaseFilter(matchVersion, tok);
+    tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) throws IOException {
@@ -18,6 +18,9 @@ package org.apache.lucene.analysis.standard;
  */
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -25,6 +28,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 /**
  * This class implements Word Break rules from the Unicode Text Segmentation
@@ -95,19 +100,19 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
   * the <code>input</code> to the newly created JFlex scanner.
   */
-  public UAX29URLEmailTokenizer() {
-    this.scanner = getScanner();
+  public UAX29URLEmailTokenizer(Version matchVersion) {
+    this.scanner = getScannerFor(matchVersion);
   }
   /**
   * Creates a new UAX29URLEmailTokenizer with a given {@link AttributeFactory}
   */
-  public UAX29URLEmailTokenizer(AttributeFactory factory) {
+  public UAX29URLEmailTokenizer(Version matchVersion, AttributeFactory factory) {
     super(factory);
-    this.scanner = getScanner();
+    this.scanner = getScannerFor(matchVersion);
   }
-  private StandardTokenizerInterface getScanner() {
+  private StandardTokenizerInterface getScannerFor(Version matchVersion) {
     return new UAX29URLEmailTokenizerImpl(input);
   }
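Direct-tokenizer sketch; the sample text is invented, and URLs/e-mails surface as single tokens with their own types:

import java.io.StringReader;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class UrlEmailSketch {
  public static void main(String[] args) throws Exception {
    UAX29URLEmailTokenizer tok = new UAX29URLEmailTokenizer(Version.LUCENE_CURRENT);
    tok.setReader(new StringReader("mail foo@example.com or see http://example.com/docs"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tok.addAttribute(TypeAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term + " : " + type.type()); // e.g. "foo@example.com : <EMAIL>"
    }
    tok.end();
    tok.close();
  }
}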
@@ -38,6 +38,7 @@ public class UAX29URLEmailTokenizerFactory extends TokenizerFactory {
   /** Creates a new UAX29URLEmailTokenizerFactory */
   public UAX29URLEmailTokenizerFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     maxTokenLength = getInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -46,7 +47,7 @@ public class UAX29URLEmailTokenizerFactory extends TokenizerFactory {
   @Override
   public UAX29URLEmailTokenizer create(AttributeFactory factory) {
-    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(factory);
+    UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, factory);
     tokenizer.setMaxTokenLength(maxTokenLength);
     return tokenizer;
   }
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.SwedishStemmer;
 /**
@@ -63,7 +64,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -75,17 +76,18 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public SwedishAnalyzer() {
-    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  public SwedishAnalyzer(Version matchVersion) {
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
   /**
    * Builds an analyzer with the given stop words.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public SwedishAnalyzer(CharArraySet stopwords) {
-    this(stopwords, CharArraySet.EMPTY_SET);
+  public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords) {
+    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
   }
   /**
@@ -93,12 +95,14 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
+   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public SwedishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  public SwedishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(matchVersion, stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+        matchVersion, stemExclusionSet));
   }
   /**
@@ -115,10 +119,10 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer();
-    TokenStream result = new StandardFilter(source);
-    result = new LowerCaseFilter(result);
-    result = new StopFilter(result, stopwords);
+    final Tokenizer source = new StandardTokenizer(matchVersion);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new SwedishStemmer());
@@ -134,8 +134,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
       analyzer = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
-          Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create();
-          TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
+          Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT) : factory.create();
+          TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
           return new TokenStreamComponents(tokenizer, stream);
         }
       };
@@ -202,12 +202,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
   private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException {
     Class<? extends Analyzer> clazz = loader.findClass(cname, Analyzer.class);
     try {
-      Analyzer analyzer = null;
-      try {
-        analyzer = clazz.getConstructor().newInstance();
-      } catch (NoSuchMethodException e) {
-        analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_CURRENT);
-      }
+      Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_CURRENT);
       if (analyzer instanceof ResourceLoaderAware) {
         ((ResourceLoaderAware) analyzer).inform(loader);
       }
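Because loadAnalyzer() now resolves getConstructor(Version.class) unconditionally, any analyzer named by the factory's analyzer attribute must expose a one-argument Version constructor again. A minimal compliant stub; the class name is hypothetical:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public class MySynonymParsingAnalyzer extends Analyzer {
  private final Version matchVersion;

  public MySynonymParsingAnalyzer(Version matchVersion) { // required by loadAnalyzer()
    this.matchVersion = matchVersion;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new WhitespaceTokenizer(matchVersion);
    return new TokenStreamComponents(tokenizer);
  }
}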
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -34,7 +35,6 @@ import org.apache.lucene.util.Version;
  * {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
  */
 public final class ThaiAnalyzer extends StopwordAnalyzerBase {
-  private final Version matchVersion;
   /** File containing default Thai stopwords. */
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@@ -87,8 +87,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
    * @param stopwords a stopword set
    */
   public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    super(stopwords);
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
   /**
@@ -105,15 +104,15 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
   protected TokenStreamComponents createComponents(String fieldName) {
     if (matchVersion.onOrAfter(Version.LUCENE_4_8)) {
       final Tokenizer source = new ThaiTokenizer();
-      TokenStream result = new LowerCaseFilter(source);
-      result = new StopFilter(result, stopwords);
+      TokenStream result = new LowerCaseFilter(matchVersion, source);
+      result = new StopFilter(matchVersion, result, stopwords);
       return new TokenStreamComponents(source, result);
     } else {
-      final Tokenizer source = new StandardTokenizer();
-      TokenStream result = new StandardFilter(source);
-      result = new LowerCaseFilter(result);
-      result = new ThaiWordFilter(result);
-      return new TokenStreamComponents(source, new StopFilter(result, stopwords));
+      final Tokenizer source = new StandardTokenizer(matchVersion);
+      TokenStream result = new StandardFilter(matchVersion, source);
+      result = new LowerCaseFilter(matchVersion, result);
+      result = new ThaiWordFilter(matchVersion, result);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
     }
   }
 }
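The matchVersion stored by the superclass still drives the chain switch above; a sketch of both branches, assuming the deprecated pre-4.8 constants are still exposed by your build:

import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.util.Version;

public class ThaiAnalyzerSketch {
  public static void main(String[] args) {
    // 4.8 and later: ThaiTokenizer + LowerCaseFilter + StopFilter.
    ThaiAnalyzer current = new ThaiAnalyzer(Version.LUCENE_4_8);
    // Before 4.8: StandardTokenizer + ThaiWordFilter, for index compatibility.
    ThaiAnalyzer legacy = new ThaiAnalyzer(Version.LUCENE_4_7);
    // ... tokenStream("body", "...") on either ...
    current.close();
    legacy.close();
  }
}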
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArrayIterator;
 import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 /**
  * {@link TokenFilter} that use {@link java.text.BreakIterator} to break each
@@ -60,7 +61,7 @@ public final class ThaiWordFilter extends TokenFilter {
   private boolean hasIllegalOffsets = false; // only if the length changed before this filter
   /** Creates a new ThaiWordFilter with the specified match version. */
-  public ThaiWordFilter(TokenStream input) {
+  public ThaiWordFilter(Version matchVersion, TokenStream input) {
     super(input);
     if (!DBBI_AVAILABLE)
       throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
@@ -41,6 +41,7 @@ public class ThaiWordFilterFactory extends TokenFilterFactory {
   /** Creates a new ThaiWordFilterFactory */
   public ThaiWordFilterFactory(Map<String,String> args) {
     super(args);
+    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -48,7 +49,7 @@ public class ThaiWordFilterFactory extends TokenFilterFactory {
   @Override
   public ThaiWordFilter create(TokenStream input) {
-    return new ThaiWordFilter(input);
+    return new ThaiWordFilter(luceneMatchVersion, input);
   }
 }
@ -38,7 +38,6 @@ import org.tartarus.snowball.ext.TurkishStemmer;
*/ */
public final class TurkishAnalyzer extends StopwordAnalyzerBase { public final class TurkishAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet; private final CharArraySet stemExclusionSet;
private final Version matchVersion;
/** File containing default Turkish stopwords. */ /** File containing default Turkish stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
@ -102,9 +101,9 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
* @param stemExclusionSet a set of terms not to be stemmed * @param stemExclusionSet a set of terms not to be stemmed
*/ */
public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords); super(matchVersion, stopwords);
this.matchVersion = matchVersion; this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); matchVersion, stemExclusionSet));
} }
/** /**
@ -121,12 +120,12 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(); final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(source); TokenStream result = new StandardFilter(matchVersion, source);
if(matchVersion.onOrAfter(Version.LUCENE_4_8)) if(matchVersion.onOrAfter(Version.LUCENE_4_8))
result = new ApostropheFilter(result); result = new ApostropheFilter(result);
result = new TurkishLowerCaseFilter(result); result = new TurkishLowerCaseFilter(result);
result = new StopFilter(result, stopwords); result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty()) if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new TurkishStemmer()); result = new SnowballFilter(result, new TurkishStemmer());
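
The restored matchVersion gate means apostrophe handling only applies from 4.8 on. A sketch of the switch (assuming the one-arg (Version) constructor restored elsewhere in this commit; the deprecated LUCENE_4_7 constant stands in for any pre-4.8 version):

import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.util.Version;

public class TurkishVersionSketch {
  public static void main(String[] args) {
    // 4.8+ chains include ApostropheFilter, so suffixes like 'de in
    // Türkiye'de are stripped before stemming
    TurkishAnalyzer current = new TurkishAnalyzer(Version.LUCENE_CURRENT);
    // pre-4.8 chains skip it, preserving index-compatible behavior
    TurkishAnalyzer legacy = new TurkishAnalyzer(Version.LUCENE_4_7);
  }
}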

View File

@ -238,10 +238,12 @@ public abstract class AbstractAnalysisFactory {
if (files.size() > 0) { if (files.size() > 0) {
// default stopwords list has 35 or so words, but maybe don't make it that // default stopwords list has 35 or so words, but maybe don't make it that
// big to start // big to start
words = new CharArraySet(files.size() * 10, ignoreCase); words = new CharArraySet(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) { for (String file : files) {
List<String> wlist = getLines(loader, file.trim()); List<String> wlist = getLines(loader, file.trim());
words.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
ignoreCase));
} }
} }
return words; return words;
@ -264,7 +266,8 @@ public abstract class AbstractAnalysisFactory {
if (files.size() > 0) { if (files.size() > 0) {
// default stopwords list has 35 or so words, but maybe don't make it that // default stopwords list has 35 or so words, but maybe don't make it that
// big to start // big to start
words = new CharArraySet(files.size() * 10, ignoreCase); words = new CharArraySet(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) { for (String file : files) {
InputStream stream = null; InputStream stream = null;
Reader reader = null; Reader reader = null;
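
The two hunks above thread luceneMatchVersion into stopword-set construction. A small sketch of the same call outside the factory (names are illustrative):

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class StopSetSketch {
  public static void main(String[] args) {
    List<String> words = Arrays.asList("foo", "Bar");
    // the version now travels into the backing CharArraySet
    CharArraySet stops = StopFilter.makeStopSet(Version.LUCENE_CURRENT, words, true);
    System.out.println(stops.contains("BAR")); // true: ignoreCase folds per the version's rules
  }
}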

View File

@ -25,6 +25,8 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/** /**
* A simple class that stores key Strings as char[]'s in a * A simple class that stores key Strings as char[]'s in a
@ -34,6 +36,19 @@ import org.apache.lucene.analysis.util.CharacterUtils;
* etc. It is designed to be quick to retrieve items * etc. It is designed to be quick to retrieve items
* by char[] keys without the necessity of converting * by char[] keys without the necessity of converting
* to a String first. * to a String first.
*
* <a name="version"></a>
* <p>You must specify the required {@link Version}
* compatibility when creating {@link CharArrayMap}:
* <ul>
* <li> As of 3.1, supplementary characters are
* properly lowercased.</li>
* </ul>
* Before 3.1 supplementary characters could not be
* lowercased correctly due to the lack of Unicode 4
* support in JDK 1.4. To use instances of
* {@link CharArrayMap} with the behavior before Lucene
* 3.1 pass a {@link Version} &lt; 3.1 to the constructors.
*/ */
public class CharArrayMap<V> extends AbstractMap<Object,V> { public class CharArrayMap<V> extends AbstractMap<Object,V> {
// private only because missing generics // private only because missing generics
@ -43,12 +58,16 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
private final CharacterUtils charUtils; private final CharacterUtils charUtils;
private boolean ignoreCase; private boolean ignoreCase;
private int count; private int count;
final Version matchVersion; // package private because used in CharArraySet
char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
/** /**
* Create map with enough capacity to hold startSize terms * Create map with enough capacity to hold startSize terms
* *
* @param matchVersion
* compatibility match version see <a href="#version">Version
* note</a> above for details.
* @param startSize * @param startSize
* the initial capacity * the initial capacity
* @param ignoreCase * @param ignoreCase
@ -56,27 +75,31 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
* otherwise <code>true</code>. * otherwise <code>true</code>.
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public CharArrayMap(int startSize, boolean ignoreCase) { public CharArrayMap(Version matchVersion, int startSize, boolean ignoreCase) {
this.ignoreCase = ignoreCase; this.ignoreCase = ignoreCase;
int size = INIT_SIZE; int size = INIT_SIZE;
while(startSize + (startSize>>2) > size) while(startSize + (startSize>>2) > size)
size <<= 1; size <<= 1;
keys = new char[size][]; keys = new char[size][];
values = (V[]) new Object[size]; values = (V[]) new Object[size];
this.charUtils = CharacterUtils.getInstance(); this.charUtils = CharacterUtils.getInstance(matchVersion);
this.matchVersion = matchVersion;
} }
/** /**
* Creates a map from the mappings in another map. * Creates a map from the mappings in another map.
* *
* @param matchVersion
* compatibility match version see <a href="#version">Version
* note</a> above for details.
* @param c * @param c
* a map whose mappings to be copied * a map whose mappings to be copied
* @param ignoreCase * @param ignoreCase
* <code>false</code> if and only if the set should be case sensitive * <code>false</code> if and only if the set should be case sensitive
* otherwise <code>true</code>. * otherwise <code>true</code>.
*/ */
public CharArrayMap(Map<?,? extends V> c, boolean ignoreCase) { public CharArrayMap(Version matchVersion, Map<?,? extends V> c, boolean ignoreCase) {
this(c.size(), ignoreCase); this(matchVersion, c.size(), ignoreCase);
putAll(c); putAll(c);
} }
@ -87,6 +110,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
this.ignoreCase = toCopy.ignoreCase; this.ignoreCase = toCopy.ignoreCase;
this.count = toCopy.count; this.count = toCopy.count;
this.charUtils = toCopy.charUtils; this.charUtils = toCopy.charUtils;
this.matchVersion = toCopy.matchVersion;
} }
/** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */ /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
@ -541,7 +565,18 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
/** /**
* Returns a copy of the given map as a {@link CharArrayMap}. If the given map * Returns a copy of the given map as a {@link CharArrayMap}. If the given map
* is a {@link CharArrayMap} the ignoreCase property will be preserved. * is a {@link CharArrayMap} the ignoreCase property will be preserved.
* <p>
* <b>Note:</b> If you intend to create a copy of another {@link CharArrayMap} where
* the {@link Version} of the source map differs from its copy
* {@link #CharArrayMap(Version, Map, boolean)} should be used instead.
* The {@link #copy(Version, Map)} method will preserve the {@link Version} of the
* source map if it is an instance of {@link CharArrayMap}.
* </p>
* *
* @param matchVersion
* compatibility match version see <a href="#version">Version
* note</a> above for details. This argument will be ignored if the
* given map is a {@link CharArrayMap}.
* @param map * @param map
* a map to copy * a map to copy
* @return a copy of the given map as a {@link CharArrayMap}. If the given map * @return a copy of the given map as a {@link CharArrayMap}. If the given map
@ -549,7 +584,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
* matchVersion of the given map will be preserved. * matchVersion of the given map will be preserved.
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static <V> CharArrayMap<V> copy(final Map<?,? extends V> map) { public static <V> CharArrayMap<V> copy(final Version matchVersion, final Map<?,? extends V> map) {
if(map == EMPTY_MAP) if(map == EMPTY_MAP)
return emptyMap(); return emptyMap();
if(map instanceof CharArrayMap) { if(map instanceof CharArrayMap) {
@ -565,7 +600,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
m.values = values; m.values = values;
return m; return m;
} }
return new CharArrayMap<>(map, false); return new CharArrayMap<>(matchVersion, map, false);
} }
/** Returns an empty, unmodifiable map. */ /** Returns an empty, unmodifiable map. */
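
A short sketch of the copy semantics documented above (names hypothetical): the version argument applies to plain maps and is ignored when the source is already a CharArrayMap:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;

public class CharArrayMapCopySketch {
  public static void main(String[] args) {
    Map<String,Integer> plain = new HashMap<>();
    plain.put("foo", 1);
    // plain Map: the supplied matchVersion is used for the copy
    CharArrayMap<Integer> a = CharArrayMap.copy(Version.LUCENE_CURRENT, plain);
    // CharArrayMap source: its own matchVersion wins, the argument is ignored
    CharArrayMap<Integer> b = CharArrayMap.copy(Version.LUCENE_4_8, a);
  }
}
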
@ -624,7 +659,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
*/ */
private static final class EmptyCharArrayMap<V> extends UnmodifiableCharArrayMap<V> { private static final class EmptyCharArrayMap<V> extends UnmodifiableCharArrayMap<V> {
EmptyCharArrayMap() { EmptyCharArrayMap() {
super(new CharArrayMap<V>(0, false)); super(new CharArrayMap<V>(Version.LUCENE_CURRENT, 0, false));
} }
@Override @Override

View File

@ -22,6 +22,9 @@ import java.util.Collection;
import java.util.Iterator; import java.util.Iterator;
import java.util.Set; import java.util.Set;
import org.apache.lucene.util.Version;
/** /**
* A simple class that stores Strings as char[]'s in a * A simple class that stores Strings as char[]'s in a
* hash table. Note that this is not a general purpose * hash table. Note that this is not a general purpose
@ -31,6 +34,18 @@ import java.util.Set;
* is in the set without the necessity of converting it * is in the set without the necessity of converting it
* to a String first. * to a String first.
* *
* <a name="version"></a>
* <p>You must specify the required {@link Version}
* compatibility when creating {@link CharArraySet}:
* <ul>
* <li> As of 3.1, supplementary characters are
* properly lowercased.</li>
* </ul>
* Before 3.1 supplementary characters could not be
* lowercased correctly due to the lack of Unicode 4
* support in JDK 1.4. To use instances of
* {@link CharArraySet} with the behavior before Lucene
* 3.1 pass a {@link Version} &lt; 3.1 to the constructors.
* <P> * <P>
* <em>Please note:</em> This class implements {@link java.util.Set Set} but * <em>Please note:</em> This class implements {@link java.util.Set Set} but
* does not behave like it should in all cases. The generic type is * does not behave like it should in all cases. The generic type is
@ -49,27 +64,33 @@ public class CharArraySet extends AbstractSet<Object> {
/** /**
* Create set with enough capacity to hold startSize terms * Create set with enough capacity to hold startSize terms
* *
* @param matchVersion
* compatibility match version see <a href="#version">Version
* note</a> above for details.
* @param startSize * @param startSize
* the initial capacity * the initial capacity
* @param ignoreCase * @param ignoreCase
* <code>false</code> if and only if the set should be case sensitive * <code>false</code> if and only if the set should be case sensitive
* otherwise <code>true</code>. * otherwise <code>true</code>.
*/ */
public CharArraySet(int startSize, boolean ignoreCase) { public CharArraySet(Version matchVersion, int startSize, boolean ignoreCase) {
this(new CharArrayMap<>(startSize, ignoreCase)); this(new CharArrayMap<>(matchVersion, startSize, ignoreCase));
} }
/** /**
* Creates a set from a Collection of objects. * Creates a set from a Collection of objects.
* *
* @param matchVersion
* compatibility match version see <a href="#version">Version
* note</a> above for details.
* @param c * @param c
* a collection whose elements to be placed into the set * a collection whose elements to be placed into the set
* @param ignoreCase * @param ignoreCase
* <code>false</code> if and only if the set should be case sensitive * <code>false</code> if and only if the set should be case sensitive
* otherwise <code>true</code>. * otherwise <code>true</code>.
*/ */
public CharArraySet(Collection<?> c, boolean ignoreCase) { public CharArraySet(Version matchVersion, Collection<?> c, boolean ignoreCase) {
this(c.size(), ignoreCase); this(matchVersion, c.size(), ignoreCase);
addAll(c); addAll(c);
} }
@ -151,21 +172,32 @@ public class CharArraySet extends AbstractSet<Object> {
/** /**
* Returns a copy of the given set as a {@link CharArraySet}. If the given set * Returns a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be preserved. * is a {@link CharArraySet} the ignoreCase property will be preserved.
* <p>
* <b>Note:</b> If you intend to create a copy of another {@link CharArraySet} where
* the {@link Version} of the source set differs from its copy
* {@link #CharArraySet(Version, Collection, boolean)} should be used instead.
* The {@link #copy(Version, Set)} method will preserve the {@link Version} of the
* source set if it is an instance of {@link CharArraySet}.
* </p>
* *
* @param matchVersion
* compatibility match version see <a href="#version">Version
* note</a> above for details. This argument will be ignored if the
* given set is a {@link CharArraySet}.
* @param set * @param set
* a set to copy * a set to copy
* @return a copy of the given set as a {@link CharArraySet}. If the given set * @return a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property as well as the * is a {@link CharArraySet} the ignoreCase property as well as the
* matchVersion of the given set will be preserved. * matchVersion of the given set will be preserved.
*/ */
public static CharArraySet copy(final Set<?> set) { public static CharArraySet copy(final Version matchVersion, final Set<?> set) {
if(set == EMPTY_SET) if(set == EMPTY_SET)
return EMPTY_SET; return EMPTY_SET;
if(set instanceof CharArraySet) { if(set instanceof CharArraySet) {
final CharArraySet source = (CharArraySet) set; final CharArraySet source = (CharArraySet) set;
return new CharArraySet(CharArrayMap.copy(source.map)); return new CharArraySet(CharArrayMap.copy(source.map.matchVersion, source.map));
} }
return new CharArraySet(set, false); return new CharArraySet(matchVersion, set, false);
} }
/** /**
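
The set mirrors the map. A brief sketch combining the re-versioned constructor with the copy/unmodifiable pattern the analyzers in this commit use (names illustrative):

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CharArraySetSketch {
  public static void main(String[] args) {
    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 8, true);
    set.add("Stop");
    System.out.println(set.contains("stop")); // true: ignoreCase folding
    // freeze a version-preserving copy, as the analyzer constructors do
    CharArraySet frozen = CharArraySet.unmodifiableSet(CharArraySet.copy(Version.LUCENE_CURRENT, set));
  }
}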

View File

@ -18,12 +18,15 @@ package org.apache.lucene.analysis.util;
*/ */
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer; import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
/** /**
@ -33,18 +36,25 @@ public abstract class CharTokenizer extends Tokenizer {
/** /**
* Creates a new {@link CharTokenizer} instance * Creates a new {@link CharTokenizer} instance
*
* @param matchVersion
* Lucene version to match
*/ */
public CharTokenizer() { public CharTokenizer(Version matchVersion) {
charUtils = CharacterUtils.getInstance(matchVersion);
} }
/** /**
* Creates a new {@link CharTokenizer} instance * Creates a new {@link CharTokenizer} instance
* *
* @param matchVersion
* Lucene version to match
* @param factory * @param factory
* the attribute factory to use for this {@link Tokenizer} * the attribute factory to use for this {@link Tokenizer}
*/ */
public CharTokenizer(AttributeFactory factory) { public CharTokenizer(Version matchVersion, AttributeFactory factory) {
super(factory); super(factory);
charUtils = CharacterUtils.getInstance(matchVersion);
} }
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0; private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
@ -54,7 +64,7 @@ public abstract class CharTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final CharacterUtils charUtils = CharacterUtils.getInstance(); private final CharacterUtils charUtils;
private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
/** /**
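
A sketch of subclassing the re-versioned CharTokenizer (the subclass name is hypothetical; isTokenChar(int) is the single abstract hook):

import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

// passes matchVersion up so charUtils matches the requested semantics
public final class LetterOnlyTokenizer extends CharTokenizer {
  public LetterOnlyTokenizer(Version matchVersion) {
    super(matchVersion);
  }

  @Override
  protected boolean isTokenChar(int c) {
    return Character.isLetter(c); // code-point aware, including supplementary characters
  }
}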

View File

@ -34,25 +34,29 @@ public abstract class CharacterUtils {
private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils(); private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
/** /**
* Returns a {@link CharacterUtils} implementation. * Returns a {@link CharacterUtils} implementation according to the given
* {@link Version} instance.
*
* @param matchVersion
* a version instance
* @return a {@link CharacterUtils} implementation according to the given * @return a {@link CharacterUtils} implementation according to the given
* {@link Version} instance. * {@link Version} instance.
*/ */
public static CharacterUtils getInstance() { public static CharacterUtils getInstance(final Version matchVersion) {
return JAVA_5; return JAVA_5;
} }
/** /** explicitly returns a version matching java 4 semantics */
* explicitly returns a version matching java 4 semantics
* @deprecated Only for n-gram backwards compat
*/
@Deprecated
public static CharacterUtils getJava4Instance() { public static CharacterUtils getJava4Instance() {
return JAVA_4; return JAVA_4;
} }
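
Both lookups in one place (sketch, names hypothetical): getInstance(Version) takes the version again even though every version currently maps to the Java 5 implementation, while getJava4Instance() keeps the old char-at-a-time semantics:

import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class CharacterUtilsSketch {
  public static void main(String[] args) {
    CharacterUtils modern = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
    CharacterUtils legacy = CharacterUtils.getJava4Instance();
    String supplementary = "\uD835\uDC9C"; // U+1D49C, one code point as a surrogate pair
    System.out.println(modern.codePointAt(supplementary, 0)); // the full code point
    System.out.println(legacy.codePointAt(supplementary, 0)); // just the high surrogate
  }
}
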
/** /**
* Returns the code point at the given index of the {@link CharSequence}. * Returns the code point at the given index of the {@link CharSequence}.
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
* of {@link Character#codePointAt(char[], int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
* *
* @param seq * @param seq
* a character sequence * a character sequence
@ -71,6 +75,10 @@ public abstract class CharacterUtils {
/** /**
* Returns the code point at the given index of the char array where only elements * Returns the code point at the given index of the char array where only elements
* with index less than the limit are used. * with index less than the limit are used.
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
* of {@link Character#codePointAt(char[], int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
* *
* @param chars * @param chars
* a character array * a character array
@ -180,7 +188,10 @@ public abstract class CharacterUtils {
* the middle of a surrogate pair, even if there are remaining characters in * the middle of a surrogate pair, even if there are remaining characters in
* the {@link Reader}. * the {@link Reader}.
* <p> * <p>
* This method guarantees * Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method implements
* supplementary character awareness when filling the given buffer. For all
* {@link Version} &gt; 3.0 {@link #fill(CharacterBuffer, Reader, int)} guarantees
* that the given {@link CharacterBuffer} will never contain a high surrogate * that the given {@link CharacterBuffer} will never contain a high surrogate
* character as the last element in the buffer unless it is the last available * character as the last element in the buffer unless it is the last available
* character in the reader. In other words, high and low surrogate pairs will * character in the reader. In other words, high and low surrogate pairs will

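Finally, a consumption sketch for the fill() guarantee described above, mirroring how CharTokenizer drains its buffer (names illustrative):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.Version;

public class FillSketch {
  public static void main(String[] args) throws Exception {
    CharacterUtils utils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
    CharacterBuffer buf = CharacterUtils.newCharacterBuffer(1024);
    Reader reader = new StringReader("some input text");
    while (true) {
      // never leaves a dangling high surrogate at the end of the buffer
      utils.fill(buf, reader);
      if (buf.getLength() == 0) break; // reader exhausted
      System.out.println(new String(buf.getBuffer(), buf.getOffset(), buf.getLength()));
    }
  }
}
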
Some files were not shown because too many files have changed in this diff.