mirror of https://github.com/apache/lucene.git
LUCENE-5859: Remove Version from Analyzer constructors
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1616901 13f79535-47bb-0310-9956-ffa450edef68
parent 1ca2ef7eb3
commit 9938a39a87
@@ -80,6 +80,11 @@ API Changes
   takes the same selectors. Add helper methods to DocValues.java that are better
   suited for search code (never return null, etc). (Mike McCandless, Robert Muir)
 
+* LUCENE-5859: Remove Version from Analyzer constructors. Use Analyzer.setVersion()
+  to set the version an analyzer should use to replicate behavior from a specific
+  release.
+  (Ryan Ernst, Robert Muir)
+
 Documentation
 
 * LUCENE-5392: Add/improve analysis package documentation to reflect
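In practical terms, the CHANGES entry above amounts to the following migration for callers (a minimal sketch assuming the post-commit API shown in this diff; the version constant is only an example, and the caller class is hypothetical):

import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.util.Version;

class SetVersionMigration {
  ArabicAnalyzer migrated() {
    // Before LUCENE-5859 the version was a required constructor argument:
    //   ArabicAnalyzer analyzer = new ArabicAnalyzer(Version.LUCENE_4_4);

    // After LUCENE-5859: construct without any Version...
    ArabicAnalyzer analyzer = new ArabicAnalyzer();
    // ...and call setVersion() only when the behavior of a specific
    // release must be replicated, e.g. against an old index.
    analyzer.setVersion(Version.LUCENE_4_4);
    return analyzer;
  }
}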
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Arabic.
@@ -89,20 +88,18 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public ArabicAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public ArabicAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public ArabicAnalyzer(CharArraySet stopwords){
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -110,17 +107,14 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * {@link ArabicStemFilter}.
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclusionSet
    *          a set of terms not to be stemmed
    */
-  public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public ArabicAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet){
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -136,10 +130,10 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
     // the order here is important: the stopword list is not normalized!
-    result = new StopFilter( matchVersion, result, stopwords);
+    result = new StopFilter(result, stopwords);
     // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
     result = new ArabicNormalizationFilter(result);
     if(!stemExclusionSet.isEmpty()) {
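Under the new constructors above, a version-free ArabicAnalyzer is used like any other analyzer. A minimal, hypothetical consumer might look like this (the token-iteration idiom is standard Lucene API of this era, not part of this commit):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

class ArabicAnalyzerDemo {
  static void printTokens(String text) throws IOException {
    ArabicAnalyzer analyzer = new ArabicAnalyzer(); // no Version argument
    try (TokenStream ts = analyzer.tokenStream("body", text)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}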
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.bg;
 
 import java.io.IOException;
 import java.io.Reader;
-import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
@@ -31,7 +30,6 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Bulgarian.
@@ -42,6 +40,7 @@ import org.apache.lucene.util.Version;
  * <p>
  */
 public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
+
   /**
    * File containing default Bulgarian stopwords.
    *
@@ -84,15 +83,15 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public BulgarianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public BulgarianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    */
-  public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public BulgarianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -100,10 +99,10 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
    * If a stem exclusion set is provided this analyzer will add a {@link SetKeywordMarkerFilter}
    * before {@link BulgarianStemFilter}.
    */
-  public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet)); }
+  public BulgarianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  }
 
   /**
    * Creates a
@@ -119,10 +118,10 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   public TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new BulgarianStemFilter(result);
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Brazilian Portuguese language.
@@ -44,7 +43,7 @@ import org.apache.lucene.util.Version;
  * not be stemmed, but indexed).
  * </p>
  *
- * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * <p><b>NOTE</b>: This class uses the same {@link org.apache.lucene.util.Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
 public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
@@ -65,7 +64,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
   static {
     try {
       DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
-          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
+          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#");
     } catch (IOException ex) {
       // default set should always be present as it is part of the
       // distribution (JAR)
@@ -83,35 +82,29 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
    */
-  public BrazilianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public BrazilianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    super(matchVersion, stopwords);
+  public BrazilianAnalyzer(CharArraySet stopwords) {
+    super(stopwords);
   }
 
   /**
    * Builds an analyzer with the given stop words and stemming exclusion words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords,
-      CharArraySet stemExclusionSet) {
-    this(matchVersion, stopwords);
-    excltable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(matchVersion, stemExclusionSet));
+  public BrazilianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    this(stopwords);
+    excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -126,10 +119,10 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
   */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new LowerCaseFilter(matchVersion, source);
-    result = new StandardFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
+    result = new StandardFilter(result);
+    result = new StopFilter(result, stopwords);
     if(excltable != null && !excltable.isEmpty())
       result = new SetKeywordMarkerFilter(result, excltable);
     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
@@ -33,7 +33,6 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.CatalanStemmer;
 
 /**
@@ -46,7 +45,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
   private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
-      new CharArraySet(Version.LUCENE_CURRENT,
+      new CharArraySet(
           Arrays.asList(
           "d", "l", "m", "n", "s", "t"
           ), true));
@@ -81,18 +80,17 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public CatalanAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public CatalanAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public CatalanAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -100,14 +98,12 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public CatalanAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -124,11 +120,11 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
   */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new CatalanStemmer());
@@ -26,7 +26,6 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.lucene.util.Version;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -29841,7 +29840,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     upperCaseVariantsAccepted.put("amp", "AMP");
   }
   private static final CharArrayMap<Character> entityValues
-      = new CharArrayMap<>(Version.LUCENE_CURRENT, 253, false);
+      = new CharArrayMap<>(253, false);
   static {
     String[] entities = {
       "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
@@ -29980,7 +29979,7 @@
       escapeSTYLE = true;
     } else {
       if (null == this.escapedTags) {
-        this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
+        this.escapedTags = new CharArraySet(16, true);
       }
       this.escapedTags.add(tag);
     }
@@ -24,7 +24,6 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.lucene.util.Version;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.OpenStringBuilder;
@@ -195,7 +194,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
       escapeSTYLE = true;
     } else {
       if (null == this.escapedTags) {
-        this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
+        this.escapedTags = new CharArraySet(16, true);
       }
       this.escapedTags.add(tag);
     }
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.cjk;
  */
 
 import java.io.IOException;
-import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
@@ -28,7 +27,6 @@ import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 
 /**
  * An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
@@ -37,6 +35,7 @@ import org.apache.lucene.util.Version;
  * and filters stopwords with {@link StopFilter}
  */
 public final class CJKAnalyzer extends StopwordAnalyzerBase {
+
   /**
    * File containing default CJK stopwords.
    * <p/>
@@ -70,29 +69,27 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer which removes words in {@link #getDefaultStopSet()}.
    */
-  public CJKAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public CJKAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
   * @param stopwords
    *          a stopword set
    */
-  public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){
-    super(matchVersion, stopwords);
+  public CJKAnalyzer(CharArraySet stopwords){
+    super(stopwords);
   }
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
+    final Tokenizer source = new StandardTokenizer();
     // run the widthfilter first before bigramming, it sometimes combines characters.
     TokenStream result = new CJKWidthFilter(source);
-    result = new LowerCaseFilter(matchVersion, result);
+    result = new LowerCaseFilter(result);
     result = new CJKBigramFilter(result);
-    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
   }
 }
@@ -33,7 +33,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Sorani Kurdish.
@@ -62,7 +61,7 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
   static {
     try {
       DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(SoraniAnalyzer.class,
-          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
     } catch (IOException ex) {
       // default set should always be present as it is part of the
       // distribution (JAR)
@@ -74,18 +73,17 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public SoraniAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public SoraniAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
    */
-  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public SoraniAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -93,14 +91,12 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public SoraniAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -118,11 +114,11 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
   */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
     result = new SoraniNormalizationFilter(result);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SoraniStemFilter(result);
@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
 
 /*
  * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
@@ -78,7 +77,7 @@ public final class CommonGramsFilter extends TokenFilter {
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   */
-  public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
+  public CommonGramsFilter(TokenStream input, CharArraySet commonWords) {
     super(input);
     this.commonWords = commonWords;
   }
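CommonGramsFilter now takes just a TokenStream and the common-word set. Wired into a custom Analyzer, it might look like the following sketch (hypothetical caller code; the Collection-based CharArraySet constructor without a Version appears elsewhere in this same commit, e.g. in CatalanAnalyzer above):

import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

class CommonGramsAnalyzer extends Analyzer {
  // Common words to glue into bigrams; ignoreCase = true.
  private final CharArraySet commonWords =
      new CharArraySet(Arrays.asList("the", "of", "and"), true);

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();           // no Version, per this commit
    TokenStream result = new CommonGramsFilter(source, commonWords);
    return new TokenStreamComponents(source, result);
  }
}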
@@ -76,7 +76,7 @@ public class CommonGramsFilterFactory extends TokenFilterFactory implements Reso
 
   @Override
   public TokenFilter create(TokenStream input) {
-    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
+    CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords);
     return commonGrams;
   }
 }
@@ -17,31 +17,18 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */
 
-import java.io.IOException;
-import java.util.LinkedList;
-
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.Version;
 
+import java.io.IOException;
+import java.util.LinkedList;
+
 /**
  * Base class for decomposition token filters.
- * <p>
- *
- * <a name="version"></a>
- * You must specify the required {@link Version} compatibility when creating
- * CompoundWordTokenFilterBase:
- * <ul>
- * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
- * supplementary characters in strings and char arrays provided as compound word
- * dictionaries.
- * <li>As of 4.4, {@link CompoundWordTokenFilterBase} doesn't update offsets.
- * </ul>
  */
 public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   /**
@@ -59,7 +46,6 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   */
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
 
-  protected final Version matchVersion;
   protected final CharArraySet dictionary;
   protected final LinkedList<CompoundToken> tokens;
   protected final int minWordSize;
@@ -71,19 +57,18 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
 
-  private AttributeSource.State current;
+  private State current;
 
-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
-    this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
+    this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   }
 
-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary) {
-    this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+  protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary) {
+    this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   }
 
-  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+  protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(input);
-    this.matchVersion = matchVersion;
     this.tokens=new LinkedList<>();
     if (minWordSize < 0) {
       throw new IllegalArgumentException("minWordSize cannot be negative");
@@ -154,20 +139,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
 
       // offsets of the original word
-      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
-      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
-
-      if (matchVersion.onOrAfter(Version.LUCENE_4_4) ||
-          endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
-        // if length by start + end offsets doesn't match the term text then assume
-        // this is a synonym and don't adjust the offsets.
-        this.startOffset = startOff;
-        this.endOffset = endOff;
-      } else {
-        final int newStart = startOff + offset;
-        this.startOffset = newStart;
-        this.endOffset = newStart + length;
-      }
+      this.startOffset = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      this.endOffset = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
     }
 
   }
@@ -18,45 +18,29 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */
 
-
-import java.util.Set;
-
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
 
 /**
- * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
+ * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many Germanic languages.
  * <p>
  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  * "Donaudampfschiff" even when you only enter "schiff".
  * It uses a brute-force algorithm to achieve this.
  * <p>
- * You must specify the required {@link Version} compatibility when creating
- * CompoundWordTokenFilterBase:
- * <ul>
- * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
- * supplementary characters in strings and char arrays provided as compound word
- * dictionaries.
- * </ul>
  */
 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
 
   /**
    * Creates a new {@link DictionaryCompoundWordTokenFilter}
    *
-   * @param matchVersion
-   *          Lucene version to enable correct Unicode 4.0 behavior in the
-   *          dictionaries if Version > 3.0. See <a
-   *          href="CompoundWordTokenFilterBase.html#version"
-   *          >CompoundWordTokenFilterBase</a> for details.
   * @param input
-   *          the {@link TokenStream} to process
+   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param dictionary
    *          the word dictionary to match against.
    */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) {
-    super(matchVersion, input, dictionary);
+  public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet dictionary) {
+    super(input, dictionary);
     if (dictionary == null) {
       throw new IllegalArgumentException("dictionary cannot be null");
     }
@@ -65,13 +49,8 @@ public class DictionaryCompoundWordTokenFilterBa
   /**
    * Creates a new {@link DictionaryCompoundWordTokenFilter}
    *
-   * @param matchVersion
-   *          Lucene version to enable correct Unicode 4.0 behavior in the
-   *          dictionaries if Version > 3.0. See <a
-   *          href="CompoundWordTokenFilterBase.html#version"
-   *          >CompoundWordTokenFilterBase</a> for details.
   * @param input
-   *          the {@link TokenStream} to process
+   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param dictionary
    *          the word dictionary to match against.
    * @param minWordSize
@@ -83,9 +62,9 @@ public class DictionaryCompoundWordTokenFilterBa
   * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
    */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary,
+  public DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
     if (dictionary == null) {
       throw new IllegalArgumentException("dictionary cannot be null");
     }
@@ -22,12 +22,13 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.Version;
 
 import java.util.Map;
 import java.io.IOException;
 
 /**
- * Factory for {@link DictionaryCompoundWordTokenFilter}.
+ * Factory for {@link Lucene43DictionaryCompoundWordTokenFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_dictcomp" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
@@ -50,9 +51,9 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
     super(args);
     assureMatchVersion();
     dictFile = require(args, "dictionary");
-    minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
-    minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
-    maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+    minWordSize = getInt(args, "minWordSize", Lucene43CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
+    minSubwordSize = getInt(args, "minSubwordSize", Lucene43CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+    maxSubwordSize = getInt(args, "maxSubwordSize", Lucene43CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
     onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -67,8 +68,13 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
   @Override
   public TokenStream create(TokenStream input) {
     // if the dictionary is null, it means it was empty
-    return dictionary == null ? input : new DictionaryCompoundWordTokenFilter
-      (luceneMatchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    if (dictionary == null) {
+      return input;
+    }
+    if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4)) {
+      return new DictionaryCompoundWordTokenFilter(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    }
+    return new Lucene43DictionaryCompoundWordTokenFilter(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }
 }
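The factory change above shows the dispatch pattern this commit uses throughout: luceneMatchVersion picks between the current filter and its frozen Lucene43 twin. Constructing the current filter directly is a one-liner under the new signature (a sketch with a hypothetical caller; the defaults minWordSize=5, minSubwordSize=2, maxSubwordSize=15 are defined in this diff):

import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.util.CharArraySet;

class DecompoundSketch {
  TokenStream decompound(Tokenizer source) {
    // With this dictionary, "Donaudampfschiff" also yields donau, dampf, schiff.
    CharArraySet dictionary = new CharArraySet(
        Arrays.asList("donau", "dampf", "schiff"), true);
    // New Version-free constructor from this diff; default subword sizes apply.
    return new DictionaryCompoundWordTokenFilter(source, dictionary);
  }
}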
@@ -17,31 +17,21 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */
 
-import java.io.File;
-import java.io.IOException;
-
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
 import org.xml.sax.InputSource;
 
+import java.io.File;
+import java.io.IOException;
+
 /**
- * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
- * <p>
+ * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words found in many Germanic languages.
+ *
 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
 * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
 * grammar and a word dictionary to achieve this.
- * <p>
- * You must specify the required {@link Version} compatibility when creating
- * CompoundWordTokenFilterBase:
- * <ul>
- * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
- * supplementary characters in strings and char arrays provided as compound word
- * dictionaries.
- * </ul>
 */
 public class HyphenationCompoundWordTokenFilter extends
     CompoundWordTokenFilterBase {
@@ -50,34 +40,24 @@ public class HyphenationCompoundWordTokenFilter extends
   /**
    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
    *
-   * @param matchVersion
-   *          Lucene version to enable correct Unicode 4.0 behavior in the
-   *          dictionaries if Version > 3.0. See <a
-   *          href="CompoundWordTokenFilterBase.html#version"
-   *          >CompoundWordTokenFilterBase</a> for details.
   * @param input
-   *          the {@link TokenStream} to process
+   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param hyphenator
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
    *          the word dictionary to match against.
    */
-  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+  public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, CharArraySet dictionary) {
-    this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
+    this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }
 
   /**
    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
    *
-   * @param matchVersion
-   *          Lucene version to enable correct Unicode 4.0 behavior in the
-   *          dictionaries if Version > 3.0. See <a
-   *          href="CompoundWordTokenFilterBase.html#version"
-   *          >CompoundWordTokenFilterBase</a> for details.
   * @param input
-   *          the {@link TokenStream} to process
+   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param hyphenator
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
@@ -91,10 +71,10 @@ public class HyphenationCompoundWordTokenFilter extends
   * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
    */
-  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+  public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
+    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
         onlyLongestMatch);
 
     this.hyphenator = hyphenator;
@@ -103,27 +83,27 @@ public class HyphenationCompoundWordTokenFilter extends
   /**
    * Create a HyphenationCompoundWordTokenFilter with no dictionary.
    * <p>
-   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
+   * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.util.CharArraySet, int, int, int, boolean)
   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
    * null, minWordSize, minSubwordSize, maxSubwordSize }
    */
-  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+  public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
       int maxSubwordSize) {
-    this(matchVersion, input, hyphenator, null, minWordSize, minSubwordSize,
+    this(input, hyphenator, null, minWordSize, minSubwordSize,
         maxSubwordSize, false);
   }
 
   /**
    * Create a HyphenationCompoundWordTokenFilter with no dictionary.
    * <p>
-   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
+   * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, int, int, int)
   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
    * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE }
    */
-  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+  public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator) {
-    this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
+    this(input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
         DEFAULT_MAX_SUBWORD_SIZE);
   }
 
@@ -132,7 +112,7 @@ public class HyphenationCompoundWordTokenFilter extends
   *
   * @param hyphenationFilename the filename of the XML grammar to load
   * @return An object representing the hyphenation patterns
-   * @throws IOException If there is a low-level I/O error.
+   * @throws java.io.IOException If there is a low-level I/O error.
   */
   public static HyphenationTree getHyphenationTree(String hyphenationFilename)
     throws IOException {
@@ -144,7 +124,7 @@ public class HyphenationCompoundWordTokenFilter extends
   *
   * @param hyphenationFile the file of the XML grammar to load
   * @return An object representing the hyphenation patterns
-   * @throws IOException If there is a low-level I/O error.
+   * @throws java.io.IOException If there is a low-level I/O error.
   */
   public static HyphenationTree getHyphenationTree(File hyphenationFile)
     throws IOException {
@@ -156,7 +136,7 @@ public class HyphenationCompoundWordTokenFilter extends
   *
   * @param hyphenationSource the InputSource pointing to the XML grammar
   * @return An object representing the hyphenation patterns
-   * @throws IOException If there is a low-level I/O error.
+   * @throws java.io.IOException If there is a low-level I/O error.
   */
   public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
     throws IOException {
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -28,10 +29,12 @@ import org.apache.lucene.util.IOUtils;
 import java.util.Map;
 import java.io.IOException;
 import java.io.InputStream;
+
+import org.apache.lucene.util.Version;
 import org.xml.sax.InputSource;
 
 /**
- * Factory for {@link HyphenationCompoundWordTokenFilter}.
+ * Factory for {@link Lucene43HyphenationCompoundWordTokenFilter}.
 * <p>
 * This factory accepts the following parameters:
 * <ul>
@@ -55,7 +58,7 @@ import org.xml.sax.InputSource;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 *
- * @see HyphenationCompoundWordTokenFilter
+ * @see Lucene43HyphenationCompoundWordTokenFilter
 */
 public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
   private CharArraySet dictionary;
@@ -75,9 +78,9 @@ public class HyphenationCompoundWordTokenFilterFactor
     dictFile = get(args, "dictionary");
     encoding = get(args, "encoding");
     hypFile = require(args, "hyphenator");
-    minWordSize = getInt(args, "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
-    minSubwordSize = getInt(args, "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
-    maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+    minWordSize = getInt(args, "minWordSize", Lucene43CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
+    minSubwordSize = getInt(args, "minSubwordSize", Lucene43CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+    maxSubwordSize = getInt(args, "maxSubwordSize", Lucene43CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
     onlyLongestMatch = getBoolean(args, "onlyLongestMatch", false);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -96,14 +99,21 @@ public class HyphenationCompoundWordTokenFilterFactor
       final InputSource is = new InputSource(stream);
       is.setEncoding(encoding); // if it's null let xml parser decide
       is.setSystemId(hypFile);
+      if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4)) {
         hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+      } else {
+        hyphenator = Lucene43HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+      }
     } finally {
       IOUtils.closeWhileHandlingException(stream);
     }
   }
 
   @Override
-  public HyphenationCompoundWordTokenFilter create(TokenStream input) {
-    return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  public TokenFilter create(TokenStream input) {
+    if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4)) {
+      return new HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    }
+    return new Lucene43HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }
 }
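The hyphenation factory follows the same Version.LUCENE_4_4 dispatch. Used directly, the new constructor pairs with the unchanged getHyphenationTree helpers, roughly as in this sketch (hypothetical caller; "de_DR.xml" stands in for a real hyphenation grammar file and is not shipped by this diff):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.util.CharArraySet;

class HyphenationSketch {
  TokenStream build(Tokenizer source, CharArraySet dictionary) throws IOException {
    // Load the XML hyphenation grammar (helper unchanged apart from javadoc).
    HyphenationTree hyphenator =
        HyphenationCompoundWordTokenFilter.getHyphenationTree("de_DR.xml");
    // New Version-free constructor introduced by this commit:
    return new HyphenationCompoundWordTokenFilter(source, hyphenator, dictionary);
  }
}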
@@ -0,0 +1,162 @@
+package org.apache.lucene.analysis.compound;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Base class for decomposition token filters using pre-4.4 behavior.
+ * <p>
+ * @deprecated Use {@link CompoundWordTokenFilterBase}
+ */
+@Deprecated
+public abstract class Lucene43CompoundWordTokenFilterBase extends TokenFilter {
+  /**
+   * The default for minimal word length that gets decomposed
+   */
+  public static final int DEFAULT_MIN_WORD_SIZE = 5;
+
+  /**
+   * The default for minimal length of subwords that get propagated to the output of this filter
+   */
+  public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;
+
+  /**
+   * The default for maximal length of subwords that get propagated to the output of this filter
+   */
+  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
+
+  protected final CharArraySet dictionary;
+  protected final LinkedList<CompoundToken> tokens;
+  protected final int minWordSize;
+  protected final int minSubwordSize;
+  protected final int maxSubwordSize;
+  protected final boolean onlyLongestMatch;
+
+  protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+  private AttributeSource.State current;
+
+  protected Lucene43CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, boolean onlyLongestMatch) {
+    this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }
+
+  protected Lucene43CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary) {
+    this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
+  protected Lucene43CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    super(input);
+    this.tokens=new LinkedList<>();
+    if (minWordSize < 0) {
+      throw new IllegalArgumentException("minWordSize cannot be negative");
+    }
+    this.minWordSize=minWordSize;
+    if (minSubwordSize < 0) {
+      throw new IllegalArgumentException("minSubwordSize cannot be negative");
+    }
+    this.minSubwordSize=minSubwordSize;
+    if (maxSubwordSize < 0) {
+      throw new IllegalArgumentException("maxSubwordSize cannot be negative");
+    }
+    this.maxSubwordSize=maxSubwordSize;
+    this.onlyLongestMatch=onlyLongestMatch;
+    this.dictionary = dictionary;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (!tokens.isEmpty()) {
+      assert current != null;
+      CompoundToken token = tokens.removeFirst();
+      restoreState(current); // keep all other attributes untouched
+      termAtt.setEmpty().append(token.txt);
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      posIncAtt.setPositionIncrement(0);
+      return true;
+    }
+
+    current = null; // not really needed, but for safety
+    if (input.incrementToken()) {
+      // Only words longer than minWordSize get processed
+      if (termAtt.length() >= this.minWordSize) {
+        decompose();
+        // only capture the state if we really need it for producing new tokens
+        if (!tokens.isEmpty()) {
+          current = captureState();
+        }
+      }
+      // return original token:
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
+   * The original token may not be placed in the list, as it is automatically passed through this filter.
+   */
+  protected abstract void decompose();
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    tokens.clear();
+    current = null;
+  }
+
+  /**
+   * Helper class to hold decompounded token information
+   */
+  protected class CompoundToken {
+    public final CharSequence txt;
+    public final int startOffset, endOffset;
+
+    /** Construct the compound token based on a slice of the current {@link Lucene43CompoundWordTokenFilterBase#termAtt}. */
+    public CompoundToken(int offset, int length) {
+      this.txt = Lucene43CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
+
+      // offsets of the original word
+      int startOff = Lucene43CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = Lucene43CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+
+      if (endOff - startOff != Lucene43CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
+    }
+
+  }
+}
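The CompoundToken constructor at the end of this new file preserves the one behavioral difference that justifies keeping the class: pre-4.4 code slices each subword's offsets out of the original word, while the 4.4+ base class earlier in this diff always reports the whole word's offsets. Worked through by hand from the code above (illustrative arithmetic, not output from a test):

// Input token "Donaudampfschiff": startOffset=0, endOffset=16, length 16.
// Subword "dampf" produced with offset=5, length=5.
//
// Lucene43CompoundWordTokenFilterBase (this file):
//   endOff - startOff == termAtt.length(), so the else-branch slices:
//   startOffset = 0 + 5 = 5, endOffset = 5 + 5 = 10
//
// CompoundWordTokenFilterBase (4.4+ version, modified earlier in this diff):
//   the subword keeps the original word's offsets:
//   startOffset = 0, endOffset = 16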
@ -0,0 +1,100 @@
|
|||
package org.apache.lucene.analysis.compound;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * A {@link TokenFilter} that decomposes compound words found in many Germanic languages, using
 * pre-4.4 behavior.
 * @deprecated Use {@link DictionaryCompoundWordTokenFilter}.
 */
@Deprecated
public class Lucene43DictionaryCompoundWordTokenFilter extends Lucene43CompoundWordTokenFilterBase {

  /**
   * Creates a new {@link Lucene43DictionaryCompoundWordTokenFilter}
   *
   * @param input
   *          the {@link TokenStream} to process
   * @param dictionary
   *          the word dictionary to match against.
   */
  public Lucene43DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet dictionary) {
    super(input, dictionary);
    if (dictionary == null) {
      throw new IllegalArgumentException("dictionary cannot be null");
    }
  }

  /**
   * Creates a new {@link Lucene43DictionaryCompoundWordTokenFilter}
   *
   * @param input
   *          the {@link TokenStream} to process
   * @param dictionary
   *          the word dictionary to match against.
   * @param minWordSize
   *          only words longer than this get processed
   * @param minSubwordSize
   *          only subwords longer than this get to the output stream
   * @param maxSubwordSize
   *          only subwords shorter than this get to the output stream
   * @param onlyLongestMatch
   *          Add only the longest matching subword to the stream
   */
  public Lucene43DictionaryCompoundWordTokenFilter(TokenStream input, CharArraySet dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
    if (dictionary == null) {
      throw new IllegalArgumentException("dictionary cannot be null");
    }
  }

  @Override
  protected void decompose() {
    final int len = termAtt.length();
    for (int i=0;i<=len-this.minSubwordSize;++i) {
      CompoundToken longestMatchToken=null;
      for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
        if(i+j>len) {
          break;
        }
        if(dictionary.contains(termAtt.buffer(), i, j)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken!=null) {
              if (longestMatchToken.txt.length()<j) {
                longestMatchToken=new CompoundToken(i,j);
              }
            } else {
              longestMatchToken=new CompoundToken(i,j);
            }
          } else {
            tokens.add(new CompoundToken(i,j));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken!=null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}
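Taken together, the constructors and the decompose() window above are exercised like this. A minimal usage sketch; the sample dictionary, the WhitespaceTokenizer source, and the setReader() wiring are illustrative assumptions, not part of this patch:

    import java.io.StringReader;
    import java.util.Arrays;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.compound.Lucene43DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.util.CharArraySet;

    // Sketch: dictionary-based decompounding with the new Version-free constructors.
    CharArraySet dict = new CharArraySet(Arrays.asList("dampf", "schiff"), true); // ignoreCase
    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("Dampfschiff"));
    TokenStream ts = new Lucene43DictionaryCompoundWordTokenFilter(tok, dict);
    // The filter keeps the original token and injects the matching subwords
    // "dampf" and "schiff" at the same position.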
@ -0,0 +1,217 @@
package org.apache.lucene.analysis.compound;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.util.CharArraySet;
import org.xml.sax.InputSource;

/**
 * A {@link TokenFilter} that decomposes compound words found in many Germanic languages,
 * using pre-4.4 behavior.
 *
 * @deprecated Use {@link HyphenationCompoundWordTokenFilter}.
 */
@Deprecated
public class Lucene43HyphenationCompoundWordTokenFilter extends
    Lucene43CompoundWordTokenFilterBase {
  private HyphenationTree hyphenator;

  /**
   * Creates a new {@link Lucene43HyphenationCompoundWordTokenFilter} instance.
   *
   * @param input
   *          the {@link TokenStream} to process
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against.
   */
  public Lucene43HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, CharArraySet dictionary) {
    this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * Creates a new {@link Lucene43HyphenationCompoundWordTokenFilter} instance.
   *
   * @param input
   *          the {@link TokenStream} to process
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against.
   * @param minWordSize
   *          only words longer than this get processed
   * @param minSubwordSize
   *          only subwords longer than this get to the output stream
   * @param maxSubwordSize
   *          only subwords shorter than this get to the output stream
   * @param onlyLongestMatch
   *          Add only the longest matching subword to the stream
   */
  public Lucene43HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
        onlyLongestMatch);

    this.hyphenator = hyphenator;
  }

  /**
   * Create a Lucene43HyphenationCompoundWordTokenFilter with no dictionary.
   * <p>
   * Calls {@link #Lucene43HyphenationCompoundWordTokenFilter(TokenStream, HyphenationTree, CharArraySet, int, int, int, boolean)
   * Lucene43HyphenationCompoundWordTokenFilter(input, hyphenator,
   * null, minWordSize, minSubwordSize, maxSubwordSize, false)}
   */
  public Lucene43HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
      int maxSubwordSize) {
    this(input, hyphenator, null, minWordSize, minSubwordSize,
        maxSubwordSize, false);
  }

  /**
   * Create a Lucene43HyphenationCompoundWordTokenFilter with no dictionary.
   * <p>
   * Calls {@link #Lucene43HyphenationCompoundWordTokenFilter(TokenStream, HyphenationTree, int, int, int)
   * Lucene43HyphenationCompoundWordTokenFilter(input, hyphenator,
   * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}
   */
  public Lucene43HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator) {
    this(input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
        DEFAULT_MAX_SUBWORD_SIZE);
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationFilename the filename of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws IOException If there is a low-level I/O error.
   */
  public static HyphenationTree getHyphenationTree(String hyphenationFilename)
      throws IOException {
    return getHyphenationTree(new InputSource(hyphenationFilename));
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationFile the file of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws IOException If there is a low-level I/O error.
   */
  public static HyphenationTree getHyphenationTree(File hyphenationFile)
      throws IOException {
    return getHyphenationTree(new InputSource(hyphenationFile.toURI().toASCIIString()));
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationSource the InputSource pointing to the XML grammar
   * @return An object representing the hyphenation patterns
   * @throws IOException If there is a low-level I/O error.
   */
  public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
      throws IOException {
    HyphenationTree tree = new HyphenationTree();
    tree.loadPatterns(hyphenationSource);
    return tree;
  }

  @Override
  protected void decompose() {
    // get the hyphenation points
    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
    // No hyphen points found -> exit
    if (hyphens == null) {
      return;
    }

    final int[] hyp = hyphens.getHyphenationPoints();

    for (int i = 0; i < hyp.length; ++i) {
      int remaining = hyp.length - i;
      int start = hyp[i];
      CompoundToken longestMatchToken = null;
      for (int j = 1; j < remaining; j++) {
        int partLength = hyp[i + j] - start;

        // if the part is longer than maxSubwordSize we
        // are done with this round
        if (partLength > this.maxSubwordSize) {
          break;
        }

        // we only put subwords to the token stream
        // that are longer than minPartSize
        if (partLength < this.minSubwordSize) {
          // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
          // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
          continue;
        }

        // check the dictionary
        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.txt.length() < partLength) {
                longestMatchToken = new CompoundToken(start, partLength);
              }
            } else {
              longestMatchToken = new CompoundToken(start, partLength);
            }
          } else {
            tokens.add(new CompoundToken(start, partLength));
          }
        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
          // check the dictionary again with a word that is one character shorter
          // to avoid problems with genitive 's characters and other binding characters
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.txt.length() < partLength - 1) {
                longestMatchToken = new CompoundToken(start, partLength - 1);
              }
            } else {
              longestMatchToken = new CompoundToken(start, partLength - 1);
            }
          } else {
            tokens.add(new CompoundToken(start, partLength - 1));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken!=null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}
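As a usage sketch for the file above: the static getHyphenationTree() helpers feed straight into the constructors. The pattern file name and the tokenizer wiring below are placeholder assumptions:

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.compound.Lucene43HyphenationCompoundWordTokenFilter;
    import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.xml.sax.InputSource;

    // Sketch: hyphenation-driven decompounding, dictionary-free variant.
    HyphenationTree tree = Lucene43HyphenationCompoundWordTokenFilter
        .getHyphenationTree(new InputSource("de_DR.xml")); // placeholder grammar file
    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("Rindfleischetikettierung"));
    TokenStream ts = new Lucene43HyphenationCompoundWordTokenFilter(tok, tree);
    // With dictionary == null, decompose() emits every part between hyphenation
    // points whose length falls inside [minSubwordSize, maxSubwordSize].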
@ -17,8 +17,6 @@ package org.apache.lucene.analysis.core;
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;

/**
@ -18,13 +18,11 @@ package org.apache.lucene.analysis.core;
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;

/**
 * Emits the entire input as a single token.
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.io.Reader;
import java.util.Map;

/**
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

/**
 * A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
@ -30,41 +29,25 @@ import org.apache.lucene.util.Version;
 * Note: this does a decent job for most European languages, but does a terrible
 * job for some Asian languages, where words are not separated by spaces.
 * </p>
 * <p>
 * <a name="version"/>
 * You must specify the required {@link Version} compatibility when creating
 * {@link LetterTokenizer}:
 * <ul>
 * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
 * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
 * {@link CharTokenizer#normalize(int)} for details.</li>
 * </ul>
 * </p>
 */

public class LetterTokenizer extends CharTokenizer {

  /**
   * Construct a new LetterTokenizer.
   *
   * @param matchVersion
   *          Lucene version to match See {@link <a href="#version">above</a>}
   */
  public LetterTokenizer(Version matchVersion) {
    super(matchVersion);
  public LetterTokenizer() {
  }

  /**
   * Construct a new LetterTokenizer using a given
   * {@link org.apache.lucene.util.AttributeFactory}.
   *
   * @param matchVersion
   *          Lucene version to match See {@link <a href="#version">above</a>}
   * @param factory
   *          the attribute factory to use for this {@link Tokenizer}
   */
  public LetterTokenizer(Version matchVersion, AttributeFactory factory) {
    super(matchVersion, factory);
  public LetterTokenizer(AttributeFactory factory) {
    super(factory);
  }

  /** Collects only characters which satisfy
@ -36,7 +36,6 @@ public class LetterTokenizerFactory extends TokenizerFactory {
  /** Creates a new LetterTokenizerFactory */
  public LetterTokenizerFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }

@ -44,6 +43,6 @@ public class LetterTokenizerFactory extends TokenizerFactory {

  @Override
  public LetterTokenizer create(AttributeFactory factory) {
    return new LetterTokenizer(luceneMatchVersion, factory);
    return new LetterTokenizer(factory);
  }
}
@ -23,30 +23,21 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
 * Normalizes token text to lower case.
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating LowerCaseFilter:
 * <ul>
 *   <li> As of 3.1, supplementary characters are properly lowercased.
 * </ul>
 */
public final class LowerCaseFilter extends TokenFilter {
  private final CharacterUtils charUtils;
  private final CharacterUtils charUtils = CharacterUtils.getInstance();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Create a new LowerCaseFilter, that normalizes token text to lower case.
   *
   * @param matchVersion See <a href="#version">above</a>
   * @param in TokenStream to filter
   */
  public LowerCaseFilter(Version matchVersion, TokenStream in) {
  public LowerCaseFilter(TokenStream in) {
    super(in);
    charUtils = CharacterUtils.getInstance(matchVersion);
  }

  @Override
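A before/after sketch of the LowerCaseFilter change above; the tokenizer wiring and sample text are assumptions for illustration:

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.LowerCaseFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;

    // Before LUCENE-5859: new LowerCaseFilter(matchVersion, in)
    // After:              new LowerCaseFilter(in)
    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("MiXeD CaSe"));
    TokenStream lowercased = new LowerCaseFilter(tok);
    // CharacterUtils.getInstance() now always lowercases supplementary
    // characters correctly; there is no pre-3.1 mode to select.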
@ -40,7 +40,6 @@ public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiT
  /** Creates a new LowerCaseFilterFactory */
  public LowerCaseFilterFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }

@ -48,7 +47,7 @@ public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiT

  @Override
  public LowerCaseFilter create(TokenStream input) {
    return new LowerCaseFilter(luceneMatchVersion,input);
    return new LowerCaseFilter(input);
  }

  @Override
@ -17,13 +17,8 @@ package org.apache.lucene.analysis.core;
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/**
 * LowerCaseTokenizer performs the function of LetterTokenizer

@ -35,41 +30,24 @@ import org.apache.lucene.util.Version;
 * Note: this does a decent job for most European languages, but does a terrible
 * job for some Asian languages, where words are not separated by spaces.
 * </p>
 * <p>
 * <a name="version"/>
 * You must specify the required {@link Version} compatibility when creating
 * {@link LowerCaseTokenizer}:
 * <ul>
 * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
 * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
 * {@link CharTokenizer#normalize(int)} for details.</li>
 * </ul>
 * </p>
 */
public final class LowerCaseTokenizer extends LetterTokenizer {

  /**
   * Construct a new LowerCaseTokenizer.
   *
   * @param matchVersion
   *          Lucene version to match See {@link <a href="#version">above</a>}
   *
   */
  public LowerCaseTokenizer(Version matchVersion) {
    super(matchVersion);
  public LowerCaseTokenizer() {
  }

  /**
   * Construct a new LowerCaseTokenizer using a given
   * {@link org.apache.lucene.util.AttributeFactory}.
   *
   * @param matchVersion
   *          Lucene version to match See {@link <a href="#version">above</a>}
   * @param factory
   *          the attribute factory to use for this {@link Tokenizer}
   */
  public LowerCaseTokenizer(Version matchVersion, AttributeFactory factory) {
    super(matchVersion, factory);
  public LowerCaseTokenizer(AttributeFactory factory) {
    super(factory);
  }

  /** Converts char to lower case
@ -39,7 +39,6 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi
  /** Creates a new LowerCaseTokenizerFactory */
  public LowerCaseTokenizerFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }

@ -47,7 +46,7 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi

  @Override
  public LowerCaseTokenizer create(AttributeFactory factory) {
    return new LowerCaseTokenizer(luceneMatchVersion, factory);
    return new LowerCaseTokenizer(factory);
  }

  @Override
@ -17,38 +17,21 @@ package org.apache.lucene.analysis.core;
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

/** An {@link Analyzer} that filters {@link LetterTokenizer}
 * with {@link LowerCaseFilter}
 * <p>
 * <a name="version">You must specify the required {@link Version} compatibility
 * when creating {@link CharTokenizer}:
 * <ul>
 * <li>As of 3.1, {@link LowerCaseTokenizer} uses an int based API to normalize and
 * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
 * {@link CharTokenizer#normalize(int)} for details.</li>
 * </ul>
 * <p>
 **/
public final class SimpleAnalyzer extends Analyzer {

  private final Version matchVersion;

  /**
   * Creates a new {@link SimpleAnalyzer}
   * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
   */
  public SimpleAnalyzer(Version matchVersion) {
    this.matchVersion = matchVersion;
  public SimpleAnalyzer() {
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName) {
    return new TokenStreamComponents(new LowerCaseTokenizer(matchVersion));
    return new TokenStreamComponents(new LowerCaseTokenizer());
  }
}
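Since the matchVersion field is gone from SimpleAnalyzer (and from WhitespaceAnalyzer below), compatibility is now selected after construction, assuming the Analyzer.setVersion() mechanism that replaces the constructor argument. A minimal sketch; the particular Version constant is a placeholder assumption:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.SimpleAnalyzer;
    import org.apache.lucene.util.Version;

    Analyzer a = new SimpleAnalyzer();    // was: new SimpleAnalyzer(matchVersion)
    a.setVersion(Version.LUCENE_4_9);     // optional; placeholder constant, only
                                          // needed to replicate an older release's behavior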
@ -27,20 +27,10 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;

/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopAnalyzer:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords
 *   <li> As of 2.9, position increments are preserved
 * </ul>
/**
 * Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
 */

public final class StopAnalyzer extends StopwordAnalyzerBase {

  /** An unmodifiable set containing some common English words that are not usually useful

@ -55,40 +45,35 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
      "that", "the", "their", "then", "there", "these",
      "they", "this", "to", "was", "will", "with"
    );
    final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
        stopWords, false);
    final CharArraySet stopSet = new CharArraySet(stopWords, false);
    ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
  }

  /** Builds an analyzer which removes words in
   *  {@link #ENGLISH_STOP_WORDS_SET}.
   * @param matchVersion See <a href="#version">above</a>
   */
  public StopAnalyzer(Version matchVersion) {
    this(matchVersion, ENGLISH_STOP_WORDS_SET);
  public StopAnalyzer() {
    this(ENGLISH_STOP_WORDS_SET);
  }

  /** Builds an analyzer with the stop words from the given set.
   * @param matchVersion See <a href="#version">above</a>
   * @param stopWords Set of stop words */
  public StopAnalyzer(Version matchVersion, CharArraySet stopWords) {
    super(matchVersion, stopWords);
  public StopAnalyzer(CharArraySet stopWords) {
    super(stopWords);
  }

  /** Builds an analyzer with the stop words from the given file.
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion See <a href="#version">above</a>
   * @see WordlistLoader#getWordSet(Reader)
   * @param stopwordsFile File to load stop words from */
  public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
    this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion));
  public StopAnalyzer(File stopwordsFile) throws IOException {
    this(loadStopwordSet(stopwordsFile));
  }

  /** Builds an analyzer with the stop words from the given reader.
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion See <a href="#version">above</a>
   * @see WordlistLoader#getWordSet(Reader)
   * @param stopwords Reader to load stop words from */
  public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
  public StopAnalyzer(Reader stopwords) throws IOException {
    this(loadStopwordSet(stopwords));
  }

  /**

@ -102,9 +87,8 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new LowerCaseTokenizer(matchVersion);
    return new TokenStreamComponents(source, new StopFilter(matchVersion,
        source, stopwords));
    final Tokenizer source = new LowerCaseTokenizer();
    return new TokenStreamComponents(source, new StopFilter(source, stopwords));
  }
}
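End to end, the rewritten StopAnalyzer is consumed like any other analyzer; a minimal sketch using the standard tokenStream()/incrementToken() idiom (the field name and sample text are illustrative):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.StopAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    Analyzer analyzer = new StopAnalyzer();   // ENGLISH_STOP_WORDS_SET by default
    try (TokenStream ts = analyzer.tokenStream("body", "the quick brown fox")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term);             // quick, brown, fox -- "the" is dropped
      }
      ts.end();
    }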
@ -24,19 +24,9 @@ import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/**
 * Removes stop words from a token stream.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopFilter:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords and position
 *        increments are preserved
 * </ul>
 */
public final class StopFilter extends FilteringTokenFilter {

@ -47,17 +37,14 @@ public final class StopFilter extends FilteringTokenFilter {
   * Constructs a filter which removes words from the input TokenStream that are
   * named in the Set.
   *
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
   *          set if Version > 3.0. See <a href="#version">above</a> for details.
   * @param in
   *          Input stream
   * @param stopWords
   *          A {@link CharArraySet} representing the stopwords.
   * @see #makeStopSet(Version, java.lang.String...)
   * @see #makeStopSet(java.lang.String...)
   */
  public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
    super(matchVersion, in);
  public StopFilter(TokenStream in, CharArraySet stopWords) {
    super(in);
    this.stopWords = stopWords;
  }

@ -67,12 +54,11 @@ public final class StopFilter extends FilteringTokenFilter {
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
   */
  public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  public static CharArraySet makeStopSet(String... stopWords) {
    return makeStopSet(stopWords, false);
  }

  /**

@ -81,38 +67,35 @@ public final class StopFilter extends FilteringTokenFilter {
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
   */
  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  public static CharArraySet makeStopSet(List<?> stopWords) {
    return makeStopSet(stopWords, false);
  }

  /**
   * Creates a stopword set from the given stopword array.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
  public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }

  /**
   * Creates a stopword set from the given stopword list.
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   */
  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
  public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase){
    CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
  }
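The makeStopSet() overloads above lose their Version parameter in the same way; a short sketch (the upstream stream variable is assumed):

    import java.util.Arrays;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.StopFilter;
    import org.apache.lucene.analysis.util.CharArraySet;

    CharArraySet stops = StopFilter.makeStopSet("and", "or", "not");           // varargs form
    CharArraySet stopsCI = StopFilter.makeStopSet(Arrays.asList("AND"), true); // ignoreCase
    TokenStream filtered = new StopFilter(upstream, stops);                    // upstream: any TokenStream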
@ -81,7 +81,6 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
  /** Creates a new StopFilterFactory */
  public StopFilterFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    stopWordFiles = get(args, "words");
    format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
    ignoreCase = getBoolean(args, "ignoreCase", false);

@ -104,7 +103,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
      if (null != format) {
        throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
      }
      stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
      stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    }
  }

@ -118,7 +117,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa

  @Override
  public TokenStream create(TokenStream input) {
    StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords);
    StopFilter stopFilter = new StopFilter(input,stopWords);
    return stopFilter;
  }
}
@ -22,7 +22,6 @@ import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.util.Version;

/**
 * Removes tokens whose types appear in a set of blocked types from a token stream.

@ -35,14 +34,13 @@ public final class TypeTokenFilter extends FilteringTokenFilter {

  /**
   * Create a new {@link TypeTokenFilter}.
   * @param version the Lucene match version
   * @param input the {@link TokenStream} to consume
   * @param stopTypes the types to filter
   * @param useWhiteList if true, then tokens whose type is in stopTypes will
   *                     be kept, otherwise they will be filtered out
   */
  public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
    super(version, input);
  public TypeTokenFilter(TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
    super(input);
    this.stopTypes = stopTypes;
    this.useWhiteList = useWhiteList;
  }

@ -50,10 +48,9 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
  /**
   * Create a new {@link TypeTokenFilter} that filters tokens out
   * (useWhiteList=false).
   * @see #TypeTokenFilter(TokenStream, Set, boolean)
   */
  public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) {
    this(version, input, stopTypes, false);
  public TypeTokenFilter(TokenStream input, Set<String> stopTypes) {
    this(input, stopTypes, false);
  }

  /**

@ -72,7 +72,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour

  @Override
  public TokenStream create(TokenStream input) {
    final TokenStream filter = new TypeTokenFilter(luceneMatchVersion, input, stopTypes, useWhitelist);
    final TokenStream filter = new TypeTokenFilter(input, stopTypes, useWhitelist);
    return filter;
  }
}
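A usage sketch of the two constructor forms above; the <NUM> type name matches what StandardTokenizer emits for numeric tokens, and the upstream stream is assumed (use one form or the other, since a stream can only be consumed once):

    import java.util.Collections;
    import java.util.Set;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.TypeTokenFilter;

    Set<String> stopTypes = Collections.singleton("<NUM>");
    // blacklist form (useWhiteList=false): drop numeric tokens
    TokenStream noNumbers = new TypeTokenFilter(upstream, stopTypes);
    // whitelist form: keep only numeric tokens
    TokenStream onlyNumbers = new TypeTokenFilter(upstream, stopTypes, true);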
@ -23,13 +23,9 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
 * Normalizes token text to UPPER CASE.
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating UpperCaseFilter
 *
 * <p><b>NOTE:</b> In Unicode, this transformation may lose information when the
 * upper case character represents more than one lower case character. Use this filter

@ -37,18 +33,16 @@ import org.apache.lucene.util.Version;
 * general search matching
 */
public final class UpperCaseFilter extends TokenFilter {
  private final CharacterUtils charUtils;
  private final CharacterUtils charUtils = CharacterUtils.getInstance();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Create a new UpperCaseFilter, that normalizes token text to upper case.
   *
   * @param matchVersion See <a href="#version">above</a>
   * @param in TokenStream to filter
   */
  public UpperCaseFilter(Version matchVersion, TokenStream in) {
  public UpperCaseFilter(TokenStream in) {
    super(in);
    charUtils = CharacterUtils.getInstance(matchVersion);
  }

  @Override

@ -45,7 +45,6 @@ public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiT
  /** Creates a new UpperCaseFilterFactory */
  public UpperCaseFilterFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }

@ -53,7 +52,7 @@ public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiT

  @Override
  public UpperCaseFilter create(TokenStream input) {
    return new UpperCaseFilter(luceneMatchVersion,input);
    return new UpperCaseFilter(input);
  }

  @Override
@ -17,38 +17,21 @@ package org.apache.lucene.analysis.core;
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

/**
 * An Analyzer that uses {@link WhitespaceTokenizer}.
 * <p>
 * <a name="version">You must specify the required {@link Version} compatibility
 * when creating {@link CharTokenizer}:
 * <ul>
 * <li>As of 3.1, {@link WhitespaceTokenizer} uses an int based API to normalize and
 * detect token codepoints. See {@link CharTokenizer#isTokenChar(int)} and
 * {@link CharTokenizer#normalize(int)} for details.</li>
 * </ul>
 * <p>
 **/
public final class WhitespaceAnalyzer extends Analyzer {

  private final Version matchVersion;

  /**
   * Creates a new {@link WhitespaceAnalyzer}
   * @param matchVersion Lucene version to match See {@link <a href="#version">above</a>}
   */
  public WhitespaceAnalyzer(Version matchVersion) {
    this.matchVersion = matchVersion;
  public WhitespaceAnalyzer() {
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName) {
    return new TokenStreamComponents(new WhitespaceTokenizer(matchVersion));
    return new TokenStreamComponents(new WhitespaceTokenizer());
  }
}
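The same zero-argument pattern as SimpleAnalyzer; a one-line sketch:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

    Analyzer ws = new WhitespaceAnalyzer();   // was: new WhitespaceAnalyzer(matchVersion)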
@ -17,50 +17,31 @@ package org.apache.lucene.analysis.core;
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/**
 * A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
 * Adjacent sequences of non-Whitespace characters form tokens. <a
 * name="version"/>
 * <p>
 * You must specify the required {@link Version} compatibility when creating
 * {@link WhitespaceTokenizer}:
 * <ul>
 * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
 * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
 * {@link CharTokenizer#normalize(int)} for details.</li>
 * </ul>
 * Adjacent sequences of non-Whitespace characters form tokens.
 */
public final class WhitespaceTokenizer extends CharTokenizer {

  /**
   * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version
   * to match See {@link <a href="#version">above</a>}
   *
   * Construct a new WhitespaceTokenizer.
   */
  public WhitespaceTokenizer(Version matchVersion) {
    super(matchVersion);
  public WhitespaceTokenizer() {
  }

  /**
   * Construct a new WhitespaceTokenizer using a given
   * {@link org.apache.lucene.util.AttributeFactory}.
   *
   * @param
   *          matchVersion Lucene version to match See
   *          {@link <a href="#version">above</a>}
   * @param factory
   *          the attribute factory to use for this {@link Tokenizer}
   */
  public WhitespaceTokenizer(Version matchVersion, AttributeFactory factory) {
    super(matchVersion, factory);
  public WhitespaceTokenizer(AttributeFactory factory) {
    super(factory);
  }

  /** Collects only characters which do not satisfy
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.io.Reader;
import java.util.Map;

/**

@ -37,7 +36,6 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
  /** Creates a new WhitespaceTokenizerFactory */
  public WhitespaceTokenizerFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }

@ -45,6 +43,6 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {

  @Override
  public WhitespaceTokenizer create(AttributeFactory factory) {
    return new WhitespaceTokenizer(luceneMatchVersion, factory);
    return new WhitespaceTokenizer(factory);
  }
}
@ -29,7 +29,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

import java.io.*;
import java.nio.charset.StandardCharsets;

@ -61,7 +60,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
  static {
    try {
      DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#");
    } catch (IOException ex) {
      // default set should always be present as it is part of the
      // distribution (JAR)

@ -75,34 +74,30 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {

  /**
   * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
   *
   * @param matchVersion Lucene version to match
   */
  public CzechAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
  public CzechAnalyzer() {
    this(DefaultSetHolder.DEFAULT_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion Lucene version to match
   * @param stopwords a stopword set
   */
  public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  public CzechAnalyzer(CharArraySet stopwords) {
    this(stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words and a set of words to be
   * excluded from the {@link CzechStemFilter}.
   *
   * @param matchVersion Lucene version to match
   * @param stopwords a stopword set
   * @param stemExclusionTable a stemming exclusion set
   */
  public CzechAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
    super(matchVersion, stopwords);
    this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
  public CzechAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable) {
    super(stopwords);
    this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
  }

  /**

@ -115,16 +110,16 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
   * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   * , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
   * a stem exclusion set is provided via
   * {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
   * {@link #CzechAnalyzer(CharArraySet, CharArraySet)} a
   * {@link SetKeywordMarkerFilter} is added before
   * {@link CzechStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer(matchVersion);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter( matchVersion, result, stopwords);
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if(!this.stemExclusionTable.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionTable);
    result = new CzechStemFilter(result);
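A sketch of the three-argument CzechAnalyzer form above; the exclusion word is an illustrative assumption:

    import java.util.Arrays;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.cz.CzechAnalyzer;
    import org.apache.lucene.analysis.util.CharArraySet;

    CharArraySet noStem = new CharArraySet(Arrays.asList("praha"), false);
    Analyzer cs = new CzechAnalyzer(CzechAnalyzer.getDefaultStopSet(), noStem);
    // "praha" is marked by SetKeywordMarkerFilter and skips CzechStemFilter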
@ -34,7 +34,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.DanishStemmer;

/**

@ -64,7 +63,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
  static {
    try {
      DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
    } catch (IOException ex) {
      // default set should always be present as it is part of the
      // distribution (JAR)

@ -76,18 +75,17 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public DanishAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  public DanishAnalyzer() {
    this(DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public DanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  public DanishAnalyzer(CharArraySet stopwords) {
    this(stopwords, CharArraySet.EMPTY_SET);
  }

  /**

@ -95,14 +93,12 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  public DanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
  }

  /**

@ -119,10 +115,10 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer(matchVersion);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new DanishStemmer());
@ -36,7 +36,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for German language.

@ -48,7 +47,7 @@ import org.apache.lucene.util.Version;
 * exclusion list is empty by default.
 * </p>
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * <p><b>NOTE</b>: This class uses the same {@link org.apache.lucene.util.Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
public final class GermanAnalyzer extends StopwordAnalyzerBase {

@ -69,7 +68,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
  static {
    try {
      DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
          DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
    } catch (IOException ex) {
      // default set should always be present as it is part of the
      // distribution (JAR)

@ -91,35 +90,31 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   * Builds an analyzer with the default stop words:
   * {@link #getDefaultStopSet()}.
   */
  public GermanAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
  public GermanAnalyzer() {
    this(DefaultSetHolder.DEFAULT_SET);
  }

  /**
   * Builds an analyzer with the given stop words
   *
   * @param matchVersion
   *          lucene compatibility version
   * @param stopwords
   *          a stopword set
   */
  public GermanAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  public GermanAnalyzer(CharArraySet stopwords) {
    this(stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words
   *
   * @param matchVersion
   *          lucene compatibility version
   * @param stopwords
   *          a stopword set
   * @param stemExclusionSet
   *          a stemming exclusion set
   */
  public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
  public GermanAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(stopwords);
    exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
  }

  /**

@ -135,10 +130,10 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer(matchVersion);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter( matchVersion, result, stopwords);
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    result = new SetKeywordMarkerFilter(result, exclusionSet);
    result = new GermanNormalizationFilter(result);
    result = new GermanLightStemFilter(result);
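And the German counterpart, exercising the stem-exclusion constructor above; the sample word is illustrative:

    import java.util.Arrays;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.analysis.util.CharArraySet;

    CharArraySet keep = new CharArraySet(Arrays.asList("autobahn"), true);
    Analyzer de = new GermanAnalyzer(GermanAnalyzer.getDefaultStopSet(), keep);
    // "Autobahn" is still lowercased and stop-filtered, but is never stemmed
    // by GermanLightStemFilter thanks to the keyword marker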
@ -28,7 +28,6 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for the Greek language.

@ -38,7 +37,7 @@ import org.apache.lucene.util.Version;
 * A default set of stopwords is used unless an alternative list is specified.
 * </p>
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * <p><b>NOTE</b>: This class uses the same {@link org.apache.lucene.util.Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
public final class GreekAnalyzer extends StopwordAnalyzerBase {

@ -69,10 +68,9 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {

  /**
   * Builds an analyzer with the default stop words.
   * @param matchVersion Lucene compatibility version
   */
  public GreekAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
  public GreekAnalyzer() {
    this(DefaultSetHolder.DEFAULT_SET);
  }

  /**

@ -81,11 +79,10 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
   * <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
   * {@link GreekLowerCaseFilter} for best results.
   *
   * @param matchVersion Lucene compatibility version
   * @param stopwords a stopword set
   */
  public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
    super(matchVersion, stopwords);
  public GreekAnalyzer(CharArraySet stopwords) {
    super(stopwords);
  }

  /**

@ -100,10 +97,10 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer(matchVersion);
    TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
    result = new StandardFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new GreekLowerCaseFilter(source);
    result = new StandardFilter(result);
    result = new StopFilter(result, stopwords);
    result = new GreekStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
@ -22,32 +22,22 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
 * Normalizes token text to lower case, removes some Greek diacritics,
 * and standardizes final sigma to sigma.
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating GreekLowerCaseFilter:
 * <ul>
 *   <li> As of 3.1, supplementary characters are properly lowercased.
 * </ul>
 */
public final class GreekLowerCaseFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final CharacterUtils charUtils;
  private final CharacterUtils charUtils = CharacterUtils.getInstance();

  /**
   * Create a GreekLowerCaseFilter that normalizes Greek token text.
   *
   * @param matchVersion Lucene compatibility version,
   *   See <a href="#version">above</a>
   * @param in TokenStream to filter
   */
  public GreekLowerCaseFilter(Version matchVersion, TokenStream in) {
  public GreekLowerCaseFilter(TokenStream in) {
    super(in);
    this.charUtils = CharacterUtils.getInstance(matchVersion);
  }

  @Override

@ -40,7 +40,6 @@ public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements M
  /** Creates a new GreekLowerCaseFilterFactory */
  public GreekLowerCaseFilterFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }

@ -48,7 +47,7 @@ public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements M

  @Override
  public GreekLowerCaseFilter create(TokenStream in) {
    return new GreekLowerCaseFilter(luceneMatchVersion, in);
    return new GreekLowerCaseFilter(in);
  }

  @Override
@@ -1,7 +1,6 @@
 package org.apache.lucene.analysis.el;
 
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
 
 import java.util.Arrays;
 
@@ -205,7 +204,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc4 = new CharArraySet(
       Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
       false);
 
@@ -231,7 +230,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc6 = new CharArraySet(
       Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
           "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
           "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
@@ -256,7 +255,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc7 = new CharArraySet(
       Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
           "πεθ", "πικρ", "ποτ", "σιχ", "χ"),
       false);
@@ -283,11 +282,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc8a = new CharArraySet(
       Arrays.asList("τρ", "τσ"),
       false);
 
-  private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc8b = new CharArraySet(
       Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
           "καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
           "π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
@@ -346,7 +345,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc9 = new CharArraySet(
       Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
           "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
           "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),
@@ -434,11 +433,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc12a = new CharArraySet(
       Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
       false);
 
-  private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc12b = new CharArraySet(
       Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
       false);
 
@@ -458,7 +457,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc13 = new CharArraySet(
       Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
       false);
 
@@ -492,7 +491,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc14 = new CharArraySet(
       Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
           "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
           "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
@@ -530,7 +529,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc15a = new CharArraySet(
       Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
           "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
           "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
@@ -539,7 +538,7 @@ public class GreekStemmer {
           "ουλαμ", "ουρ", "π", "τρ", "μ"),
       false);
 
-  private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc15b = new CharArraySet(
       Arrays.asList("ψοφ", "ναυλοχ"),
       false);
 
@@ -576,7 +575,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc16 = new CharArraySet(
       Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
       false);
 
@@ -596,7 +595,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc17 = new CharArraySet(
       Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
       false);
 
@@ -610,7 +609,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc18 = new CharArraySet(
       Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
       false);
 
@@ -634,7 +633,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_CURRENT,
+  private static final CharArraySet exc19 = new CharArraySet(
       Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
       false);
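The one-argument-shorter CharArraySet constructor recurs throughout this patch; a small sketch of how such a set behaves (values illustrative):

import java.util.Arrays;

import org.apache.lucene.analysis.util.CharArraySet;

public class CharArraySetDemo {
  public static void main(String[] args) {
    // was: new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(...), false)
    CharArraySet exc = new CharArraySet(Arrays.asList("θ", "δ", "ελ"), false);
    System.out.println(exc.contains("ελ"));  // true
    System.out.println(exc.contains("ΕΛ"));  // false: ignoreCase == false
  }
}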
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for English.
@@ -57,18 +56,17 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
    */
-  public EnglishAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public EnglishAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public EnglishAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -76,14 +74,12 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public EnglishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -101,11 +97,11 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new EnglishPossessiveFilter(matchVersion, result);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new EnglishPossessiveFilter(result);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new PorterStemFilter(result);
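A sketch of the migration path for callers, assuming the Analyzer.setVersion() method this change introduces; Version.LUCENE_4_9 stands in for whatever release behavior must be replicated:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class MigrationDemo {
  public static void main(String[] args) throws Exception {
    EnglishAnalyzer analyzer = new EnglishAnalyzer(); // was: new EnglishAnalyzer(matchVersion)
    analyzer.setVersion(Version.LUCENE_4_9);          // only if old-release behavior is needed
    try (TokenStream ts = analyzer.tokenStream("body", "The dog's bones")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());          // e.g. dog, bone
      }
      ts.end();
    }
  }
}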
@@ -22,7 +22,6 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 
 /**
  * TokenFilter that removes possessives (trailing 's) from words.
@@ -30,8 +29,7 @@ import org.apache.lucene.util.Version;
 public final class EnglishPossessiveFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
-  // NOTE: version now unused
-  public EnglishPossessiveFilter(Version version, TokenStream input) {
+  public EnglishPossessiveFilter(TokenStream input) {
     super(input);
   }
 
@@ -39,7 +39,6 @@ public class EnglishPossessiveFilterFactory extends TokenFilterFactory {
   /** Creates a new EnglishPossessiveFilterFactory */
   public EnglishPossessiveFilterFactory(Map<String,String> args) {
     super(args);
-    assureMatchVersion();
    if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -47,6 +46,6 @@ public class EnglishPossessiveFilterFactory extends TokenFilterFactory {
 
   @Override
   public TokenStream create(TokenStream input) {
-    return new EnglishPossessiveFilter(luceneMatchVersion, input);
+    return new EnglishPossessiveFilter(input);
   }
 }
@@ -64,7 +64,6 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 * <p>Copyright: Copyright 2008, Luicid Imagination, Inc. </p>
 * <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu) </p>
 */
-import org.apache.lucene.util.Version;
 
 /**
  * This class implements the Kstem algorithm
@@ -280,7 +279,7 @@ public class KStemmer {
     DictEntry defaultEntry;
     DictEntry entry;
 
-    CharArrayMap<DictEntry> d = new CharArrayMap<>(Version.LUCENE_CURRENT, 1000, false);
+    CharArrayMap<DictEntry> d = new CharArrayMap<>(1000, false);
     for (int i = 0; i < exceptionWords.length; i++) {
       if (!d.containsKey(exceptionWords[i])) {
         entry = new DictEntry(exceptionWords[i], true);
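CharArrayMap follows the same pattern as CharArraySet; a sketch (keys and values illustrative):

import org.apache.lucene.analysis.util.CharArrayMap;

public class CharArrayMapDemo {
  public static void main(String[] args) {
    // was: new CharArrayMap<>(Version.LUCENE_CURRENT, 1000, false)
    CharArrayMap<Integer> map = new CharArrayMap<>(16, true); // ignoreCase == true
    map.put("Walked", 1);
    System.out.println(map.get("walked"));         // 1
    System.out.println(map.containsKey("WALKED")); // true
  }
}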
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Spanish.
@@ -63,7 +62,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -75,18 +74,17 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public SpanishAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public SpanishAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public SpanishAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
  }
 
   /**
@@ -94,14 +92,12 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public SpanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -118,10 +114,10 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SpanishLightStemFilter(result);
@@ -31,7 +31,6 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.BasqueStemmer;
 
 /**
@@ -73,18 +72,17 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public BasqueAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public BasqueAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public BasqueAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -92,14 +90,12 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public BasqueAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public BasqueAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -116,10 +112,10 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new BasqueStemmer());
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Persian.
@@ -87,20 +86,18 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
   * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public PersianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public PersianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public PersianAnalyzer(Version matchVersion, CharArraySet stopwords){
-    super(matchVersion, stopwords);
+  public PersianAnalyzer(CharArraySet stopwords){
+    super(stopwords);
   }
 
   /**
@@ -115,8 +112,8 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);
@@ -124,7 +121,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
   }
 
   /**
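Every analyzer in this patch repeats the same createComponents() shape; a hypothetical custom analyzer, not part of the patch, illustrating the post-change pattern:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;

// Hypothetical analyzer: every component is built without a matchVersion argument.
public final class MyAnalyzer extends StopwordAnalyzerBase {
  public MyAnalyzer(CharArraySet stopwords) {
    super(stopwords); // no matchVersion forwarded to the base class
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new LowerCaseFilter(source);
    result = new StopFilter(result, stopwords);
    return new TokenStreamComponents(source, result);
  }
}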
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.FinnishStemmer;
 
 /**
@@ -64,7 +63,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -76,18 +75,17 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public FinnishAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public FinnishAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public FinnishAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -95,14 +93,12 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public FinnishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -119,10 +115,10 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new FinnishStemmer());
@@ -32,7 +32,6 @@ import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -49,7 +48,7 @@ import java.util.Arrays;
  * exclusion list is empty by default.
  * </p>
  *
- * <p><b>NOTE</b>: This class uses the same {@link Version}
+ * <p><b>NOTE</b>: This class uses the same {@link org.apache.lucene.util.Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
 public final class FrenchAnalyzer extends StopwordAnalyzerBase {
@@ -59,7 +58,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
 
   /** Default set of articles for ElisionFilter */
   public static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
-      new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
+      new CharArraySet(Arrays.asList(
          "l", "m", "t", "qu", "n", "s", "j", "d", "c", "jusqu", "quoiqu", "lorsqu", "puisqu"), true));
 
   /**
@@ -80,7 +79,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -92,37 +91,33 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
    */
-  public FrenchAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public FrenchAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords){
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public FrenchAnalyzer(CharArraySet stopwords){
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclutionSet
    *          a stemming exclusion set
    */
-  public FrenchAnalyzer(Version matchVersion, CharArraySet stopwords,
+  public FrenchAnalyzer(CharArraySet stopwords,
       CharArraySet stemExclutionSet) {
-    super(matchVersion, stopwords);
+    super(stopwords);
     this.excltable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(matchVersion, stemExclutionSet));
+        .copy(stemExclutionSet));
   }
 
   /**
@@ -139,11 +134,11 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!excltable.isEmpty())
       result = new SetKeywordMarkerFilter(result, excltable);
     result = new FrenchLightStemFilter(result);
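FrenchAnalyzer.DEFAULT_ARTICLES is public, so the same elision behavior can be reused directly; a sketch (input text illustrative):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ElisionFilter;

public class ElisionDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader("l'avion"));
    try (TokenStream ts = new ElisionFilter(source, FrenchAnalyzer.DEFAULT_ARTICLES)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString()); // avion (the "l'" article is stripped)
      }
      ts.end();
    }
  }
}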
@@ -32,7 +32,6 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.IrishStemmer;
 
 /**
@@ -45,7 +44,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
   private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
-      new CharArraySet(Version.LUCENE_CURRENT,
+      new CharArraySet(
          Arrays.asList(
              "d", "m", "b"
          ), true));
@@ -56,7 +55,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
   * with phrase queries versus tAthair (which would not have a gap).
    */
   private static final CharArraySet HYPHENATIONS = CharArraySet.unmodifiableSet(
-      new CharArraySet(Version.LUCENE_CURRENT,
+      new CharArraySet(
          Arrays.asList(
              "h", "n", "t"
          ), true));
@@ -91,18 +90,17 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public IrishAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public IrishAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public IrishAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public IrishAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -110,14 +108,12 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public IrishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public IrishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -134,12 +130,12 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new StopFilter(matchVersion, result, HYPHENATIONS);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new StopFilter(result, HYPHENATIONS);
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
     result = new IrishLowerCaseFilter(result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new IrishStemmer());
@@ -33,7 +33,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Galician.
@@ -62,7 +61,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -74,18 +73,17 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public GalicianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public GalicianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public GalicianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -93,14 +91,12 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public GalicianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -117,10 +113,10 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new GalicianStemFilter(result);
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
-import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Hindi.
@@ -75,32 +74,29 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param version lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a stemming exclusion set
    */
-  public HindiAnalyzer(Version version, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(version, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(
-        CharArraySet.copy(matchVersion, stemExclusionSet));
+  public HindiAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param version lucene compatibility version
    * @param stopwords a stopword set
    */
-  public HindiAnalyzer(Version version, CharArraySet stopwords) {
-    this(version, stopwords, CharArraySet.EMPTY_SET);
+  public HindiAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public HindiAnalyzer(Version version) {
-    this(version, DefaultSetHolder.DEFAULT_STOP_SET);
+  public HindiAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
@@ -117,13 +113,13 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
     if (!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new IndicNormalizationFilter(result);
     result = new HindiNormalizationFilter(result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    result = new StopFilter(result, stopwords);
     result = new HindiStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.HungarianStemmer;
 
 /**
@@ -64,7 +63,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -76,18 +75,17 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public HungarianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public HungarianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
   * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public HungarianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -95,14 +93,12 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public HungarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public HungarianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -119,10 +115,10 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new HungarianStemmer());
@@ -28,7 +28,6 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.Version;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Outputs;
@@ -215,7 +214,7 @@ final class Stemmer {
     if (stems.size() < 2) {
       return stems;
     }
-    CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase);
+    CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
     List<CharsRef> deduped = new ArrayList<>();
     for (CharsRef s : stems) {
       if (!terms.contains(s)) {
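The dedup loop above is easy to mirror in isolation; a sketch using the new two-argument CharArraySet constructor (values illustrative):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.util.CharArraySet;

public class DedupDemo {
  public static void main(String[] args) {
    // Keep the first occurrence of each stem, as in Stemmer above.
    CharArraySet seen = new CharArraySet(8, /*ignoreCase=*/ true);
    List<String> deduped = new ArrayList<>();
    for (String s : Arrays.asList("Walk", "walk", "walked")) {
      if (!seen.contains(s)) {
        seen.add(s);
        deduped.add(s);
      }
    }
    System.out.println(deduped); // [Walk, walked]
  }
}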
@@ -31,7 +31,6 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.ArmenianStemmer;
 
 /**
@@ -73,18 +72,17 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public ArmenianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public ArmenianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public ArmenianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -92,14 +90,12 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public ArmenianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public ArmenianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -116,10 +112,10 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new ArmenianStemmer());
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Indonesian (Bahasa)
@@ -69,20 +68,18 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public IndonesianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public IndonesianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
   * @param stopwords
    *          a stopword set
    */
-  public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords){
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public IndonesianAnalyzer(CharArraySet stopwords){
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -90,17 +87,14 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * {@link IndonesianStemFilter}.
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclusionSet
    *          a set of terms not to be stemmed
    */
-  public IndonesianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public IndonesianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet){
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -116,10 +110,10 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if (!stemExclusionSet.isEmpty()) {
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
@@ -36,7 +36,6 @@ import org.apache.lucene.analysis.util.ElisionFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Italian.
@@ -48,7 +47,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
   public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
 
   private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
-      new CharArraySet(Version.LUCENE_CURRENT,
+      new CharArraySet(
          Arrays.asList(
          "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell",
          "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"
@@ -72,7 +71,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -84,18 +83,17 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public ItalianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public ItalianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
    */
-  public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public ItalianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -103,14 +101,12 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public ItalianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -127,11 +123,11 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new ItalianLightStemFilter(result);
@@ -33,7 +33,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Latvian.
@@ -62,7 +61,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
     static {
      try {
         DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -74,18 +73,17 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public LatvianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public LatvianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public LatvianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
  }
 
   /**
@@ -93,14 +91,12 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public LatvianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -117,10 +113,10 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new LatvianStemFilter(result);
@@ -82,7 +82,7 @@ public class CapitalizationFilterFactory extends TokenFilterFactory {
     boolean ignoreCase = getBoolean(args, KEEP_IGNORE_CASE, false);
     Set<String> k = getSet(args, KEEP);
     if (k != null) {
-      keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase);
+      keep = new CharArraySet(10, ignoreCase);
       keep.addAll(k);
     }
 
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.miscellaneous;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 
 /**
  * Removes words that are too long or too short from the stream.
@@ -39,13 +38,12 @@ public final class CodepointCountFilter extends FilteringTokenFilter {
    * Create a new {@link CodepointCountFilter}. This will filter out tokens whose
    * {@link CharTermAttribute} is either too short ({@link Character#codePointCount(char[], int, int)}
    * < min) or too long ({@link Character#codePointCount(char[], int, int)} > max).
-   * @param version the Lucene match version
    * @param in the {@link TokenStream} to consume
    * @param min the minimum length
    * @param max the maximum length
    */
-  public CodepointCountFilter(Version version, TokenStream in, int min, int max) {
-    super(version, in);
+  public CodepointCountFilter(TokenStream in, int min, int max) {
+    super(in);
     if (min < 0) {
       throw new IllegalArgumentException("minimum length must be greater than or equal to zero");
     }
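A sketch of the new CodepointCountFilter signature (assuming the no-arg WhitespaceTokenizer from the same change; input and bounds illustrative). Note the filter counts Unicode code points, not chars, so a surrogate pair counts as one:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CodepointCountDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("a ab abc abcd"));
    // was: new CodepointCountFilter(version, source, 2, 3)
    try (TokenStream ts = new CodepointCountFilter(source, 2, 3)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString()); // ab, abc
      }
      ts.end();
    }
  }
}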
@@ -50,6 +50,6 @@ public class CodepointCountFilterFactory extends TokenFilterFactory {
 
   @Override
   public CodepointCountFilter create(TokenStream input) {
-    return new CodepointCountFilter(luceneMatchVersion, input, min, max);
+    return new CodepointCountFilter(input, min, max);
   }
 }
@@ -21,7 +21,6 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
 
 /**
  * A TokenFilter that only keeps tokens with text contained in the
@@ -37,12 +36,11 @@ public final class KeepWordFilter extends FilteringTokenFilter {
   * Create a new {@link KeepWordFilter}.
    * <p><b>NOTE</b>: The words set passed to this constructor will be directly
    * used by this filter and should not be modified.
-   * @param version the Lucene match version
    * @param in the {@link TokenStream} to consume
    * @param words the words to keep
    */
-  public KeepWordFilter(Version version, TokenStream in, CharArraySet words) {
-    super(version, in);
+  public KeepWordFilter(TokenStream in, CharArraySet words) {
+    super(in);
     this.words = words;
   }
 
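KeepWordFilter follows the same one-argument-shorter pattern; a sketch (words and input illustrative, WhitespaceTokenizer assumed no-arg as above):

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class KeepWordDemo {
  public static void main(String[] args) throws Exception {
    CharArraySet words = new CharArraySet(Arrays.asList("lucene", "search"), true);
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("lucene makes search fast"));
    try (TokenStream ts = new KeepWordFilter(source, words)) { // was: (version, source, words)
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString()); // lucene, search
      }
      ts.end();
    }
  }
}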
@ -44,7 +44,6 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
|
|||
/** Creates a new KeepWordFilterFactory */
|
||||
public KeepWordFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
assureMatchVersion();
|
||||
wordFiles = get(args, "words");
|
||||
ignoreCase = getBoolean(args, "ignoreCase", false);
|
||||
if (!args.isEmpty()) {
|
||||
|
@ -73,7 +72,7 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
|
|||
if (words == null) {
|
||||
return input;
|
||||
} else {
|
||||
final TokenStream filter = new KeepWordFilter(luceneMatchVersion, input, words);
|
||||
final TokenStream filter = new KeepWordFilter(input, words);
|
||||
return filter;
|
||||
}
|
||||
}
|
||||
|
|
|
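The keep set itself is now built without a Version argument as well. A hedged sketch, with illustrative class name and word list:

    import java.io.StringReader;
    import java.util.Arrays;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;

    public class KeepWordDemo {
      public static void main(String[] args) throws Exception {
        // CharArraySet also lost its Version parameter in this change
        CharArraySet keep = new CharArraySet(Arrays.asList("lucene", "search"), true);
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("Lucene is a search library"));
        TokenStream ts = new KeepWordFilter(tokenizer, keep);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term); // Lucene, search (ignoreCase = true)
        }
        ts.end();
        ts.close();
      }
    }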
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/**
* Removes words that are too long or too short from the stream.

@ -39,13 +38,12 @@ public final class LengthFilter extends FilteringTokenFilter {
* Create a new {@link LengthFilter}. This will filter out tokens whose
* {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()}
* < min) or too long ({@link CharTermAttribute#length()} > max).
* @param version the Lucene match version
* @param in the {@link TokenStream} to consume
* @param min the minimum length
* @param max the maximum length
*/
public LengthFilter(Version version, TokenStream in, int min, int max) {
super(version, in);
public LengthFilter(TokenStream in, int min, int max) {
super(in);
if (min < 0) {
throw new IllegalArgumentException("minimum length must be greater than or equal to zero");
}

@ -50,7 +50,7 @@ public class LengthFilterFactory extends TokenFilterFactory {
@Override
public LengthFilter create(TokenStream input) {
final LengthFilter filter = new LengthFilter(luceneMatchVersion, input,min,max);
final LengthFilter filter = new LengthFilter(input,min,max);
return filter;
}
}
@ -22,7 +22,6 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import java.io.IOException;

@ -34,8 +33,7 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

// use a fixed version, as we don't care about case sensitivity.
private final CharArraySet previous = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
private final CharArraySet previous = new CharArraySet(8, false);

/**
* Creates a new RemoveDuplicatesTokenFilter
@ -20,15 +20,11 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

import java.io.IOException;

/**
* Trims leading and trailing whitespace from Tokens in the stream.
* <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
* as it can lead to broken token streams.
*/
public final class TrimFilter extends TokenFilter {

@ -36,10 +32,9 @@ public final class TrimFilter extends TokenFilter {
/**
* Create a new {@link TrimFilter}.
* @param version the Lucene match version
* @param in the stream to consume
*/
public TrimFilter(Version version, TokenStream in) {
public TrimFilter(TokenStream in) {
super(in);
}

@ -47,7 +47,7 @@ public class TrimFilterFactory extends TokenFilterFactory {
@Override
public TrimFilter create(TokenStream input) {
final TrimFilter filter = new TrimFilter(luceneMatchVersion, input);
final TrimFilter filter = new TrimFilter(input);
return filter;
}
}
@ -30,7 +30,6 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.util.Arrays;

@ -206,11 +205,8 @@ public final class WordDelimiterFilter extends TokenFilter {
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterFilter(Version matchVersion, TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
if (!matchVersion.onOrAfter(Version.LUCENE_4_8)) {
throw new IllegalArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
}
this.flags = configurationFlags;
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(

@ -225,8 +221,8 @@ public final class WordDelimiterFilter extends TokenFilter {
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterFilter(Version matchVersion, TokenStream in, int configurationFlags, CharArraySet protWords) {
this(matchVersion, in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}

@Override

@ -119,7 +119,7 @@ public class WordDelimiterFilterFactory extends TokenFilterFactory implements Re
@Override
public TokenFilter create(TokenStream input) {
if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_8)) {
return new WordDelimiterFilter(luceneMatchVersion, input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
flags, protectedWords);
} else {
return new Lucene47WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
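Direct construction now takes only the stream, flags, and protected-words set; the factory still consults luceneMatchVersion to fall back to Lucene47WordDelimiterFilter. A hedged sketch with illustrative flags and input:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class WordDelimiterDemo {
      public static void main(String[] args) throws Exception {
        // flags are unchanged; only the Version parameter is gone
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot"));
        TokenStream ts = new WordDelimiterFilter(tokenizer, flags, null); // no protected words
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term); // Power, Shot
        }
        ts.end();
        ts.close();
      }
    }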
@ -18,8 +18,11 @@ package org.apache.lucene.analysis.ngram;
*/

import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.Version;

/**
* Creates new instances of {@link EdgeNGramTokenFilter}.

@ -46,7 +49,10 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory {
}

@Override
public EdgeNGramTokenFilter create(TokenStream input) {
return new EdgeNGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
public TokenFilter create(TokenStream input) {
if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4)) {
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
}
return new Lucene43EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
}
}
@ -26,7 +26,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
* Tokenizes the given token into n-grams of given size(s).

@ -59,18 +58,13 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* @param version the Lucene match version
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(input);

if (version == null) {
throw new IllegalArgumentException("version must not be null");
}

if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}

@ -79,9 +73,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}

this.charUtils = version.onOrAfter(Version.LUCENE_4_4)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
this.charUtils = CharacterUtils.getInstance();
this.minGram = minGram;
this.maxGram = maxGram;
}
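A minimal sketch of the simplified constructor, assuming the post-change trunk API; input text and gram range are illustrative:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class EdgeNGramDemo {
      public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("apache"));
        // emits the leading 1..3 character prefixes of each token
        TokenStream ts = new EdgeNGramTokenFilter(tokenizer, 1, 3);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term); // a, ap, apa
        }
        ts.end();
        ts.close();
      }
    }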
@ -17,8 +17,6 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/

import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

@ -38,24 +36,22 @@ public class EdgeNGramTokenizer extends NGramTokenizer {
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the Lucene match version
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, int minGram, int maxGram) {
super(version, minGram, maxGram, true);
public EdgeNGramTokenizer(int minGram, int maxGram) {
super(minGram, maxGram, true);
}

/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the Lucene match version
* @param factory {@link org.apache.lucene.util.AttributeFactory} to use
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, AttributeFactory factory, int minGram, int maxGram) {
super(version, factory, minGram, maxGram, true);
public EdgeNGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
super(factory, minGram, maxGram, true);
}

}
@ -17,8 +17,10 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

import java.io.Reader;
import java.util.Map;

@ -47,7 +49,10 @@ public class EdgeNGramTokenizerFactory extends TokenizerFactory {
}

@Override
public EdgeNGramTokenizer create(AttributeFactory factory) {
return new EdgeNGramTokenizer(luceneMatchVersion, factory, minGramSize, maxGramSize);
public Tokenizer create(AttributeFactory factory) {
if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4)) {
return new EdgeNGramTokenizer(factory, minGramSize, maxGramSize);
}
return new Lucene43NGramTokenizer(factory, minGramSize, maxGramSize);
}
}
@ -0,0 +1,126 @@
package org.apache.lucene.analysis.ngram;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;

import java.io.IOException;

/**
* Tokenizes the given token into n-grams of given size(s), using pre-4.4 behavior.
*
* @deprecated Use {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter}.
*/
@Deprecated
public final class Lucene43EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;

private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
private int savePosIncr;
private int savePosLen;

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);

/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* @param input {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public Lucene43EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(input);

if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}

if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}

this.charUtils = CharacterUtils.getJava4Instance();
this.minGram = minGram;
this.maxGram = maxGram;
}

@Override
public final boolean incrementToken() throws IOException {
while (true) {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
savePosIncr += posIncrAtt.getPositionIncrement();
savePosLen = posLenAtt.getPositionLength();
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
clearAttributes();
offsetAtt.setOffset(tokStart, tokEnd);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
savePosIncr = 0;
} else {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
}
curTermBuffer = null;
}
}

@Override
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
savePosIncr = 0;
}
}
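Applications that previously passed a pre-4.4 Version can opt into the old stream shape explicitly via the deprecated class. A hedged sketch, sample text illustrative:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.Lucene43EdgeNGramTokenFilter;

    public class Pre44EdgeNGramDemo {
      public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("apache"));
        // explicit opt-in to pre-4.4 behavior instead of passing an old Version
        TokenStream ts = new Lucene43EdgeNGramTokenFilter(tokenizer, 1, 3);
        ts.reset();
        while (ts.incrementToken()) { /* consume */ }
        ts.end();
        ts.close();
      }
    }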
@ -0,0 +1,53 @@
package org.apache.lucene.analysis.ngram;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.util.AttributeFactory;

/**
* Tokenizes the input from an edge into n-grams of given size(s), using pre-4.4 behavior.
*
* @deprecated Use {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer}.
*/
@Deprecated
public class Lucene43EdgeNGramTokenizer extends Lucene43NGramTokenizer {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;

/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public Lucene43EdgeNGramTokenizer(int minGram, int maxGram) {
super(minGram, maxGram);
}

/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param factory {@link org.apache.lucene.util.AttributeFactory} to use
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public Lucene43EdgeNGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
super(factory, minGram, maxGram);
}

}
@ -0,0 +1,150 @@
package org.apache.lucene.analysis.ngram;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;

import java.io.IOException;

/**
* Tokenizes the input into n-grams of the given size(s), matching Lucene 4.3 and before behavior.
*
* @deprecated Use {@link org.apache.lucene.analysis.ngram.NGramTokenFilter} instead.
*/
@Deprecated
public final class Lucene43NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

private final int minGram, maxGram;

private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc, curPosLen;
private int tokStart;
private int tokEnd;
private boolean hasIllegalOffsets; // only if the length changed before this filter

private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

/**
* Creates Lucene43NGramTokenFilter with given min and max n-grams.
* @param input {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public Lucene43NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
this.charUtils = CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.minGram = minGram;
this.maxGram = maxGram;

posIncAtt = new PositionIncrementAttribute() {
@Override
public void setPositionIncrement(int positionIncrement) {}
@Override
public int getPositionIncrement() {
return 0;
}
};
posLenAtt = new PositionLengthAttribute() {
@Override
public void setPositionLength(int positionLength) {}
@Override
public int getPositionLength() {
return 0;
}
};
}

/**
* Creates NGramTokenFilter with default min and max n-grams.
* @param input {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
*/
public Lucene43NGramTokenFilter(TokenStream input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}

/** Returns the next token in the stream, or null at EOS. */
@Override
public final boolean incrementToken() throws IOException {
while (true) {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
curPosLen = posLenAtt.getPositionLength();
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
// if length by start + end offsets doesn't match the term text then assume
// this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}

while (curGramSize <= maxGram) {
while (curPos+curGramSize <= curTermLength) { // while there is input
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
if (hasIllegalOffsets) {
offsetAtt.setOffset(tokStart, tokEnd);
} else {
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
}
curPos++;
return true;
}
curGramSize++; // increase n-gram size
curPos = 0;
}
curTermBuffer = null;
}
}

@Override
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
}
}
@ -29,7 +29,7 @@ import org.apache.lucene.util.AttributeFactory;
* Old broken version of {@link NGramTokenizer}.
*/
@Deprecated
public final class Lucene43NGramTokenizer extends Tokenizer {
public class Lucene43NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
@ -18,8 +18,11 @@ package org.apache.lucene.analysis.ngram;
*/

import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.Version;

/**
* Factory for {@link NGramTokenFilter}.

@ -46,7 +49,10 @@ public class NGramFilterFactory extends TokenFilterFactory {
}

@Override
public NGramTokenFilter create(TokenStream input) {
return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
public TokenFilter create(TokenStream input) {
if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4)) {
return new NGramTokenFilter(input, minGramSize, maxGramSize);
}
return new Lucene43NGramTokenFilter(input, minGramSize, maxGramSize);
}
}
@ -27,21 +27,18 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
* Tokenizes the input into n-grams of the given size(s).
* <a name="version"/>
* <p>You must specify the required {@link Version} compatibility when
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filters:<ul>
* As of Lucene 4.4, this token filter:<ul>
* <li>handles supplementary characters correctly,</li>
* <li>emits all n-grams for the same token at the same position,</li>
* <li>does not modify offsets,</li>
* <li>sorts n-grams by their offset in the original token first, then
* increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
* "c").</li></ul>
* <p>You can make this filter use the old behavior by providing a version <
* {@link Version#LUCENE_4_4} in the constructor but this is not recommended as
* <p>You can make this filter use the old behavior by using
* {@link org.apache.lucene.analysis.ngram.Lucene43NGramTokenFilter} but this is not recommended as
* it will lead to broken {@link TokenStream}s that will cause highlighting
* bugs.
* <p>If you were using this {@link TokenFilter} to perform partial highlighting,

@ -65,7 +62,6 @@ public final class NGramTokenFilter extends TokenFilter {
private int tokEnd;
private boolean hasIllegalOffsets; // only if the length changed before this filter

private final Version version;
private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;

@ -74,18 +70,13 @@ public final class NGramTokenFilter extends TokenFilter {
/**
* Creates NGramTokenFilter with given min and max n-grams.
* @param version Lucene version to enable correct position increments.
* See <a href="#version">above</a> for details.
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
this.charUtils = version.onOrAfter(Version.LUCENE_4_4)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
this.charUtils = CharacterUtils.getInstance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}

@ -94,37 +85,17 @@ public final class NGramTokenFilter extends TokenFilter {
}
this.minGram = minGram;
this.maxGram = maxGram;
if (version.onOrAfter(Version.LUCENE_4_4)) {

posIncAtt = addAttribute(PositionIncrementAttribute.class);
posLenAtt = addAttribute(PositionLengthAttribute.class);
} else {
posIncAtt = new PositionIncrementAttribute() {
@Override
public void setPositionIncrement(int positionIncrement) {}
@Override
public int getPositionIncrement() {
return 0;
}
};
posLenAtt = new PositionLengthAttribute() {
@Override
public void setPositionLength(int positionLength) {}
@Override
public int getPositionLength() {
return 0;
}
};
}
}

/**
* Creates NGramTokenFilter with default min and max n-grams.
* @param version Lucene version to enable correct position increments.
* See <a href="#version">above</a> for details.
* @param input {@link TokenStream} holding the input to be tokenized
*/
public NGramTokenFilter(Version version, TokenStream input) {
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
public NGramTokenFilter(TokenStream input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}

/** Returns the next token in the stream, or null at EOS. */

@ -149,7 +120,7 @@ public final class NGramTokenFilter extends TokenFilter {
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
if (version.onOrAfter(Version.LUCENE_4_4)) {

if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
++curPos;
curGramSize = minGram;

@ -166,23 +137,6 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize++;
return true;
}
} else {
while (curGramSize <= maxGram) {
while (curPos+curGramSize <= curTermLength) { // while there is input
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
if (hasIllegalOffsets) {
offsetAtt.setOffset(tokStart, tokEnd);
} else {
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
}
curPos++;
return true;
}
curGramSize++; // increase n-gram size
curPos = 0;
}
}
curTermBuffer = null;
}
}
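A minimal sketch of the simplified filter, assuming the post-change trunk API; the expected output order follows the 4.4+ behavior documented in the javadoc above (offset first, then length):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.NGramTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class NGramFilterDemo {
      public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("abc"));
        TokenStream ts = new NGramTokenFilter(tokenizer, 1, 2);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term); // a, ab, b, bc, c
        }
        ts.end();
        ts.close();
      }
    }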
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.ngram;
*/

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

@ -27,7 +26,6 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

/**
* Tokenizes the input into n-grams of the given size(s).

@ -78,51 +76,43 @@ public class NGramTokenizer extends Tokenizer {
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

NGramTokenizer(Version version, int minGram, int maxGram, boolean edgesOnly) {
init(version, minGram, maxGram, edgesOnly);
NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) {
init(minGram, maxGram, edgesOnly);
}

/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, int minGram, int maxGram) {
this(version, minGram, maxGram, false);
public NGramTokenizer(int minGram, int maxGram) {
this(minGram, maxGram, false);
}

NGramTokenizer(Version version, AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
super(factory);
init(version, minGram, maxGram, edgesOnly);
init(minGram, maxGram, edgesOnly);
}

/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
* @param factory {@link org.apache.lucene.util.AttributeFactory} to use
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, AttributeFactory factory, int minGram, int maxGram) {
this(version, factory, minGram, maxGram, false);
public NGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
this(factory, minGram, maxGram, false);
}

/**
* Creates NGramTokenizer with default min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
*/
public NGramTokenizer(Version version) {
this(version, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
public NGramTokenizer() {
this(DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}

private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
if (!edgesOnly && !version.onOrAfter(Version.LUCENE_4_4)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
}
charUtils = version.onOrAfter(Version.LUCENE_4_4)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
private void init(int minGram, int maxGram, boolean edgesOnly) {
charUtils = CharacterUtils.getInstance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
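A usage sketch of the Reader-less tokenizer, assuming the post-change trunk API; input text and gram range are illustrative:

    import java.io.StringReader;
    import org.apache.lucene.analysis.ngram.NGramTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class NGramTokenizerDemo {
      public static void main(String[] args) throws Exception {
        NGramTokenizer tokenizer = new NGramTokenizer(2, 3); // bigrams and trigrams
        tokenizer.setReader(new StringReader("body"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
          System.out.println(term); // bo, bod, od, ody, dy
        }
        tokenizer.end();
        tokenizer.close();
      }
    }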
@ -53,7 +53,7 @@ public class NGramTokenizerFactory extends TokenizerFactory {
@Override
public Tokenizer create(AttributeFactory factory) {
if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4)) {
return new NGramTokenizer(luceneMatchVersion, factory, minGramSize, maxGramSize);
return new NGramTokenizer(factory, minGramSize, maxGramSize);
} else {
return new Lucene43NGramTokenizer(factory, minGramSize, maxGramSize);
}
@ -28,13 +28,11 @@ import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;

@ -50,10 +48,8 @@ import java.nio.charset.StandardCharsets;
* A default set of stopwords is used unless an alternative list is specified, but the
* exclusion list is empty by default.
* </p>
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
// TODO: extend StopwordAnalyzerBase
public final class DutchAnalyzer extends Analyzer {

/** File containing default Dutch stopwords. */

@ -73,14 +69,14 @@ public final class DutchAnalyzer extends Analyzer {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}

DEFAULT_STEM_DICT = new CharArrayMap<>(Version.LUCENE_CURRENT, 4, false);
DEFAULT_STEM_DICT = new CharArrayMap<>(4, false);
DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet
DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
DEFAULT_STEM_DICT.put("ei", "eier");

@ -100,29 +96,27 @@ public final class DutchAnalyzer extends Analyzer {
private CharArraySet excltable = CharArraySet.EMPTY_SET;

private final StemmerOverrideMap stemdict;
private final Version matchVersion;

/**
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()})
* and a few default entries for the stem exclusion table.
*
*/
public DutchAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
public DutchAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
}

public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
public DutchAnalyzer(CharArraySet stopwords){
this(stopwords, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
}

public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
this(matchVersion, stopwords, stemExclusionTable, DefaultSetHolder.DEFAULT_STEM_DICT);
public DutchAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable){
this(stopwords, stemExclusionTable, DefaultSetHolder.DEFAULT_STEM_DICT);
}

public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
public DutchAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
if (stemOverrideDict.isEmpty()) {
this.stemdict = null;
} else {

@ -154,10 +148,10 @@ public final class DutchAnalyzer extends Analyzer {
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stoptable);
final Tokenizer source = new StandardTokenizer();
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stoptable);
if (!excltable.isEmpty())
result = new SetKeywordMarkerFilter(result, excltable);
if (stemdict != null)
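The migration pattern is the same for every analyzer touched by this commit: drop the Version constructor argument and, if old behavior must be replicated, set it afterwards via Analyzer.setVersion(). A sketch, with Version.LUCENE_4_8 as an illustrative example release:

    import org.apache.lucene.analysis.nl.DutchAnalyzer;
    import org.apache.lucene.util.Version;

    public class AnalyzerMigrationDemo {
      public static void main(String[] args) {
        // before: new DutchAnalyzer(Version.LUCENE_4_8)
        DutchAnalyzer analyzer = new DutchAnalyzer();
        // to replicate the behavior of a specific release, set the version separately
        analyzer.setVersion(Version.LUCENE_4_8);
      }
    }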
@ -34,7 +34,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.NorwegianStemmer;

/**

@ -64,7 +63,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

@ -76,18 +75,17 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public NorwegianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
public NorwegianAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}

/**
* Builds an analyzer with the given stop words.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
*/
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
public NorwegianAnalyzer(CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET);
}

/**

@ -95,14 +93,12 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public NorwegianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
public NorwegianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}

/**

@ -119,10 +115,10 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
final Tokenizer source = new StandardTokenizer();
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new NorwegianStemmer());
@ -34,7 +34,6 @@ import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
* {@link Analyzer} for Portuguese.

@ -63,7 +62,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

@ -75,18 +74,17 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public PortugueseAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
public PortugueseAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}

/**
* Builds an analyzer with the given stop words.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
*/
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
public PortugueseAnalyzer(CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET);
}

/**

@ -94,14 +92,12 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
public PortugueseAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}

/**

@ -118,10 +114,10 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer(matchVersion);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
final Tokenizer source = new StandardTokenizer();
TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new PortugueseLightStemFilter(result);
@ -31,7 +31,6 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import static org.apache.lucene.analysis.util.StemmerUtil.*;

@ -135,8 +134,7 @@ public abstract class RSLPStemmerBase {
if (!exceptions[i].endsWith(suffix))
throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(exceptions), false);
this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}

@Override
@ -31,7 +31,6 @@ import org.apache.lucene.index.TermsEnum;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
|
||||
|
@ -50,23 +49,20 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
|
|||
//The default maximum percentage (40%) of index documents which
|
||||
//can contain a term, after which the term is considered to be a stop word.
|
||||
public static final float defaultMaxDocFreqPercent = 0.4f;
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
|
||||
* indexed fields from terms with a document frequency percentage greater than
|
||||
* {@link #defaultMaxDocFreqPercent}
|
||||
*
|
||||
* @param matchVersion Version to be used in {@link StopFilter}
|
||||
* @param delegate Analyzer whose TokenStream will be filtered
|
||||
* @param indexReader IndexReader to identify the stopwords from
|
||||
* @throws IOException Can be thrown while reading from the IndexReader
|
||||
*/
|
||||
public QueryAutoStopWordAnalyzer(
|
||||
Version matchVersion,
|
||||
Analyzer delegate,
|
||||
IndexReader indexReader) throws IOException {
|
||||
this(matchVersion, delegate, indexReader, defaultMaxDocFreqPercent);
|
||||
this(delegate, indexReader, defaultMaxDocFreqPercent);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -74,18 +70,16 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
|
|||
* indexed fields from terms with a document frequency greater than the given
|
||||
* maxDocFreq
|
||||
*
|
||||
* @param matchVersion Version to be used in {@link StopFilter}
|
||||
* @param delegate Analyzer whose TokenStream will be filtered
|
||||
* @param indexReader IndexReader to identify the stopwords from
|
||||
* @param maxDocFreq Document frequency terms should be above in order to be stopwords
|
||||
* @throws IOException Can be thrown while reading from the IndexReader
|
||||
*/
|
||||
public QueryAutoStopWordAnalyzer(
|
||||
Version matchVersion,
|
||||
Analyzer delegate,
|
||||
IndexReader indexReader,
|
||||
int maxDocFreq) throws IOException {
|
||||
-    this(matchVersion, delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxDocFreq);
+    this(delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxDocFreq);
   }
 
   /**
@@ -93,7 +87,6 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * indexed fields from terms with a document frequency percentage greater than
    * the given maxPercentDocs
    *
-   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
@@ -101,11 +94,10 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
-      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader,
       float maxPercentDocs) throws IOException {
-    this(matchVersion, delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxPercentDocs);
+    this(delegate, indexReader, MultiFields.getIndexedFields(indexReader), maxPercentDocs);
   }
 
   /**
@@ -113,7 +105,6 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * given selection of fields from terms with a document frequency percentage
    * greater than the given maxPercentDocs
    *
-   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @param fields Selection of fields to calculate stopwords for
@@ -122,12 +113,11 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
-      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader,
       Collection<String> fields,
       float maxPercentDocs) throws IOException {
-    this(matchVersion, delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
+    this(delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
   }
 
   /**
@@ -135,7 +125,6 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * given selection of fields from terms with a document frequency greater than
    * the given maxDocFreq
    *
-   * @param matchVersion Version to be used in {@link StopFilter}
    * @param delegate Analyzer whose TokenStream will be filtered
    * @param indexReader IndexReader to identify the stopwords from
    * @param fields Selection of fields to calculate stopwords for
@@ -143,13 +132,11 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
    * @throws IOException Can be thrown while reading from the IndexReader
    */
   public QueryAutoStopWordAnalyzer(
-      Version matchVersion,
       Analyzer delegate,
       IndexReader indexReader,
       Collection<String> fields,
       int maxDocFreq) throws IOException {
     super(delegate.getReuseStrategy());
-    this.matchVersion = matchVersion;
     this.delegate = delegate;
 
     for (String field : fields) {
@@ -181,8 +168,8 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
     if (stopWords == null) {
       return components;
     }
-    StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(),
-        new CharArraySet(matchVersion, stopWords, false));
+    StopFilter stopFilter = new StopFilter(components.getTokenStream(),
+        new CharArraySet(stopWords, false));
     return new TokenStreamComponents(components.getTokenizer(), stopFilter);
   }
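For callers, the Version argument simply disappears from each overload. A minimal sketch of the new construction, assuming an existing index in a Directory named dir; the 0.4f threshold is an illustrative choice, not anything this patch prescribes:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;

class AutoStopWordSketch {
  // Terms occurring in more than 40% of documents become query-time stopwords.
  static Analyzer build(Directory dir) throws IOException {
    IndexReader reader = DirectoryReader.open(dir);
    return new QueryAutoStopWordAnalyzer(new StandardAnalyzer(), reader, 0.4f);
  }
}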
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.reverse;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 
@@ -36,7 +35,6 @@ public final class ReverseStringFilter extends TokenFilter {
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final char marker;
-  private final Version matchVersion;
   private static final char NOMARKER = '\uFFFF';
 
   /**
@@ -66,11 +64,10 @@ public final class ReverseStringFilter extends TokenFilter {
    * The reversed tokens will not be marked.
    * </p>
    *
-   * @param matchVersion Lucene compatibility version
    * @param in {@link TokenStream} to filter
    */
-  public ReverseStringFilter(Version matchVersion, TokenStream in) {
-    this(matchVersion, in, NOMARKER);
+  public ReverseStringFilter(TokenStream in) {
+    this(in, NOMARKER);
   }
 
   /**
@@ -81,13 +78,11 @@ public final class ReverseStringFilter extends TokenFilter {
    * character.
    * </p>
    *
-   * @param matchVersion compatibility version
    * @param in {@link TokenStream} to filter
    * @param marker A character used to mark reversed tokens
    */
-  public ReverseStringFilter(Version matchVersion, TokenStream in, char marker) {
+  public ReverseStringFilter(TokenStream in, char marker) {
     super(in);
-    this.matchVersion = matchVersion;
     this.marker = marker;
   }
 
@@ -100,7 +95,7 @@ public final class ReverseStringFilter extends TokenFilter {
       termAtt.resizeBuffer(len);
       termAtt.buffer()[len - 1] = marker;
     }
-    reverse( matchVersion, termAtt.buffer(), 0, len );
+    reverse( termAtt.buffer(), 0, len );
     termAtt.setLength(len);
     return true;
   } else {
@@ -111,48 +106,43 @@ public final class ReverseStringFilter extends TokenFilter {
   /**
    * Reverses the given input string
    *
-   * @param matchVersion compatibility version
    * @param input the string to reverse
    * @return the given input string in reversed order
    */
-  public static String reverse( Version matchVersion, final String input ){
+  public static String reverse( final String input ){
     final char[] charInput = input.toCharArray();
-    reverse( matchVersion, charInput, 0, charInput.length );
+    reverse( charInput, 0, charInput.length );
     return new String( charInput );
   }
 
   /**
    * Reverses the given input buffer in-place
-   * @param matchVersion compatibility version
    * @param buffer the input char array to reverse
    */
-  public static void reverse(Version matchVersion, final char[] buffer) {
-    reverse(matchVersion, buffer, 0, buffer.length);
+  public static void reverse(final char[] buffer) {
+    reverse(buffer, 0, buffer.length);
   }
 
   /**
    * Partially reverses the given input buffer in-place from offset 0
    * up to the given length.
-   * @param matchVersion compatibility version
    * @param buffer the input char array to reverse
    * @param len the length in the buffer up to where the
    *        buffer should be reversed
    */
-  public static void reverse(Version matchVersion, final char[] buffer,
-      final int len) {
-    reverse( matchVersion, buffer, 0, len );
+  public static void reverse(final char[] buffer, final int len) {
+    reverse( buffer, 0, len );
   }
 
   /**
    * Partially reverses the given input buffer in-place from the given offset
    * up to the given length.
-   * @param matchVersion compatibility version
    * @param buffer the input char array to reverse
    * @param start the offset from where to reverse the buffer
    * @param len the length in the buffer up to where the
    *        buffer should be reversed
    */
-  public static void reverse(Version matchVersion, final char[] buffer,
+  public static void reverse(final char[] buffer,
       final int start, final int len) {
     /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
     if (len < 2)
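Both the constructors and all four static reverse(...) helpers lose their leading Version parameter. A short sketch of the new call shapes; StandardTokenizer is used here only as a convenient token source:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

class ReverseSketch {
  static TokenStream wrap() {
    // Filter form: unmarked reversal (NOMARKER) via the one-arg constructor.
    return new ReverseStringFilter(new StandardTokenizer());
  }

  static String flip(String s) {
    // Static helper form: reverse("lucene") now returns "enecul" directly.
    return ReverseStringFilter.reverse(s);
  }
}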
@@ -40,7 +40,6 @@ public class ReverseStringFilterFactory extends TokenFilterFactory {
   /** Creates a new ReverseStringFilterFactory */
   public ReverseStringFilterFactory(Map<String,String> args) {
     super(args);
-    assureMatchVersion();
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -48,7 +47,7 @@ public class ReverseStringFilterFactory extends TokenFilterFactory {
 
   @Override
   public ReverseStringFilter create(TokenStream in) {
-    return new ReverseStringFilter(luceneMatchVersion,in);
+    return new ReverseStringFilter(in);
   }
 }
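For the factory, dropping assureMatchVersion() means an empty argument map is now sufficient. A minimal sketch:

import java.util.HashMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizer;

class ReverseFactorySketch {
  static TokenStream build() {
    // No luceneMatchVersion entry is required in the args map anymore.
    ReverseStringFilterFactory factory =
        new ReverseStringFilterFactory(new HashMap<String,String>());
    return factory.create(new StandardTokenizer());
  }
}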
@@ -31,7 +31,6 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.RomanianStemmer;
 
 /**
@@ -78,18 +77,17 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
-  public RomanianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public RomanianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    */
-  public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords) {
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public RomanianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
@@ -97,14 +95,12 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
    * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
-   * @param matchVersion lucene compatibility version
    * @param stopwords a stopword set
    * @param stemExclusionSet a set of terms not to be stemmed
    */
-  public RomanianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
-        matchVersion, stemExclusionSet));
+  public RomanianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -121,10 +117,10 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new RomanianStemmer());
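Construction follows the same pattern as the other language analyzers: the Version parameter vanishes from every overload, and callers who need an older release's behavior use the setVersion(Version) setter this change introduces instead. A sketch; Version.LUCENE_4_9 is an assumed constant name for illustration, not something this hunk shows:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.util.Version;

class RomanianSketch {
  static Analyzer build() {
    Analyzer analyzer = new RomanianAnalyzer(); // default stop words
    // Optional back-compat: replicate a specific release's analysis chain.
    analyzer.setVersion(Version.LUCENE_4_9);
    return analyzer;
  }
}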
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Russian language.
@@ -54,7 +53,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
     static {
       try {
         DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+            DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
@@ -74,34 +73,30 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
     return DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
-  public RussianAnalyzer(Version matchVersion) {
-    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+  public RussianAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    */
-  public RussianAnalyzer(Version matchVersion, CharArraySet stopwords){
-    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+  public RussianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
   }
 
   /**
    * Builds an analyzer with the given stop words
    *
-   * @param matchVersion
-   *          lucene compatibility version
    * @param stopwords
    *          a stopword set
    * @param stemExclusionSet a set of words not to be stemmed
    */
-  public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
-    super(matchVersion, stopwords);
-    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+  public RussianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -117,10 +112,10 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
   */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer source = new StandardTokenizer(matchVersion);
-    TokenStream result = new StandardFilter(matchVersion, source);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopwords);
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(result, stopwords);
     if (!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
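CharArraySet lost its Version parameter in the same sweep (the two-argument constructor is visible in the QueryAutoStopWordAnalyzer hunk above), so custom stop sets are now built from the words alone. A sketch with two illustrative Russian stopwords:

import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;

class RussianSketch {
  static Analyzer build() {
    // Second argument: true means stopwords are matched case-insensitively.
    CharArraySet stopwords = new CharArraySet(Arrays.asList("и", "в"), true);
    return new RussianAnalyzer(stopwords);
  }
}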
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.shingle;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.AnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.util.Version;
 
 /**
  * A ShingleAnalyzerWrapper wraps a {@link ShingleFilter} around another {@link Analyzer}.
@@ -101,15 +100,15 @@ public final class ShingleAnalyzerWrapper extends AnalyzerWrapper {
   /**
    * Wraps {@link StandardAnalyzer}.
    */
-  public ShingleAnalyzerWrapper(Version matchVersion) {
-    this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
+  public ShingleAnalyzerWrapper() {
+    this(ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
   }
 
   /**
    * Wraps {@link StandardAnalyzer}.
    */
-  public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) {
-    this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize);
+  public ShingleAnalyzerWrapper(int minShingleSize, int maxShingleSize) {
+    this(new StandardAnalyzer(), minShingleSize, maxShingleSize);
   }
 
   /**
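The shingle configuration itself is untouched; only the Version plumbing to the wrapped StandardAnalyzer goes away. A sketch producing 2- and 3-word shingles; the sizes are an illustrative choice, not defaults changed by this patch:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

class ShingleSketch {
  static Analyzer build() {
    // Wraps a default StandardAnalyzer and emits word n-grams of size 2..3.
    return new ShingleAnalyzerWrapper(2, 3);
  }
}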
@@ -17,16 +17,14 @@ package org.apache.lucene.analysis.standard;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
-import org.apache.lucene.util.Version;
 
-import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
 
@@ -35,18 +33,6 @@ import java.io.Reader;
  * LowerCaseFilter} and {@link StopFilter}, using a list of
  * English stop words.
  *
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating ClassicAnalyzer:
- * <ul>
- *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
- *        supplementary characters in stopwords
- *   <li> As of 2.9, StopFilter preserves position
- *        increments
- *   <li> As of 2.4, Tokens incorrectly identified as acronyms
- *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
- * </ul>
- *
  * ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
  * As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,
  * as specified by UAX#29.
@@ -63,29 +49,23 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
   public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
 
   /** Builds an analyzer with the given stop words.
-   * @param matchVersion Lucene version to match See {@link
-   * <a href="#version">above</a>}
    * @param stopWords stop words */
-  public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) {
-    super(matchVersion, stopWords);
+  public ClassicAnalyzer(CharArraySet stopWords) {
+    super(stopWords);
   }
 
   /** Builds an analyzer with the default stop words ({@link
    * #STOP_WORDS_SET}).
-   * @param matchVersion Lucene version to match See {@link
-   * <a href="#version">above</a>}
    */
-  public ClassicAnalyzer(Version matchVersion) {
-    this(matchVersion, STOP_WORDS_SET);
+  public ClassicAnalyzer() {
+    this(STOP_WORDS_SET);
   }
 
   /** Builds an analyzer with the stop words from the given reader.
-   * @see WordlistLoader#getWordSet(Reader, Version)
-   * @param matchVersion Lucene version to match See {@link
-   * <a href="#version">above</a>}
+   * @see WordlistLoader#getWordSet(Reader)
    * @param stopwords Reader to read stop words from */
-  public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
-    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
+  public ClassicAnalyzer(Reader stopwords) throws IOException {
+    this(loadStopwordSet(stopwords));
   }
 
   /**
@@ -107,11 +87,11 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
 
   @Override
   protected TokenStreamComponents createComponents(final String fieldName) {
-    final ClassicTokenizer src = new ClassicTokenizer(matchVersion);
+    final ClassicTokenizer src = new ClassicTokenizer();
     src.setMaxTokenLength(maxTokenLength);
     TokenStream tok = new ClassicFilter(src);
-    tok = new LowerCaseFilter(matchVersion, tok);
-    tok = new StopFilter(matchVersion, tok, stopwords);
+    tok = new LowerCaseFilter(tok);
+    tok = new StopFilter(tok, stopwords);
     return new TokenStreamComponents(src, tok) {
       @Override
       protected void setReader(final Reader reader) throws IOException {
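The Reader-based overload keeps its shape minus the version, loading stop words through the single-argument WordlistLoader.getWordSet(Reader) referenced in the updated @see tag. A sketch with an inline word list; the words themselves are illustrative:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;

class ClassicSketch {
  static Analyzer build() throws IOException {
    // One stop word per line, as WordlistLoader expects.
    return new ClassicAnalyzer(new StringReader("the\nand\nof"));
  }
}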
@@ -18,7 +18,6 @@
 package org.apache.lucene.analysis.standard;
 
 import java.io.IOException;
-import java.io.Reader;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -26,8 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeFactory;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.Version;
 
 /** A grammar-based tokenizer constructed with JFlex
  *
@@ -102,19 +99,19 @@ public final class ClassicTokenizer extends Tokenizer {
   *
   * See http://issues.apache.org/jira/browse/LUCENE-1068
   */
-  public ClassicTokenizer(Version matchVersion) {
-    init(matchVersion);
+  public ClassicTokenizer() {
+    init();
   }
 
   /**
   * Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
   */
-  public ClassicTokenizer(Version matchVersion, AttributeFactory factory) {
+  public ClassicTokenizer(AttributeFactory factory) {
     super(factory);
-    init(matchVersion);
+    init();
   }
 
-  private void init(Version matchVersion) {
+  private void init() {
     this.scanner = new ClassicTokenizerImpl(input);
   }
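As with the analyzers, the tokenizer is now constructed bare; input arrives later via Tokenizer.setReader(). A sketch; the 255-character cap is an illustrative value, not a default this hunk changes:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.ClassicTokenizer;

class ClassicTokenizerSketch {
  static ClassicTokenizer build() throws IOException {
    ClassicTokenizer tokenizer = new ClassicTokenizer();
    tokenizer.setMaxTokenLength(255);
    tokenizer.setReader(new StringReader("Classic tokenizer demo"));
    return tokenizer;
  }
}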