LUCENE-4822: Add PatternKeywordTokenFilter to mark keywords based on regular expressions

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455321 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2013-03-11 20:38:35 +00:00
parent a5c4101d9b
commit 45e65d12b8
91 changed files with 380 additions and 259 deletions

View File

@ -27,6 +27,12 @@ Changes in backwards compatibility policy
multiple ngrams derived from the same input token. (Walter Underwood
via Mike McCandless)
* LUCENE-4822: KeywordTokenFilter is now an abstract class. Subclasses
need to implement #isKeyword() in order to mark terms as keywords.
The existing functionality has been factored out into a new
SetKeywordTokenFilter class. (Simon Willnauer, Uwe Schindler)
New Features
* LUCENE-4815: DrillSideways now allows more than one FacetRequest per
@ -41,6 +47,9 @@ New Features
once as a keyword and once as an ordinary token allow stemmers to emit
a stemmed version along with the un-stemmed version. (Simon Willnauer)
* LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based
on regular expressions. (Simon Willnauer, Uwe Schindler)
======================= Lucene 4.2.0 =======================
Changes in backwards compatibility policy

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
@ -107,7 +107,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* {@link ArabicStemFilter}.
*
* @param matchVersion
@ -131,7 +131,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
* {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
*/
@Override
@ -144,7 +144,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
// TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
result = new ArabicNormalizationFilter(result);
if(!stemExclusionSet.isEmpty()) {
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new ArabicStemFilter(result));
}

View File

@ -19,7 +19,7 @@ package org.apache.lucene.analysis.ar;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -29,10 +29,10 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerFilter */
* @see SetKeywordMarkerFilter */
public final class ArabicStemFilter extends TokenFilter {
private final ArabicStemmer stemmer = new ArabicStemmer();

View File

@ -24,7 +24,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@ -97,7 +97,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words and a stem exclusion set.
* If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter}
* If a stem exclusion set is provided this analyzer will add a {@link SetKeywordMarkerFilter}
* before {@link BulgarianStemFilter}.
*/
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
@ -114,7 +114,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link BulgarianStemFilter}.
*/
@Override
@ -124,7 +124,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new BulgarianStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -19,7 +19,7 @@ package org.apache.lucene.analysis.bg;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -25,7 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -131,7 +131,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
result = new StandardFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(excltable != null && !excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
result = new SetKeywordMarkerFilter(result, excltable);
return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
}

View File

@ -22,7 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,10 +30,10 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* A {@link TokenFilter} that applies {@link BrazilianStemmer}.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerFilter
* @see SetKeywordMarkerFilter
*
*/
public final class BrazilianStemFilter extends TokenFilter {

View File

@ -24,7 +24,7 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -97,7 +97,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -119,7 +119,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
* {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -131,7 +131,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new CatalanStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -52,13 +52,7 @@ public final class LowerCaseFilter extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
final char[] buffer = termAtt.buffer();
final int length = termAtt.length();
for (int i = 0; i < length;) {
i += Character.toChars(
Character.toLowerCase(
charUtils.codePointAt(buffer, i)), buffer, i);
}
charUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;

View File

@ -20,7 +20,7 @@ package org.apache.lucene.analysis.cz;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@ -115,7 +115,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
* , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
* a stem exclusion set is provided via
* {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
* {@link KeywordMarkerFilter} is added before
* {@link SetKeywordMarkerFilter} is added before
* {@link CzechStemFilter}.
*/
@Override
@ -126,7 +126,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
if(!this.stemExclusionTable.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionTable);
result = new SetKeywordMarkerFilter(result, stemExclusionTable);
result = new CzechStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -2,7 +2,7 @@ package org.apache.lucene.analysis.cz;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -29,12 +29,12 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* <p><b>NOTE</b>: Input is expected to be in lowercase,
* but with diacritical marks</p>
* @see KeywordMarkerFilter
* @see SetKeywordMarkerFilter
*/
public final class CzechStemFilter extends TokenFilter {
private final CzechStemmer stemmer = new CzechStemmer();

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -91,7 +91,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -113,7 +113,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -124,7 +124,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new DanishStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -24,7 +24,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -129,7 +129,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided, {@link GermanNormalizationFilter} and {@link GermanLightStemFilter}
*/
@Override
@ -139,7 +139,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerFilter(result, exclusionSet);
result = new SetKeywordMarkerFilter(result, exclusionSet);
result = new GermanNormalizationFilter(result);
result = new GermanLightStemFilter(result);
return new TokenStreamComponents(source, result);

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -34,10 +34,10 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* </p>
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerFilter
* @see SetKeywordMarkerFilter
*/
public final class GermanStemFilter extends TokenFilter
{

View File

@ -19,7 +19,7 @@ package org.apache.lucene.analysis.el;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* <p>

View File

@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -73,7 +73,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -96,7 +96,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link EnglishPossessiveFilter},
* {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link PorterStemFilter}.
*/
@Override
@ -108,7 +108,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new PorterStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* English words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -90,7 +90,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -112,7 +112,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SpanishLightStemFilter}.
*/
@Override
@ -123,7 +123,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SpanishLightStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -89,7 +89,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -111,7 +111,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -122,7 +122,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new BasqueStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -91,7 +91,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -113,7 +113,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -124,7 +124,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new FinnishStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -20,7 +20,7 @@ package org.apache.lucene.analysis.fr;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -133,7 +133,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link ElisionFilter},
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link KeywordMarkerFilter} if a stem exclusion set is
* {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided, and {@link FrenchLightStemFilter}
*/
@Override
@ -145,7 +145,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
result = new SetKeywordMarkerFilter(result, excltable);
result = new FrenchLightStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -23,7 +23,7 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -107,7 +107,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -129,7 +129,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link IrishLowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -144,7 +144,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
result = new IrishLowerCaseFilter(result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new IrishStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@ -89,7 +89,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -111,7 +111,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link GalicianStemFilter}.
*/
@Override
@ -122,7 +122,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new GalicianStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* Galician words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* Galician words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -20,7 +20,7 @@ package org.apache.lucene.analysis.hi;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
@ -111,7 +111,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
* {@link HindiNormalizationFilter}, {@link KeywordMarkerFilter}
* {@link HindiNormalizationFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided, {@link HindiStemFilter}, and
* Hindi Stop words
*/
@ -121,7 +121,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
if (!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new IndicNormalizationFilter(result);
result = new HindiNormalizationFilter(result);
result = new StopFilter(matchVersion, result, stopwords);

View File

@ -19,7 +19,7 @@ package org.apache.lucene.analysis.hi;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -31,7 +31,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* <p>
* In some cases the normalization may cause unrelated terms to conflate, so
* to prevent terms from being normalized use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see HindiNormalizer

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -91,7 +91,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -113,7 +113,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -124,7 +124,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new HungarianStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* Hungarian words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -175,10 +175,7 @@ public class HunspellStemmer {
@SuppressWarnings("unchecked")
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
if(dictionary.isIgnoreCase()) {
for(int i=0;i<strippedWord.length;){
i += Character.toChars(
Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
}
charUtils.toLowerCase(strippedWord, 0, strippedWord.length);
}
segment.setLength(0);
segment.append(strippedWord, 0, length);

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -89,7 +89,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -111,7 +111,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -122,7 +122,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new ArmenianStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -22,7 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@ -87,7 +87,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
/**
 * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* {@link IndonesianStemFilter}.
*
* @param matchVersion
@ -111,7 +111,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerFilter}
* {@link StopFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided and {@link IndonesianStemFilter}.
*/
@Override
@ -122,7 +122,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty()) {
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new IndonesianStemFilter(result));
}

View File

@ -24,7 +24,7 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -99,7 +100,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -121,7 +122,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link ItalianLightStemFilter}.
*/
@Override
@ -133,7 +134,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new ItalianLightStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@ -89,7 +89,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -111,7 +111,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link LatvianStemFilter}.
*/
@Override
@ -122,7 +122,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new LatvianStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -22,41 +22,28 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* Marks terms as keywords via the {@link KeywordAttribute}. Each token
* contained in the provided is marked as a keyword by setting
* {@link KeywordAttribute#setKeyword(boolean)} to <code>true</code>.
* Marks terms as keywords via the {@link KeywordAttribute}.
*
* @see KeywordAttribute
*/
public final class KeywordMarkerFilter extends TokenFilter {
public abstract class KeywordMarkerFilter extends TokenFilter {
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final CharArraySet keywordSet;
/**
* Create a new KeywordMarkerFilter, that marks the current token as a
* keyword if the tokens term buffer is contained in the given set via the
* {@link KeywordAttribute}.
*
* @param in
* TokenStream to filter
* @param keywordSet
* the keywords set to lookup the current termbuffer
* Creates a new {@link KeywordMarkerFilter}
* @param in the input stream
*/
public KeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) {
protected KeywordMarkerFilter(TokenStream in) {
super(in);
this.keywordSet = keywordSet;
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (keywordSet.contains(termAtt.buffer(), 0, termAtt.length())) {
if (isKeyword()) {
keywordAttr.setKeyword(true);
}
return true;
@ -64,4 +51,7 @@ public final class KeywordMarkerFilter extends TokenFilter {
return false;
}
}
protected abstract boolean isKeyword();
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.miscellaneous;
*/
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.*;
@ -29,23 +30,30 @@ import org.apache.lucene.analysis.TokenStream;
* &lt;fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" ignoreCase="false"/&gt;
* &lt;filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" pattern="^.+er$" ignoreCase="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
public class KeywordMarkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public static final String PATTERN = "pattern";
private CharArraySet protectedWords;
private boolean ignoreCase;
private Pattern pattern;
@Override
public void inform(ResourceLoader loader) throws IOException {
String wordFiles = args.get(PROTECTED_TOKENS);
String stringPattern = args.get(PATTERN);
ignoreCase = getBoolean("ignoreCase", false);
if (wordFiles != null) {
protectedWords = getWordSet(loader, wordFiles, ignoreCase);
}
if (stringPattern != null) {
pattern = ignoreCase ? Pattern.compile(stringPattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE) : Pattern.compile(stringPattern);
}
}
public boolean isIgnoreCase() {
@ -54,6 +62,12 @@ public class KeywordMarkerFilterFactory extends TokenFilterFactory implements Re
@Override
public TokenStream create(TokenStream input) {
return protectedWords == null ? input : new KeywordMarkerFilter(input, protectedWords);
if (pattern != null) {
input = new PatternKeywordMarkerFilter(input, pattern);
}
if (protectedWords != null) {
input = new SetKeywordMarkerFilter(input, protectedWords);
}
return input;
}
}

View File

@ -20,7 +20,7 @@ package org.apache.lucene.analysis.nl;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
@ -129,7 +129,7 @@ public final class DutchAnalyzer extends Analyzer {
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided,
* {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
* {@link StemmerOverrideFilter}, and {@link SnowballFilter}
*/
@Override
@ -140,7 +140,7 @@ public final class DutchAnalyzer extends Analyzer {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stoptable);
if (!excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
result = new SetKeywordMarkerFilter(result, excltable);
if (!stemdict.isEmpty())
result = new StemmerOverrideFilter(matchVersion, result, stemdict);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -91,7 +91,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -113,7 +113,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -124,7 +124,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new NorwegianStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -90,7 +90,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -112,7 +112,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link PortugueseLightStemFilter}.
*/
@Override
@ -123,7 +123,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new PortugueseLightStemFilter(result);
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* Portuguese words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* Portuguese words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* Portuguese words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -94,7 +94,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -116,7 +116,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -127,7 +127,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new RomanianStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -29,7 +29,7 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.IOUtils;
@ -111,7 +111,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided, and {@link SnowballFilter}
*/
@Override
@ -122,7 +122,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if (!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -20,7 +20,7 @@ package org.apache.lucene.analysis.snowball;
import java.util.Map;
import java.io.IOException;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -74,7 +74,7 @@ public class SnowballPorterFilterFactory extends TokenFilterFactory implements R
}
if (protectedWords != null)
input = new KeywordMarkerFilter(input, protectedWords);
input = new SetKeywordMarkerFilter(input, protectedWords);
return new SnowballFilter(input, program);
}
}

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -91,7 +91,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -113,7 +113,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SnowballFilter}.
*/
@Override
@ -124,7 +124,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new SwedishStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -22,7 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -93,7 +93,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -115,7 +115,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link TurkishLowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerFilter} if a stem
* {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem
* exclusion set is provided and {@link SnowballFilter}.
*/
@Override
@ -126,7 +126,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
result = new TurkishLowerCaseFilter(result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new TurkishStemmer());
return new TokenStreamComponents(source, result);
}

View File

@ -215,12 +215,9 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
* The user should never modify this text array after calling this method.
*/
public V put(char[] text, V value) {
if (ignoreCase)
for(int i=0;i<text.length;){
i += Character.toChars(
Character.toLowerCase(
charUtils.codePointAt(text, i)), text, i);
}
if (ignoreCase) {
charUtils.toLowerCase(text, 0, text.length);
}
int slot = getSlot(text, 0, text.length);
if (keys[slot] != null) {
final V oldValue = values[slot];

View File

@ -131,7 +131,25 @@ public abstract class CharacterUtils {
}
return new CharacterBuffer(new char[bufferSize], 0, 0);
}
/**
 * Lowercases each Unicode codepoint in the given buffer in place, via
 * {@link Character#toLowerCase(int)}, starting at {@code offset} and ending
 * just before {@code limit}. Surrogate pairs are handled as single
 * codepoints via {@link #codePointAt(char[], int)} / {@link Character#toChars(int, char[], int)}.
 *
 * @param buffer the char buffer to lowercase in place
 * @param offset the index of the first char to process (inclusive)
 * @param limit  the index one past the last char to process (exclusive)
 */
public void toLowerCase(final char[] buffer, final int offset, final int limit) {
  assert buffer.length >= limit;
  // NOTE: was "offset <= 0", which rejects every positive offset under -ea;
  // the intended precondition is a non-negative, in-range start index.
  assert offset >= 0 && offset <= buffer.length;
  for (int i = offset; i < limit;) {
    // toChars writes the lowercased codepoint back in place and returns
    // its length in chars (1 or 2), advancing i past surrogate pairs.
    i += Character.toChars(
            Character.toLowerCase(
                codePointAt(buffer, i)), buffer, i);
  }
}
/**
* Fills the {@link CharacterBuffer} with characters read from the given
* reader {@link Reader}. This method tries to read as many characters into

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
/**
@ -124,7 +124,7 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
set.add("ساهدهات");
MockTokenizer tokenStream = new MockTokenizer(new StringReader("ساهدهات"), MockTokenizer.WHITESPACE, false);
ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerFilter(tokenStream, set));
ArabicStemFilter filter = new ArabicStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
assertTokenStreamContents(filter, new String[]{"ساهدهات"});
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
@ -222,7 +222,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
MockTokenizer tokenStream = new MockTokenizer(new StringReader("строевете строеве"), MockTokenizer.WHITESPACE, false);
BulgarianStemFilter filter = new BulgarianStemFilter(
new KeywordMarkerFilter(tokenStream, set));
new SetKeywordMarkerFilter(tokenStream, set));
assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
/**
@ -147,7 +147,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("Brasília");
BrazilianStemFilter filter = new BrazilianStemFilter(
new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
"Brasília Brasilia")), set));
assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}

View File

@ -39,6 +39,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -88,7 +90,9 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
static {
Collections.<Class<?>>addAll(oddlyNamedComponents,
ReversePathHierarchyTokenizer.class, // this is supported via an option to PathHierarchyTokenizer's factory
SnowballFilter.class // this is called SnowballPorterFilterFactory
SnowballFilter.class, // this is called SnowballPorterFilterFactory
PatternKeywordMarkerFilter.class,
SetKeywordMarkerFilter.class
);
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
/**
@ -281,7 +281,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("hole");
CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerFilter(
CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}

View File

@ -23,7 +23,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
@ -38,7 +38,7 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("fischen");
GermanStemFilter filter = new GermanStemFilter(
new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
"Fischen Trinken")), set));
assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
}
};

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -62,7 +62,7 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
}
};

View File

@ -28,7 +28,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -68,7 +68,7 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GermanStemFilter(sink));
}
};

View File

@ -23,7 +23,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
@ -57,7 +57,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("yourselves");
Tokenizer tokenizer = new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));
TokenStream filter = new PorterStemFilter(new SetKeywordMarkerFilter(tokenizer, set));
assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
}
};

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -185,7 +185,7 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
}
};

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -64,7 +64,7 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
}
};

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
/**
@ -59,7 +59,7 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
}
};

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
}
};

View File

@ -28,7 +28,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@ -63,7 +63,7 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
// assert with keyword marker
tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY);
filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY);
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}

View File

@ -3,6 +3,7 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
@ -36,34 +37,68 @@ import org.junit.Test;
public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {
@Test
public void testIncrementToken() throws IOException {
public void testSetFilterIncrementToken() throws IOException {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 5, true);
set.add("lucenefox");
String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
"jumps" };
assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
new SetKeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
CharArraySet mixedCaseSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("LuceneFox"), false);
assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
new SetKeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), mixedCaseSet)), output);
CharArraySet set2 = set;
assertTokenStreamContents(new LowerCaseFilterMock(
new KeywordMarkerFilter(new MockTokenizer(new StringReader(
new SetKeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
}
@Test
public void testPatternFilterIncrementToken() throws IOException {
String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
"jumps" };
assertTokenStreamContents(new LowerCaseFilterMock(
new PatternKeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), Pattern.compile("[a-zA-Z]+[fF]ox"))), output);
output = new String[] { "the", "quick", "brown", "lucenefox",
"jumps" };
assertTokenStreamContents(new LowerCaseFilterMock(
new PatternKeywordMarkerFilter(new MockTokenizer(new StringReader(
"The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), Pattern.compile("[a-zA-Z]+[f]ox"))), output);
}
// LUCENE-2901
public void testComposition() throws Exception {
TokenStream ts = new LowerCaseFilterMock(
new KeywordMarkerFilter(
new KeywordMarkerFilter(
new SetKeywordMarkerFilter(
new SetKeywordMarkerFilter(
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
new CharArraySet(TEST_VERSION_CURRENT, asSet("Birds", "Houses"), false)),
new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false)));
assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
ts = new LowerCaseFilterMock(
new PatternKeywordMarkerFilter(
new PatternKeywordMarkerFilter(
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
Pattern.compile("Birds|Houses")),
Pattern.compile("Dogs|Trees")));
assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
ts = new LowerCaseFilterMock(
new SetKeywordMarkerFilter(
new PatternKeywordMarkerFilter(
new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
Pattern.compile("Birds|Houses")),
new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false)));
assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
}
public static final class LowerCaseFilterMock extends TokenFilter {

View File

@ -35,6 +35,7 @@ import org.apache.lucene.analysis.util.StringMockResourceLoader;
* Simple tests to ensure the keyword marker filter factory is working.
*/
public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase {
public void testKeywords() throws IOException {
Reader reader = new StringReader("dogs cats");
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
@ -48,6 +49,36 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase {
TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
assertTokenStreamContents(ts, new String[] { "dog", "cats" });
reader = new StringReader("dogs cats");
tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
factory = new KeywordMarkerFilterFactory();
args = new HashMap<String,String>();
args.put("pattern", "cats|Dogs");
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
factory.init(args);
factory.inform(null);
ts = new PorterStemFilter(factory.create(tokenizer));
assertTokenStreamContents(ts, new String[] { "dog", "cats" });
}
/**
 * Verifies that a protected-words file and a "pattern" argument can be
 * combined in one KeywordMarkerFilterFactory: "cats" is protected via the
 * resource loader, "birds" matches the regex, so both survive stemming,
 * while "dogs" (not matched — the pattern "birds|Dogs" is case-sensitive
 * here) is still stemmed to "dog".
 */
public void testKeywordsMixed() throws IOException {
Reader reader = new StringReader("dogs cats birds");
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
Map<String,String> args = new HashMap<String,String>();
// Simulated protwords.txt containing the single entry "cats".
ResourceLoader loader = new StringMockResourceLoader("cats");
args.put("protected", "protwords.txt");
args.put("pattern", "birds|Dogs");
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
factory.init(args);
factory.inform(loader);
TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
// Only the unprotected, unmatched "dogs" gets stemmed.
assertTokenStreamContents(ts, new String[] { "dog", "cats", "birds" });
}
public void testKeywordsCaseInsensitive() throws IOException {
@ -64,5 +95,36 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase {
TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
reader = new StringReader("dogs cats Cats");
tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
factory = new KeywordMarkerFilterFactory();
args = new HashMap<String,String>();
args.put("pattern", "Cats");
args.put("ignoreCase", "true");
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
factory.init(args);
factory.inform(null);
ts = new PorterStemFilter(factory.create(tokenizer));
assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
}
/**
 * Same mixed set/pattern scenario as above, but with ignoreCase=true:
 * the pattern "birds" now protects both "Birds" and "birds", and the
 * protected-words entry "cats" protects both "cats" and "Cats", leaving
 * only "dogs" to be stemmed to "dog".
 */
public void testKeywordsCaseInsensitiveMixed() throws IOException {
Reader reader = new StringReader("dogs cats Cats Birds birds");
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
Map<String,String> args = new HashMap<String,String>();
// Simulated protwords.txt containing the single entry "cats".
ResourceLoader loader = new StringMockResourceLoader("cats");
args.put("protected", "protwords.txt");
args.put("pattern", "birds");
args.put("ignoreCase", "true");
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
factory.init(args);
factory.inform(loader);
TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats", "Birds", "birds" });
}
}

View File

@ -28,7 +28,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -57,7 +57,7 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so NorwegianLightStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
}
};

View File

@ -28,7 +28,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -57,7 +57,7 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so NorwegianMinimalStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
}
};

View File

@ -28,7 +28,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
@ -103,7 +103,7 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so PortugueseLightStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
}
};

View File

@ -28,7 +28,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
@ -77,7 +77,7 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so PortugueseMinimalStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
}
};

View File

@ -29,7 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
@ -76,7 +76,7 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so PortugueseStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
}
};

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so RussianLightStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
}
};

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so SwedishLightStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
}
};

View File

@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@ -32,7 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* This acts as a lemmatizer for verbs and adjectives.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/

View File

@ -35,7 +35,7 @@ import java.io.IOException;
* </p>
* <p>
* In order to prevent terms from being stemmed, use an instance of
* {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter}
* {@link org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter}
* or a custom {@link TokenFilter} that sets the {@link KeywordAttribute}
* before this {@link TokenStream}.
* </p>

View File

@ -25,7 +25,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
@ -49,7 +49,7 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Tokenize with the Japanese (Kuromoji) tokenizer, then mark every term
// in exclusionSet as a keyword so JapaneseBaseFormFilter skips them.
Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
}
};

View File

@ -23,7 +23,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
@ -70,7 +70,7 @@ public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// Whitespace-tokenize, then mark every term in exclusionSet as a keyword
// so JapaneseKatakanaStemFilter leaves those terms unstemmed.
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
}
};

View File

@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@ -112,7 +112,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
@ -135,7 +135,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
* , {@link KeywordMarkerFilter} if a stem exclusion set is
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link StempelFilter}.
*/
@Override
@ -146,7 +146,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new StempelFilter(result, new StempelStemmer(stemTable));
return new TokenStreamComponents(source, result);
}