mirror of
https://github.com/apache/lucene.git
synced 2025-02-17 15:35:20 +00:00
LUCENE-4822: Add PatternKeywordMarkerFilter to mark keywords based on regular expressions
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455321 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent a5c4101d9b
commit 45e65d12b8
@@ -27,6 +27,12 @@ Changes in backwards compatibility policy
   multiple ngrams derived from the same input token. (Walter Underwood
   via Mike McCandless)
 
+* LUCENE-4822: KeywordMarkerFilter is now an abstract class. Subclasses
+  need to implement #isKeyword() in order to mark terms as keywords.
+  The existing functionality has been factored out into a new
+  SetKeywordMarkerFilter class. (Simon Willnauer, Uwe Schindler)
+
 New Features
 
 * LUCENE-4815: DrillSideways now allows more than one FacetRequest per
@@ -41,6 +47,9 @@ New Features
   once as a keyword and once as an ordinary token allow stemmers to emit
   a stemmed version along with the un-stemmed version. (Simon Willnauer)
 
+* LUCENE-4822: PatternKeywordMarkerFilter can mark tokens as keywords based
+  on regular expressions. (Simon Willnauer, Uwe Schindler)
+
 ======================= Lucene 4.2.0 =======================
 
 Changes in backwards compatibility policy
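For context, a minimal usage sketch of the pattern-based marker named in the entry above. The snippet is not part of this commit's diff; the constructor shape (a TokenStream plus a java.util.regex.Pattern) is assumed from the factored-out marker filters, and the part-number regex is purely illustrative:

    import java.io.Reader;
    import java.util.regex.Pattern;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.en.PorterStemFilter;
    import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    class PatternMarkerExample {
      // Tokens matching the (hypothetical) part-number pattern get their
      // KeywordAttribute set, so the stemmer below leaves them intact.
      TokenStream chain(Reader reader) {
        Tokenizer source = new StandardTokenizer(Version.LUCENE_42, reader);
        TokenStream result = new PatternKeywordMarkerFilter(source, Pattern.compile("[A-Z]{2}\\d{4}"));
        return new PorterStemFilter(result); // stems "running", skips "AB1234"
      }
    }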
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
@@ -107,7 +107,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop word. If a none-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * {@link ArabicStemFilter}.
    * 
    * @param matchVersion
@@ -131,7 +131,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link LowerCaseFilter}, {@link StopFilter},
-  *         {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
+  *         {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
   *         if a stem exclusion set is provided and {@link ArabicStemFilter}.
   */
  @Override
@@ -144,7 +144,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
     // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
     result = new ArabicNormalizationFilter(result);
     if(!stemExclusionSet.isEmpty()) {
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     return new TokenStreamComponents(source, new ArabicStemFilter(result));
   }
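Every analyzer and stem filter below receives the same mechanical rename. As a usage sketch of the pattern they all implement, using the SetKeywordMarkerFilter(TokenStream, CharArraySet) signature visible in the diff (the exclusion term is illustrative, not from the commit):

    import java.util.Arrays;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.util.Version;

    class StemExclusionExample {
      // Terms in the set get their KeywordAttribute set, so any
      // KeywordAttribute-aware stemmer downstream passes them through.
      TokenStream protect(TokenStream in) {
        CharArraySet stemExclusionSet = new CharArraySet(
            Version.LUCENE_42, Arrays.asList("lucene"), /*ignoreCase=*/ false);
        if (!stemExclusionSet.isEmpty()) {
          in = new SetKeywordMarkerFilter(in, stemExclusionSet);
        }
        return in;
      }
    }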
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.ar;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc @link
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -29,10 +29,10 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words..
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
- * @see KeywordMarkerFilter */
+ * @see SetKeywordMarkerFilter */
 
 public final class ArabicStemFilter extends TokenFilter {
   private final ArabicStemmer stemmer = new ArabicStemmer();
@@ -24,7 +24,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
@@ -97,7 +97,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words and a stem exclusion set.
-   * If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter}
+   * If a stem exclusion set is provided this analyzer will add a {@link SetKeywordMarkerFilter}
    * before {@link BulgarianStemFilter}.
    */
   public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
@@ -114,7 +114,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link BulgarianStemFilter}.
   */
  @Override
@@ -124,7 +124,7 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new BulgarianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.bg;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -25,7 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -131,7 +131,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
     result = new StandardFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(excltable != null && !excltable.isEmpty())
-      result = new KeywordMarkerFilter(result, excltable);
+      result = new SetKeywordMarkerFilter(result, excltable);
     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
   }
 }
@@ -22,7 +22,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,10 +30,10 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * A {@link TokenFilter} that applies {@link BrazilianStemmer}.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
- * @see KeywordMarkerFilter
+ * @see SetKeywordMarkerFilter
 * 
 */
 public final class BrazilianStemFilter extends TokenFilter {
@@ -24,7 +24,7 @@ import java.util.Arrays;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -97,7 +97,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -119,7 +119,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
-  *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@@ -131,7 +131,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new CatalanStemmer());
     return new TokenStreamComponents(source, result);
   }
@@ -52,13 +52,7 @@ public final class LowerCaseFilter extends TokenFilter {
   @Override
   public final boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      final char[] buffer = termAtt.buffer();
-      final int length = termAtt.length();
-      for (int i = 0; i < length;) {
-        i += Character.toChars(
-                Character.toLowerCase(
-                    charUtils.codePointAt(buffer, i)), buffer, i);
-      }
+      charUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
       return true;
     } else
       return false;
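This hunk, like the HunspellStemmer hunk further down, replaces an inlined code-point loop with a single CharacterUtils call. A sketch of what such a helper must do, inferred from the deleted loop (supplementary characters occupy two char slots, so the index advances by however many chars Character.toChars wrote):

    // Hypothetical helper body; the real CharacterUtils.toLowerCase added by
    // this commit may differ in details.
    public void toLowerCase(final char[] buffer, final int offset, final int limit) {
      for (int i = offset; i < limit;) {
        i += Character.toChars(
            Character.toLowerCase(
                Character.codePointAt(buffer, i, limit)), buffer, i);
      }
    }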
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.cz;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
@@ -115,7 +115,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
   *         , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
   *         a stem exclusion set is provided via
   *         {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
-  *         {@link KeywordMarkerFilter} is added before
+  *         {@link SetKeywordMarkerFilter} is added before
   *         {@link CzechStemFilter}.
   */
  @Override
@@ -126,7 +126,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter( matchVersion, result, stopwords);
     if(!this.stemExclusionTable.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionTable);
+      result = new SetKeywordMarkerFilter(result, stemExclusionTable);
     result = new CzechStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -2,7 +2,7 @@ package org.apache.lucene.analysis.cz;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -29,12 +29,12 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * <p><b>NOTE</b>: Input is expected to be in lowercase,
 * but with diacritical marks</p>
- * @see KeywordMarkerFilter
+ * @see SetKeywordMarkerFilter
 */
 public final class CzechStemFilter extends TokenFilter {
   private final CzechStemmer stemmer = new CzechStemmer();
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -91,7 +91,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -113,7 +113,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@@ -124,7 +124,7 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new DanishStemmer());
     return new TokenStreamComponents(source, result);
   }
@@ -24,7 +24,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -129,7 +129,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided, {@link GermanNormalizationFilter} and {@link GermanLightStemFilter}
   */
  @Override
@@ -139,7 +139,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter( matchVersion, result, stopwords);
-    result = new KeywordMarkerFilter(result, exclusionSet);
+    result = new SetKeywordMarkerFilter(result, exclusionSet);
     result = new GermanNormalizationFilter(result);
     result = new GermanLightStemFilter(result);
     return new TokenStreamComponents(source, result);
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -34,10 +34,10 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * </p>
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
- * @see KeywordMarkerFilter
+ * @see SetKeywordMarkerFilter
 */
 public final class GermanStemFilter extends TokenFilter
 {
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.el;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * <p>
@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -73,7 +73,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -96,7 +96,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link PorterStemFilter}.
   */
  @Override
@@ -108,7 +108,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new PorterStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * English words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -90,7 +90,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -112,7 +112,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SpanishLightStemFilter}.
   */
  @Override
@@ -123,7 +123,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SpanishLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -89,7 +89,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -111,7 +111,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@@ -122,7 +122,7 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new BasqueStemmer());
     return new TokenStreamComponents(source, result);
   }
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -91,7 +91,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -113,7 +113,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@@ -124,7 +124,7 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new FinnishStemmer());
     return new TokenStreamComponents(source, result);
   }
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.fr;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -133,7 +133,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link ElisionFilter},
   *         {@link LowerCaseFilter}, {@link StopFilter},
-  *         {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided, and {@link FrenchLightStemFilter}
   */
  @Override
@@ -145,7 +145,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!excltable.isEmpty())
-      result = new KeywordMarkerFilter(result, excltable);
+      result = new SetKeywordMarkerFilter(result, excltable);
     result = new FrenchLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -23,7 +23,7 @@ import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -107,7 +107,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -129,7 +129,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link IrishLowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@@ -144,7 +144,7 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
     result = new IrishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new IrishStemmer());
     return new TokenStreamComponents(source, result);
   }
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
@@ -89,7 +89,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -111,7 +111,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link GalicianStemFilter}.
   */
  @Override
@@ -122,7 +122,7 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new GalicianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * Galician words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * Galician words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.hi;
 import java.io.IOException;
 import java.io.Reader;
 
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
@@ -111,7 +111,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
-  *         {@link HindiNormalizationFilter}, {@link KeywordMarkerFilter}
+  *         {@link HindiNormalizationFilter}, {@link SetKeywordMarkerFilter}
   *         if a stem exclusion set is provided, {@link HindiStemFilter}, and
   *         Hindi Stop words
   */
@@ -121,7 +121,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     if (!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new IndicNormalizationFilter(result);
     result = new HindiNormalizationFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.hi;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc @link
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -31,7 +31,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * <p>
 * In some cases the normalization may cause unrelated terms to conflate, so
 * to prevent terms from being normalized use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see HindiNormalizer
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -91,7 +91,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -113,7 +113,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@@ -124,7 +124,7 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new HungarianStemmer());
     return new TokenStreamComponents(source, result);
   }
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * Hungarian words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -175,10 +175,7 @@ public class HunspellStemmer {
   @SuppressWarnings("unchecked")
   public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
     if(dictionary.isIgnoreCase()) {
-      for(int i=0;i<strippedWord.length;){
-        i += Character.toChars(
-                Character.toLowerCase(charUtils.codePointAt(strippedWord, i)), strippedWord, i);
-      }
+      charUtils.toLowerCase(strippedWord, 0, strippedWord.length);
     }
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -89,7 +89,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -111,7 +111,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@@ -122,7 +122,7 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new ArmenianStemmer());
     return new TokenStreamComponents(source, result);
   }
@@ -22,7 +22,7 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
@@ -87,7 +87,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop word. If a none-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * {@link IndonesianStemFilter}.
    * 
    * @param matchVersion
@@ -111,7 +111,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter},
-  *         {@link StopFilter}, {@link KeywordMarkerFilter}
+  *         {@link StopFilter}, {@link SetKeywordMarkerFilter}
   *         if a stem exclusion set is provided and {@link IndonesianStemFilter}.
   */
  @Override
@@ -122,7 +122,7 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (!stemExclusionSet.isEmpty()) {
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     return new TokenStreamComponents(source, new IndonesianStemFilter(result));
   }
@@ -24,7 +24,8 @@ import java.util.Arrays;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -99,7 +100,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -121,7 +122,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link ItalianLightStemFilter}.
   */
  @Override
@@ -133,7 +134,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new ItalianLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
@@ -89,7 +89,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    * 
    * @param matchVersion lucene compatibility version
@@ -111,7 +111,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-  *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+  *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link LatvianStemFilter}.
   */
  @Override
@@ -122,7 +122,7 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new LatvianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -22,41 +22,28 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Marks terms as keywords via the {@link KeywordAttribute}. Each token
 * contained in the provided set is marked as a keyword by setting
 * {@link KeywordAttribute#setKeyword(boolean)} to <code>true</code>.
 * Marks terms as keywords via the {@link KeywordAttribute}.
 *
 * @see KeywordAttribute
 */
public final class KeywordMarkerFilter extends TokenFilter {
public abstract class KeywordMarkerFilter extends TokenFilter {

  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final CharArraySet keywordSet;

  /**
   * Create a new KeywordMarkerFilter, that marks the current token as a
   * keyword if the tokens term buffer is contained in the given set via the
   * {@link KeywordAttribute}.
   *
   * @param in
   *          TokenStream to filter
   * @param keywordSet
   *          the keywords set to lookup the current termbuffer
   * Creates a new {@link KeywordMarkerFilter}.
   * @param in the input stream
   */
  public KeywordMarkerFilter(final TokenStream in, final CharArraySet keywordSet) {
  protected KeywordMarkerFilter(TokenStream in) {
    super(in);
    this.keywordSet = keywordSet;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (keywordSet.contains(termAtt.buffer(), 0, termAtt.length())) {
      if (isKeyword()) {
        keywordAttr.setKeyword(true);
      }
      return true;
@ -64,4 +51,7 @@ public final class KeywordMarkerFilter extends TokenFilter {
      return false;
    }
  }

  protected abstract boolean isKeyword();

}
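
With KeywordMarkerFilter now abstract, a custom marking strategy only needs to implement isKeyword(). A minimal sketch of a subclass under the new contract — the class name and the length rule are made up for illustration and are not part of this patch:

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // Hypothetical subclass: marks every term shorter than four chars as a keyword.
    public final class ShortTermKeywordMarkerFilter extends KeywordMarkerFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

      public ShortTermKeywordMarkerFilter(TokenStream in) {
        super(in);
      }

      @Override
      protected boolean isKeyword() {
        // Called once per token; the base class sets the KeywordAttribute when this returns true.
        return termAtt.length() < 4;
      }
    }
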
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.miscellaneous;
 */

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.*;
@ -29,23 +30,30 @@ import org.apache.lucene.analysis.TokenStream;
 * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" ignoreCase="false"/>
 *     <filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" pattern="^.+er$" ignoreCase="false"/>
 *   </analyzer>
 * </fieldType></pre>
 *
 */
public class KeywordMarkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  public static final String PROTECTED_TOKENS = "protected";
  public static final String PATTERN = "pattern";
  private CharArraySet protectedWords;
  private boolean ignoreCase;
  private Pattern pattern;

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    String wordFiles = args.get(PROTECTED_TOKENS);
    String stringPattern = args.get(PATTERN);
    ignoreCase = getBoolean("ignoreCase", false);
    if (wordFiles != null) {
      protectedWords = getWordSet(loader, wordFiles, ignoreCase);
    }
    if (stringPattern != null) {
      pattern = ignoreCase ? Pattern.compile(stringPattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE) : Pattern.compile(stringPattern);
    }

  }

  public boolean isIgnoreCase() {
@ -54,6 +62,12 @@ public class KeywordMarkerFilterFactory extends TokenFilterFactory implements Re

  @Override
  public TokenStream create(TokenStream input) {
    return protectedWords == null ? input : new KeywordMarkerFilter(input, protectedWords);
    if (pattern != null) {
      input = new PatternKeywordMarkerFilter(input, pattern);
    }
    if (protectedWords != null) {
      input = new SetKeywordMarkerFilter(input, protectedWords);
    }
    return input;
  }
}
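
When both attributes are configured, create() now stacks the two marker filters, so a token is protected if it matches either the word file or the pattern. A sketch of programmatic use, mirroring the factory tests later in this patch (`loader` and `tokenizer` are assumed to be set up elsewhere):

    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("protected", "protwords.txt"); // resolved via the ResourceLoader in inform()
    args.put("pattern", "^.+er$");
    factory.setLuceneMatchVersion(Version.LUCENE_42);
    factory.init(args);
    factory.inform(loader);
    // The returned stream is the input wrapped in PatternKeywordMarkerFilter
    // and SetKeywordMarkerFilter, in that order.
    TokenStream ts = factory.create(tokenizer);
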
@ -20,7 +20,7 @@ package org.apache.lucene.analysis.nl;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
@ -129,7 +129,7 @@ public final class DutchAnalyzer extends Analyzer {
   *
   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided,
   *         {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
   *         {@link StemmerOverrideFilter}, and {@link SnowballFilter}
   */
  @Override
@ -140,7 +140,7 @@ public final class DutchAnalyzer extends Analyzer {
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stoptable);
    if (!excltable.isEmpty())
      result = new KeywordMarkerFilter(result, excltable);
      result = new SetKeywordMarkerFilter(result, excltable);
    if (!stemdict.isEmpty())
      result = new StemmerOverrideFilter(matchVersion, result, stemdict);
    result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -91,7 +91,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
@ -113,7 +113,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@ -124,7 +124,7 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new NorwegianStemmer());
    return new TokenStreamComponents(source, result);
  }
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -90,7 +90,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
@ -112,7 +112,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link PortugueseLightStemFilter}.
   */
  @Override
@ -123,7 +123,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new PortugueseLightStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * Portuguese words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * Portuguese words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * Portuguese words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -94,7 +94,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
@ -116,7 +116,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@ -127,7 +127,7 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new RomanianStemmer());
    return new TokenStreamComponents(source, result);
  }
@ -29,7 +29,7 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.IOUtils;
@ -111,7 +111,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided, and {@link SnowballFilter}
   */
  @Override
@ -122,7 +122,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
    return new TokenStreamComponents(source, result);
  }
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -20,7 +20,7 @@ package org.apache.lucene.analysis.snowball;
import java.util.Map;
import java.io.IOException;

import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -74,7 +74,7 @@ public class SnowballPorterFilterFactory extends TokenFilterFactory implements R
  }

    if (protectedWords != null)
      input = new KeywordMarkerFilter(input, protectedWords);
      input = new SetKeywordMarkerFilter(input, protectedWords);
    return new SnowballFilter(input, program);
  }
}
@ -23,7 +23,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -91,7 +91,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
@ -113,7 +113,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
   *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
   *         provided and {@link SnowballFilter}.
   */
  @Override
@ -124,7 +124,7 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new SwedishStemmer());
    return new TokenStreamComponents(source, result);
  }
@ -21,7 +21,7 @@ import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
@ -22,7 +22,7 @@ import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -93,7 +93,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
@ -115,7 +115,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from an {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link TurkishLowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem
   *         {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem
   *         exclusion set is provided and {@link SnowballFilter}.
   */
  @Override
@ -126,7 +126,7 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
    result = new TurkishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new TurkishStemmer());
    return new TokenStreamComponents(source, result);
  }
@ -215,12 +215,9 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
   * The user should never modify this text array after calling this method.
   */
  public V put(char[] text, V value) {
    if (ignoreCase)
      for(int i=0;i<text.length;){
        i += Character.toChars(
            Character.toLowerCase(
                charUtils.codePointAt(text, i)), text, i);
      }
    if (ignoreCase) {
      charUtils.toLowerCase(text, 0, text.length);
    }
    int slot = getSlot(text, 0, text.length);
    if (keys[slot] != null) {
      final V oldValue = values[slot];
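
Behavior is unchanged by this refactoring; with ignoreCase=true the key is still lowercased code point by code point before hashing, only now via the shared CharacterUtils helper. A quick sketch of the invariant this preserves (Lucene 4.x API, values illustrative):

    CharArrayMap<Integer> map = new CharArrayMap<Integer>(Version.LUCENE_42, 8, true);
    map.put("LuceneFox", 1);
    // Mixed-case lookups hash to the same slot:
    // map.get("lucenefox") and map.get("LUCENEFOX") both return 1.
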
@ -131,7 +131,25 @@ public abstract class CharacterUtils {
    }
    return new CharacterBuffer(new char[bufferSize], 0, 0);
  }

  /**
   * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
   * at the given offset.
   * @param buffer the char buffer to lowercase
   * @param offset the offset to start at
   * @param limit the index (exclusive) up to which the buffer is lowercased
   */
  public void toLowerCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    assert offset >= 0 && offset <= buffer.length;
    for (int i = offset; i < limit;) {
      i += Character.toChars(
          Character.toLowerCase(
              codePointAt(buffer, i)), buffer, i);
    }
  }

  /**
   * Fills the {@link CharacterBuffer} with characters read from the given
   * reader {@link Reader}. This method tries to read as many characters into
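
A usage sketch for the new helper (assuming CharacterUtils.getInstance as in Lucene 4.x; the sample string is illustrative). Because the loop advances by code point, supplementary characters outside the BMP are lowercased correctly:

    CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_42);
    char[] buffer = "ÄpFeL".toCharArray();
    charUtils.toLowerCase(buffer, 0, buffer.length);
    // buffer now holds "äpfel"
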
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

/**
@ -124,7 +124,7 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
    set.add("ساهدهات");
    MockTokenizer tokenStream = new MockTokenizer(new StringReader("ساهدهات"), MockTokenizer.WHITESPACE, false);

    ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerFilter(tokenStream, set));
    ArabicStemFilter filter = new ArabicStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
    assertTokenStreamContents(filter, new String[]{"ساهدهات"});
  }

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

@ -222,7 +222,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
    MockTokenizer tokenStream = new MockTokenizer(new StringReader("строевете строеве"), MockTokenizer.WHITESPACE, false);

    BulgarianStemFilter filter = new BulgarianStemFilter(
        new KeywordMarkerFilter(tokenStream, set));
        new SetKeywordMarkerFilter(tokenStream, set));
    assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
  }

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

/**
@ -147,7 +147,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("Brasília");
    BrazilianStemFilter filter = new BrazilianStemFilter(
        new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
        new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
            "Brasília Brasilia")), set));
    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
  }
@ -39,6 +39,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@ -88,7 +90,9 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
  static {
    Collections.<Class<?>>addAll(oddlyNamedComponents,
      ReversePathHierarchyTokenizer.class, // this is supported via an option to PathHierarchyTokenizer's factory
      SnowballFilter.class // this is called SnowballPorterFilterFactory
      SnowballFilter.class, // this is called SnowballPorterFilterFactory
      PatternKeywordMarkerFilter.class,
      SetKeywordMarkerFilter.class
    );
  }

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

/**
@ -281,7 +281,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
  public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("hole");
    CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerFilter(
    CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
        new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
    assertTokenStreamContents(filter, new String[] { "hole", "desk" });
  }
@ -23,7 +23,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
@ -38,7 +38,7 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("fischen");
    GermanStemFilter filter = new GermanStemFilter(
        new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
        new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
            "Fischen Trinken")), set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
  }
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
      }
    };
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -62,7 +62,7 @@ public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
      }
    };
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -68,7 +68,7 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GermanStemFilter(sink));
      }
    };
@ -23,7 +23,7 @@ import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
@ -57,7 +57,7 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("yourselves");
    Tokenizer tokenizer = new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
    TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));
    TokenStream filter = new PorterStemFilter(new SetKeywordMarkerFilter(tokenizer, set));
    assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
  }

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
      }
    };
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -185,7 +185,7 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
      }
    };
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -64,7 +64,7 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
      }
    };
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

/**
@ -59,7 +59,7 @@ public class TestGalicianMinimalStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
      }
    };
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -55,7 +55,7 @@ public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
      }
    };
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@ -63,7 +63,7 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
    // assert with keyword marker
    tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
    filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY);
    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY);
    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
  }

@ -3,6 +3,7 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
@ -36,34 +37,68 @@ import org.junit.Test;
public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase {

  @Test
  public void testIncrementToken() throws IOException {
  public void testSetFilterIncrementToken() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 5, true);
    set.add("lucenefox");
    String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
        "jumps" };
    assertTokenStreamContents(new LowerCaseFilterMock(
        new KeywordMarkerFilter(new MockTokenizer(new StringReader(
        new SetKeywordMarkerFilter(new MockTokenizer(new StringReader(
            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set)), output);
    CharArraySet mixedCaseSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("LuceneFox"), false);
    assertTokenStreamContents(new LowerCaseFilterMock(
        new KeywordMarkerFilter(new MockTokenizer(new StringReader(
        new SetKeywordMarkerFilter(new MockTokenizer(new StringReader(
            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), mixedCaseSet)), output);
    CharArraySet set2 = set;
    assertTokenStreamContents(new LowerCaseFilterMock(
        new KeywordMarkerFilter(new MockTokenizer(new StringReader(
        new SetKeywordMarkerFilter(new MockTokenizer(new StringReader(
            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), set2)), output);
  }

  @Test
  public void testPatternFilterIncrementToken() throws IOException {
    String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
        "jumps" };
    assertTokenStreamContents(new LowerCaseFilterMock(
        new PatternKeywordMarkerFilter(new MockTokenizer(new StringReader(
            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), Pattern.compile("[a-zA-Z]+[fF]ox"))), output);

    output = new String[] { "the", "quick", "brown", "lucenefox",
        "jumps" };

    assertTokenStreamContents(new LowerCaseFilterMock(
        new PatternKeywordMarkerFilter(new MockTokenizer(new StringReader(
            "The quIck browN LuceneFox Jumps"), MockTokenizer.WHITESPACE, false), Pattern.compile("[a-zA-Z]+[f]ox"))), output);
  }

  // LUCENE-2901
  public void testComposition() throws Exception {
    TokenStream ts = new LowerCaseFilterMock(
        new KeywordMarkerFilter(
            new KeywordMarkerFilter(
        new SetKeywordMarkerFilter(
            new SetKeywordMarkerFilter(
                new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
                new CharArraySet(TEST_VERSION_CURRENT, asSet("Birds", "Houses"), false)),
            new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false)));

    assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });

    ts = new LowerCaseFilterMock(
        new PatternKeywordMarkerFilter(
            new PatternKeywordMarkerFilter(
                new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
                Pattern.compile("Birds|Houses")),
            Pattern.compile("Dogs|Trees")));

    assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });

    ts = new LowerCaseFilterMock(
        new SetKeywordMarkerFilter(
            new PatternKeywordMarkerFilter(
                new MockTokenizer(new StringReader("Dogs Trees Birds Houses"), MockTokenizer.WHITESPACE, false),
                Pattern.compile("Birds|Houses")),
            new CharArraySet(TEST_VERSION_CURRENT, asSet("Dogs", "Trees"), false)));

    assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" });
  }

  public static final class LowerCaseFilterMock extends TokenFilter {
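
The composition test above relies on a property of the refactored base class: incrementToken() only ever sets the keyword flag and never clears it, so stacked marker filters accumulate protections (the LUCENE-2901 scenario). Sketched outside the test harness (`tokenizer` assumed to be set up elsewhere):

    TokenStream ts = new SetKeywordMarkerFilter(
        new PatternKeywordMarkerFilter(tokenizer, Pattern.compile("Birds|Houses")),
        new CharArraySet(Version.LUCENE_42, Arrays.asList("Dogs", "Trees"), false));
    // Tokens matching the pattern OR contained in the set keep their KeywordAttribute set.
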
@ -35,6 +35,7 @@ import org.apache.lucene.analysis.util.StringMockResourceLoader;
 * Simple tests to ensure the keyword marker filter factory is working.
 */
public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase {

  public void testKeywords() throws IOException {
    Reader reader = new StringReader("dogs cats");
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
@ -48,6 +49,36 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase {

    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
    assertTokenStreamContents(ts, new String[] { "dog", "cats" });

    reader = new StringReader("dogs cats");
    tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    factory = new KeywordMarkerFilterFactory();
    args = new HashMap<String,String>();

    args.put("pattern", "cats|Dogs");
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    factory.init(args);
    factory.inform(null);

    ts = new PorterStemFilter(factory.create(tokenizer));
    assertTokenStreamContents(ts, new String[] { "dog", "cats" });
  }

  public void testKeywordsMixed() throws IOException {
    Reader reader = new StringReader("dogs cats birds");
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    ResourceLoader loader = new StringMockResourceLoader("cats");
    args.put("protected", "protwords.txt");
    args.put("pattern", "birds|Dogs");
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    factory.init(args);
    factory.inform(loader);

    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
    assertTokenStreamContents(ts, new String[] { "dog", "cats", "birds" });
  }

  public void testKeywordsCaseInsensitive() throws IOException {
@ -64,5 +95,36 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase {

    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
    assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });

    reader = new StringReader("dogs cats Cats");
    tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    factory = new KeywordMarkerFilterFactory();
    args = new HashMap<String,String>();

    args.put("pattern", "Cats");
    args.put("ignoreCase", "true");
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    factory.init(args);
    factory.inform(null);

    ts = new PorterStemFilter(factory.create(tokenizer));
    assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" });
  }

  public void testKeywordsCaseInsensitiveMixed() throws IOException {
    Reader reader = new StringReader("dogs cats Cats Birds birds");
    Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    ResourceLoader loader = new StringMockResourceLoader("cats");
    args.put("protected", "protwords.txt");
    args.put("pattern", "birds");
    args.put("ignoreCase", "true");
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    factory.init(args);
    factory.inform(loader);

    TokenStream ts = new PorterStemFilter(factory.create(tokenizer));
    assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats", "Birds", "birds" });
  }
}
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -57,7 +57,7 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
      }
    };
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

import static org.apache.lucene.analysis.VocabularyAssert.*;
@ -57,7 +57,7 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
      }
    };
@@ -28,7 +28,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
@@ -103,7 +103,7 @@ public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
       return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
     }
   };
@@ -28,7 +28,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
@@ -77,7 +77,7 @@ public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
       return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
     }
   };
@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
@@ -76,7 +76,7 @@ public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
       return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
     }
   };
@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -55,7 +55,7 @@ public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
       return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
     }
   };
@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -55,7 +55,7 @@ public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
       return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
     }
   };
@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 
@@ -32,7 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
  * This acts as a lemmatizer for verbs and adjectives.
  * <p>
  * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
  * the {@link KeywordAttribute} before this {@link TokenStream}.
  * </p>
  */
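A hedged sketch of the chain this javadoc describes, wired the way the Japanese tests further down do it; the keyword set contents, the matchVersion variable, and the Arrays import are illustrative assumptions.

// Protect selected surface forms from lemmatization.
CharArraySet keywords = new CharArraySet(matchVersion,
    Arrays.asList("飲み"), false);
Tokenizer source = new JapaneseTokenizer(reader, null, true,
    JapaneseTokenizer.DEFAULT_MODE);
TokenStream stream = new SetKeywordMarkerFilter(source, keywords);
// Tokens carrying KeywordAttribute keep their surface form; all other
// verbs and adjectives are replaced by their base form.
stream = new JapaneseBaseFormFilter(stream);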
@@ -35,7 +35,7 @@ import java.io.IOException;
  * </p>
  * <p>
  * In order to prevent terms from being stemmed, use an instance of
- * {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter}
+ * {@link org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter}
  * or a custom {@link TokenFilter} that sets the {@link KeywordAttribute}
  * before this {@link TokenStream}.
  * </p>
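The "custom TokenFilter" route these javadocs mention needs nothing more than setting KeywordAttribute before the stemmer. A hedged sketch; the class name and the length-based rule are invented for illustration.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

public final class LongTokenKeywordFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  public LongTokenKeywordFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // Illustrative rule: treat unusually long tokens as keywords so a
    // downstream stemmer leaves them untouched.
    if (termAtt.length() > 10) {
      keywordAtt.setKeyword(true);
    }
    return true;
  }
}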
@@ -25,7 +25,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
@@ -49,7 +49,7 @@ public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
-      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
       return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
     }
   };
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 import java.io.IOException;
@@ -70,7 +70,7 @@ public class TestJapaneseKatakanaStemFilter extends BaseTokenStreamTestCase {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-      TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
       return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
     }
   };
@@ -23,7 +23,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
@@ -112,7 +112,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
-   * provided this analyzer will add a {@link KeywordMarkerFilter} before
+   * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
    * stemming.
    *
    * @param matchVersion lucene compatibility version
@@ -135,7 +135,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
    *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
    *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+   *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
    *         provided and {@link StempelFilter}.
    */
   @Override
@@ -146,7 +146,7 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
-      result = new KeywordMarkerFilter(result, stemExclusionSet);
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new StempelFilter(result, new StempelStemmer(stemTable));
     return new TokenStreamComponents(source, result);
   }
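A hedged usage sketch of the exclusion path above. The three-argument constructor follows the matchVersion/stopwords/stemExclusionSet parameters in the javadoc, getDefaultStopSet() is assumed from the StopwordAnalyzerBase convention, and the Polish sample word is illustrative.

// Terms in the exclusion set reach StempelFilter with KeywordAttribute set
// and are therefore indexed unstemmed.
CharArraySet stemExclusions = new CharArraySet(matchVersion,
    Arrays.asList("miasta"), false);
Analyzer analyzer = new PolishAnalyzer(matchVersion,
    PolishAnalyzer.getDefaultStopSet(), stemExclusions);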