From c6ac3031c1ba59b1da3e9105ec4a6f208ac665a5 Mon Sep 17 00:00:00 2001 From: Erick Erickson Date: Fri, 8 Mar 2013 13:41:51 +0000 Subject: [PATCH] docs for KeywordRepeatFilter git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1454384 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/en/KStemFilter.java | 12 ++++++++++++ .../apache/lucene/analysis/en/PorterStemFilter.java | 3 +++ .../lucene/analysis/hunspell/HunspellStemFilter.java | 12 ++++++++++++ .../miscellaneous/KeywordRepeatFilterFactory.java | 11 ++++------- .../lucene/analysis/snowball/SnowballFilter.java | 12 ++++++++++++ 5 files changed, 43 insertions(+), 7 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilter.java index 257a562f3ce..e0e705fce3b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilter.java @@ -32,6 +32,18 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; * Conference on Research and Development in Information Retrieval, 191-203, 1993). *

* All terms must already be lowercased for this filter to work correctly. + * + *

+ * Note: This filter is aware of the {@link KeywordAttribute}. To prevent + * certain terms from being passed to the stemmer, + * {@link KeywordAttribute#isKeyword()} should be set to true + * in a previous {@link TokenStream}. + * + * Note: For including the original term as well as the stemmed version, see + * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}. + *

+ * + * */ public final class KStemFilter extends TokenFilter { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilter.java index 5b8f0b8d954..6b011976a03 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilter.java @@ -48,6 +48,9 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; certain terms from being passed to the stemmer {@link KeywordAttribute#isKeyword()} should be set to true in a previous {@link TokenStream}. + + Note: For including the original term as well as the stemmed version, see + {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}

*/ public final class PorterStemFilter extends TokenFilter { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java index 9e5e95dd029..b08da4199bb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java @@ -30,6 +30,18 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple * stems, this filter can emit multiple tokens for each consumed token + * + *

+ * Note: This filter is aware of the {@link KeywordAttribute}. To prevent + * certain terms from being passed to the stemmer, + * {@link KeywordAttribute#isKeyword()} should be set to true + * in a previous {@link TokenStream}. + * + * Note: For including the original term as well as the stemmed version, see + * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}. + *

+ * + * */ public final class HunspellStemFilter extends TokenFilter { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java index fa5b566a4b7..8dccd15af4f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java @@ -22,13 +22,10 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; /** * Factory for {@link KeywordRepeatFilter}. - *
- * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.KeywordRepeatFilter"/>
- *   </analyzer>
- * </fieldType>
+ * + * Since {@link KeywordRepeatFilter} emits two tokens for every input token, any tokens that aren't transformed + * later in the analysis chain will be in the document twice. Therefore, consider adding + * {@link RemoveDuplicatesTokenFilterFactory} later in the analysis chain. */ public final class KeywordRepeatFilterFactory extends TokenFilterFactory { @Override diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java index 0b716e86f4b..9218c44acfc 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java @@ -37,6 +37,18 @@ import org.tartarus.snowball.SnowballProgram; *
  • For other languages, see {@link LowerCaseFilter}. * *

    + * + *

    + * Note: This filter is aware of the {@link KeywordAttribute}. To prevent + * certain terms from being passed to the stemmer, + * {@link KeywordAttribute#isKeyword()} should be set to true + * in a previous {@link TokenStream}. + * + * Note: For including the original term as well as the stemmed version, see + * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}. + *

    + * + * */ public final class SnowballFilter extends TokenFilter {