LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1499312 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2013-07-03 10:30:43 +00:00
parent d34a8fdaba
commit 4496a2172f
3 changed files with 51 additions and 2 deletions

View File

@@ -39,6 +39,9 @@ Optimizations
Changes in backwards compatibility policy
* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
  (Dawid Weiss, Grzegorz Sobczyk)
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
  same position and preserves the position length and the offsets of the
  original token. (Simon Willnauer, Adrien Grand)
@@ -217,6 +220,9 @@ Optimizations
New Features
* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
  (Dawid Weiss, Grzegorz Sobczyk)
* LUCENE-5064: Added PagedMutable (internal), a paged extension of
  PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)

View File

@@ -27,6 +27,7 @@ import morfologik.stemming.PolishStemmer.DICTIONARY;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.*;
@@ -44,6 +45,7 @@ public class MorfologikFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
  private final CharsRef scratch = new CharsRef(0);
  private final CharacterUtils charUtils;
@@ -140,7 +142,8 @@ public class MorfologikFilter extends TokenFilter {
      popNextLemma();
      return true;
    } else if (this.input.incrementToken()) {
      if (!keywordAttr.isKeyword() &&
          (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
        current = captureState();
        popNextLemma();
      } else {

View File

@@ -18,11 +18,22 @@ package org.apache.lucene.analysis.morfologik;
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.TreeSet;

import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
 * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -144,6 +155,35 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
    ts.close();
  }
/** */
/**
 * Verifies that tokens flagged via {@code KeywordAttribute} are passed through
 * MorfologikFilter unstemmed (LUCENE-5085), while non-keyword tokens are still
 * expanded to all of their lemmas.
 *
 * @throws IOException if the analyzer fails to read the token stream
 */
public final void testKeywordAttrTokens() throws IOException {
  final Version version = TEST_VERSION_CURRENT;
  final DICTIONARY dictionary = DICTIONARY.COMBINED;

  Analyzer a = new MorfologikAnalyzer(version, dictionary) {
    @Override
    protected TokenStreamComponents createComponents(String field, Reader reader) {
      // Mark "liście" as a keyword so the downstream MorfologikFilter must
      // leave it untouched instead of stemming it.
      final CharArraySet keywords = new CharArraySet(version, 1, false);
      keywords.add("liście");

      // Use the captured 'version' and 'dictionary' consistently throughout
      // (they are the same values this analyzer was constructed with).
      final Tokenizer src = new StandardTokenizer(version, reader);
      TokenStream result = new StandardFilter(version, src);
      result = new SetKeywordMarkerFilter(result, keywords);
      result = new MorfologikFilter(result, dictionary, version);

      return new TokenStreamComponents(src, result);
    }
  };

  // "liście" (keyword) survives as-is; "danych" expands to all of its lemmas
  // stacked at the same position (position increments 1, 0, 0, 0) with the
  // offsets of the original surface token preserved.
  assertAnalyzesToReuse(
    a,
    "liście danych",
    new String[] { "liście", "dany", "dana", "dane", "dać" },
    new int[] { 0, 7, 7, 7, 7 },
    new int[] { 6, 13, 13, 13, 13 },
    new int[] { 1, 1, 0, 0, 0 });
}
  /** blast some random strings through the analyzer */
  public void testRandom() throws Exception {
    checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);