LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1499312 13f79535-47bb-0310-9956-ffa450edef68
Dawid Weiss 2013-07-03 10:30:43 +00:00
parent d34a8fdaba
commit 4496a2172f
3 changed files with 51 additions and 2 deletions
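In user terms: tokens carrying KeywordAttribute now pass through MorfologikFilter unstemmed. A minimal sketch of a chain that protects one surface form, mirroring the test added below (the class and method names are illustrative, not part of this commit; constructors follow the 4.x-era APIs shown in the diff):

import java.io.Reader;

import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class KeywordProtectedChain {
  // Builds a chain in which "liście" is never stemmed by MorfologikFilter.
  static TokenStream chain(Version v, Reader reader) {
    final CharArraySet keywords = new CharArraySet(v, 1, false);
    keywords.add("liście"); // protected surface form

    final Tokenizer src = new StandardTokenizer(v, reader);
    TokenStream ts = new StandardFilter(v, src);
    // Sets KeywordAttribute on tokens found in the set ...
    ts = new SetKeywordMarkerFilter(ts, keywords);
    // ... which MorfologikFilter now checks before looking up lemmas.
    return new MorfologikFilter(ts, DICTIONARY.COMBINED, v);
  }
}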

lucene/CHANGES.txt

@@ -39,6 +39,9 @@ Optimizations
 
 Changes in backwards compatibility policy
 
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+  (Dawid Weiss, Grzegorz Sobczyk)
+
 * LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
   same position and preserves the position length and the offsets of the
   original token. (Simon Willnauer, Adrien Grand)
@@ -217,6 +220,9 @@ Optimizations
 
 New Features
 
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+  (Dawid Weiss, Grzegorz Sobczyk)
+
 * LUCENE-5064: Added PagedMutable (internal), a paged extension of
   PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)

lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java

@@ -27,6 +27,7 @@ import morfologik.stemming.PolishStemmer.DICTIONARY;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.*;
@@ -44,6 +45,7 @@ public class MorfologikFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
   private final CharsRef scratch = new CharsRef(0);
   private final CharacterUtils charUtils;
@@ -140,7 +142,8 @@ public class MorfologikFilter extends TokenFilter {
       popNextLemma();
       return true;
     } else if (this.input.incrementToken()) {
-      if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+      if (!keywordAttr.isKeyword() &&
+          (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
         current = captureState();
         popNextLemma();
       } else {
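
The new condition follows the convention keyword-aware stemmers in Lucene use: consult KeywordAttribute before modifying the term. As a self-contained sketch of that pattern (a hypothetical filter, not from this commit; stem() is a placeholder, not a real API):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

// Hypothetical example: the general shape of a keyword-aware stemming filter.
public final class KeywordAwareStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  public KeywordAwareStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // Tokens marked by e.g. SetKeywordMarkerFilter pass through unchanged.
    if (!keywordAtt.isKeyword()) {
      final char[] stemmed = stem(termAtt.buffer(), termAtt.length());
      if (stemmed != null) {
        termAtt.copyBuffer(stemmed, 0, stemmed.length);
      }
    }
    return true;
  }

  // Stand-in for a real stemming routine (dictionary lookup, suffix stripping, ...).
  private char[] stem(char[] buffer, int length) {
    return null;
  }
}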

lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java

@@ -18,11 +18,22 @@ package org.apache.lucene.analysis.morfologik;
  */
 
 import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
 import java.util.TreeSet;
 
-import org.apache.lucene.analysis.*;
 import morfologik.stemming.PolishStemmer.DICTIONARY;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -144,6 +155,35 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
     ts.close();
   }
 
+  /** */
+  public final void testKeywordAttrTokens() throws IOException {
+    final Version version = TEST_VERSION_CURRENT;
+    final DICTIONARY dictionary = DICTIONARY.COMBINED;
+
+    Analyzer a = new MorfologikAnalyzer(version, dictionary) {
+      @Override
+      protected TokenStreamComponents createComponents(String field, Reader reader) {
+        final CharArraySet keywords = new CharArraySet(version, 1, false);
+        keywords.add("liście");
+
+        final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
+        result = new SetKeywordMarkerFilter(result, keywords);
+        result = new MorfologikFilter(result, dictionary, TEST_VERSION_CURRENT);
+
+        return new TokenStreamComponents(src, result);
+      }
+    };
+
+    assertAnalyzesToReuse(
+        a,
+        "liście danych",
+        new String[] { "liście", "dany", "dana", "dane", "dać" },
+        new int[] { 0, 7, 7, 7, 7 },
+        new int[] { 6, 13, 13, 13, 13 },
+        new int[] { 1, 1, 0, 0, 0 });
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandom() throws Exception {
     checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
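
A side effect worth noting: because the filter now honors KeywordAttribute, it can be combined with KeywordRepeatFilter to index the original surface form next to its lemmas. A hypothetical sketch, not part of this commit (assumes the 4.x-era miscellaneous filters are available):

import java.io.Reader;

import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public class OriginalPlusLemmas {
  // Emits each surface form alongside its lemmas at the same position.
  static TokenStream chain(Version v, Reader reader) {
    final Tokenizer src = new StandardTokenizer(v, reader);
    TokenStream ts = new StandardFilter(v, src);
    // KeywordRepeatFilter emits every token twice; the first copy is marked as a keyword.
    ts = new KeywordRepeatFilter(ts);
    // The keyword copy passes through unstemmed; the second copy is lemmatized.
    ts = new MorfologikFilter(ts, DICTIONARY.COMBINED, v);
    // Collapse the pair when the lemma equals the surface form.
    return new RemoveDuplicatesTokenFilter(ts);
  }
}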