LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1499312 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2013-07-03 10:30:43 +00:00
parent d34a8fdaba
commit 4496a2172f
3 changed files with 51 additions and 2 deletions

View File

@@ -39,6 +39,9 @@ Optimizations
Changes in backwards compatibility policy
* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
  (Dawid Weiss, Grzegorz Sobczyk)
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
  same position and preserves the position length and the offsets of the
  original token. (Simon Willnauer, Adrien Grand)
@@ -217,6 +220,9 @@ Optimizations
New Features
* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
  (Dawid Weiss, Grzegorz Sobczyk)
* LUCENE-5064: Added PagedMutable (internal), a paged extension of
  PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)

View File

@@ -27,6 +27,7 @@ import morfologik.stemming.PolishStemmer.DICTIONARY;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.*;
@@ -44,6 +45,7 @@ public class MorfologikFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
  private final CharsRef scratch = new CharsRef(0);
  private final CharacterUtils charUtils;
@@ -140,7 +142,8 @@ public class MorfologikFilter extends TokenFilter {
      popNextLemma();
      return true;
    } else if (this.input.incrementToken()) {
      if (!keywordAttr.isKeyword() &&
          (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
        current = captureState();
        popNextLemma();
      } else {

View File

@@ -18,11 +18,22 @@ package org.apache.lucene.analysis.morfologik;
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.TreeSet;

import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
 * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -144,6 +155,35 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
    ts.close();
  }
/** */
/**
 * Verifies that tokens flagged via {@code KeywordAttribute} are passed through
 * MorfologikFilter unstemmed (LUCENE-5085), while non-keyword tokens are still
 * expanded to all of their lemmas.
 *
 * @throws IOException if the analyzer fails to read the token stream
 */
public final void testKeywordAttrTokens() throws IOException {
  final Version version = TEST_VERSION_CURRENT;
  final DICTIONARY dictionary = DICTIONARY.COMBINED;

  Analyzer a = new MorfologikAnalyzer(version, dictionary) {
    @Override
    protected TokenStreamComponents createComponents(String field, Reader reader) {
      // Mark "liście" as a keyword so the downstream MorfologikFilter must
      // leave it untouched instead of stemming it.
      final CharArraySet keywords = new CharArraySet(version, 1, false);
      keywords.add("liście");

      // Use the captured 'version' and 'dictionary' consistently throughout
      // (they are the same values this analyzer was constructed with).
      final Tokenizer src = new StandardTokenizer(version, reader);
      TokenStream result = new StandardFilter(version, src);
      result = new SetKeywordMarkerFilter(result, keywords);
      result = new MorfologikFilter(result, dictionary, version);

      return new TokenStreamComponents(src, result);
    }
  };

  // "liście" (keyword) survives as-is; "danych" expands to all of its lemmas
  // stacked at the same position (position increments 1, 0, 0, 0) with the
  // offsets of the original surface token preserved.
  assertAnalyzesToReuse(
    a,
    "liście danych",
    new String[] { "liście", "dany", "dana", "dane", "dać" },
    new int[] { 0, 7, 7, 7, 7 },
    new int[] { 6, 13, 13, 13, 13 },
    new int[] { 1, 1, 0, 0, 0 });
}
  /** blast some random strings through the analyzer */
  public void testRandom() throws Exception {
    checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);