LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1499312 13f79535-47bb-0310-9956-ffa450edef68
Dawid Weiss 2013-07-03 10:30:43 +00:00
parent d34a8fdaba
commit 4496a2172f
3 changed files with 51 additions and 2 deletions
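In user terms: tokens carrying KeywordAttribute now pass through MorfologikFilter unstemmed. A minimal sketch of a chain that protects one surface form, mirroring the test added below (the class and method names are illustrative, not part of this commit; constructors follow the 4.x-era APIs shown in the diff):

import java.io.Reader;

import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class KeywordProtectedChain {
  // Builds a chain in which "liście" is never stemmed by MorfologikFilter.
  static TokenStream chain(Version v, Reader reader) {
    final CharArraySet keywords = new CharArraySet(v, 1, false);
    keywords.add("liście"); // protected surface form

    final Tokenizer src = new StandardTokenizer(v, reader);
    TokenStream ts = new StandardFilter(v, src);
    // Sets KeywordAttribute on tokens found in the set ...
    ts = new SetKeywordMarkerFilter(ts, keywords);
    // ... which MorfologikFilter now checks before looking up lemmas.
    return new MorfologikFilter(ts, DICTIONARY.COMBINED, v);
  }
}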

lucene/CHANGES.txt

@@ -39,6 +39,9 @@ Optimizations
 
 Changes in backwards compatibility policy
 
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+  (Dawid Weiss, Grzegorz Sobczyk)
+
 * LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
   same position and preserves the position length and the offsets of the
   original token. (Simon Willnauer, Adrien Grand)
@@ -217,6 +220,9 @@ Optimizations
 
 New Features
 
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+  (Dawid Weiss, Grzegorz Sobczyk)
+
 * LUCENE-5064: Added PagedMutable (internal), a paged extension of
   PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)

lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java

@@ -27,6 +27,7 @@ import morfologik.stemming.PolishStemmer.DICTIONARY;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.*;
@@ -44,6 +45,7 @@ public class MorfologikFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
   private final CharsRef scratch = new CharsRef(0);
   private final CharacterUtils charUtils;
@@ -140,7 +142,8 @@ public class MorfologikFilter extends TokenFilter {
       popNextLemma();
       return true;
     } else if (this.input.incrementToken()) {
-      if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+      if (!keywordAttr.isKeyword() &&
+          (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
         current = captureState();
         popNextLemma();
       } else {
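
The new condition follows the convention keyword-aware stemmers in Lucene use: consult KeywordAttribute before modifying the term. As a self-contained sketch of that pattern (a hypothetical filter, not from this commit; stem() is a placeholder, not a real API):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

// Hypothetical example: the general shape of a keyword-aware stemming filter.
public final class KeywordAwareStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  public KeywordAwareStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // Tokens marked by e.g. SetKeywordMarkerFilter pass through unchanged.
    if (!keywordAtt.isKeyword()) {
      final char[] stemmed = stem(termAtt.buffer(), termAtt.length());
      if (stemmed != null) {
        termAtt.copyBuffer(stemmed, 0, stemmed.length);
      }
    }
    return true;
  }

  // Stand-in for a real stemming routine (dictionary lookup, suffix stripping, ...).
  private char[] stem(char[] buffer, int length) {
    return null;
  }
}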

lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java

@@ -18,11 +18,22 @@ package org.apache.lucene.analysis.morfologik;
  */
 
 import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
 import java.util.TreeSet;
 
-import org.apache.lucene.analysis.*;
 import morfologik.stemming.PolishStemmer.DICTIONARY;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -144,6 +155,35 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
     ts.close();
   }
 
+  /** */
+  public final void testKeywordAttrTokens() throws IOException {
+    final Version version = TEST_VERSION_CURRENT;
+    final DICTIONARY dictionary = DICTIONARY.COMBINED;
+
+    Analyzer a = new MorfologikAnalyzer(version, dictionary) {
+      @Override
+      protected TokenStreamComponents createComponents(String field, Reader reader) {
+        final CharArraySet keywords = new CharArraySet(version, 1, false);
+        keywords.add("liście");
+
+        final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
+        result = new SetKeywordMarkerFilter(result, keywords);
+        result = new MorfologikFilter(result, dictionary, TEST_VERSION_CURRENT);
+
+        return new TokenStreamComponents(src, result);
+      }
+    };
+
+    assertAnalyzesToReuse(
+        a,
+        "liście danych",
+        new String[] { "liście", "dany", "dana", "dane", "dać" },
+        new int[] { 0, 7, 7, 7, 7 },
+        new int[] { 6, 13, 13, 13, 13 },
+        new int[] { 1, 1, 0, 0, 0 });
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandom() throws Exception {
     checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
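
A side effect worth noting: because the filter now honors KeywordAttribute, it can be combined with KeywordRepeatFilter to index the original surface form next to its lemmas. A hypothetical sketch, not part of this commit (assumes the 4.x-era miscellaneous filters are available):

import java.io.Reader;

import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public class OriginalPlusLemmas {
  // Emits each surface form alongside its lemmas at the same position.
  static TokenStream chain(Version v, Reader reader) {
    final Tokenizer src = new StandardTokenizer(v, reader);
    TokenStream ts = new StandardFilter(v, src);
    // KeywordRepeatFilter emits every token twice; the first copy is marked as a keyword.
    ts = new KeywordRepeatFilter(ts);
    // The keyword copy passes through unstemmed; the second copy is lemmatized.
    ts = new MorfologikFilter(ts, DICTIONARY.COMBINED, v);
    // Collapse the pair when the lemma equals the surface form.
    return new RemoveDuplicatesTokenFilter(ts);
  }
}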