mirror of https://github.com/apache/lucene.git
LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1499312 13f79535-47bb-0310-9956-ffa450edef68
parent d34a8fdaba
commit 4496a2172f
@@ -39,6 +39,9 @@ Optimizations
 Changes in backwards compatibility policy
 
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+  (Dawid Weiss, Grzegorz Sobczyk)
+
 * LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
   same position and preserves the position length and the offsets of the
   original token. (Simon Willnauer, Adrien Grand)
 
@@ -217,6 +220,9 @@ Optimizations
 New Features
 
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+  (Dawid Weiss, Grzegorz Sobczyk)
+
 * LUCENE-5064: Added PagedMutable (internal), a paged extension of
   PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
 
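Note (not part of the commit): both changelog entries describe one consumer-visible rule — a token whose KeywordAttribute is set passes through MorfologikFilter with its surface form intact. A minimal sketch of the wiring, using the same classes and constructor signatures as the new test further down; the class name KeywordAwareChain and the choice of protected word are invented for illustration.

import java.io.Reader;

import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

final class KeywordAwareChain {
  static TokenStream build(Reader reader, Version version) {
    // Terms in this set keep their surface form; "liście" is the same
    // example word the new test uses.
    final CharArraySet keywords = new CharArraySet(version, 1, false);
    keywords.add("liście");

    final Tokenizer src = new StandardTokenizer(version, reader);
    // SetKeywordMarkerFilter sets KeywordAttribute for matching terms;
    // after this commit, MorfologikFilter leaves those tokens unstemmed.
    TokenStream marked = new SetKeywordMarkerFilter(src, keywords);
    return new MorfologikFilter(marked, DICTIONARY.COMBINED, version);
  }
}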
@@ -27,6 +27,7 @@ import morfologik.stemming.PolishStemmer.DICTIONARY;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.*;
@@ -44,6 +45,7 @@ public class MorfologikFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
   private final CharsRef scratch = new CharsRef(0);
   private final CharacterUtils charUtils;
@@ -140,7 +142,8 @@ public class MorfologikFilter extends TokenFilter {
       popNextLemma();
       return true;
     } else if (this.input.incrementToken()) {
-      if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+      if (!keywordAttr.isKeyword() &&
+          (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
         current = captureState();
         popNextLemma();
       } else {
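Note (not part of the commit): the hunk above is the core of the change — before consulting the stemming dictionary, incrementToken() now asks KeywordAttribute whether the current token was marked as a keyword. The same guard, reduced to a self-contained filter skeleton; KeywordRespectingFilter and its lowercasing rewrite are hypothetical, only the isKeyword() convention is Lucene's.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

final class KeywordRespectingFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  KeywordRespectingFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false; // end of stream
    }
    // Same short-circuit MorfologikFilter gains above: a keyword-marked
    // token is emitted verbatim; the rewrite is never attempted for it.
    if (!keywordAttr.isKeyword()) {
      // Stand-in for a real transformation (MorfologikFilter does a
      // dictionary lookup here).
      String lowered = termAtt.toString().toLowerCase(java.util.Locale.ROOT);
      termAtt.setEmpty().append(lowered);
    }
    return true;
  }
}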
@@ -18,11 +18,22 @@ package org.apache.lucene.analysis.morfologik;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.TreeSet;
 
-import org.apache.lucene.analysis.*;
 import morfologik.stemming.PolishStemmer.DICTIONARY;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
+
 /**
  * TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -144,6 +155,35 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
     ts.close();
   }
 
+  /** */
+  public final void testKeywordAttrTokens() throws IOException {
+    final Version version = TEST_VERSION_CURRENT;
+    final DICTIONARY dictionary = DICTIONARY.COMBINED;
+
+    Analyzer a = new MorfologikAnalyzer(version, dictionary) {
+      @Override
+      protected TokenStreamComponents createComponents(String field, Reader reader) {
+        final CharArraySet keywords = new CharArraySet(version, 1, false);
+        keywords.add("liście");
+
+        final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
+        result = new SetKeywordMarkerFilter(result, keywords);
+        result = new MorfologikFilter(result, dictionary, TEST_VERSION_CURRENT);
+
+        return new TokenStreamComponents(src, result);
+      }
+    };
+
+    assertAnalyzesToReuse(
+      a,
+      "liście danych",
+      new String[] { "liście", "dany", "dana", "dane", "dać" },
+      new int[] { 0, 7, 7, 7, 7 },
+      new int[] { 6, 13, 13, 13, 13 },
+      new int[] { 1, 1, 0, 0, 0 });
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandom() throws Exception {
     checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
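Note (not part of the commit): assertAnalyzesToReuse drives the analyzer and checks the emitted terms, start/end offsets, and position increments. Outside the test harness, the same output can be inspected with the standard 4.x consume loop — a sketch, with the field name "f" arbitrary and the helper class PrintTerms hypothetical.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class PrintTerms {
  static void print(Analyzer a, String text) throws IOException {
    TokenStream ts = a.tokenStream("f", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
      // With the keyword-marking analyzer from the test above and the
      // input "liście danych", this prints: liście, dany, dana, dane, dać.
      System.out.println(term);
    }
    ts.end();
    ts.close();
  }
}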