mirror of https://github.com/apache/lucene.git
LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1499312 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d34a8fdaba
commit
4496a2172f
|
@ -39,6 +39,9 @@ Optimizations
|
||||||
|
|
||||||
Changes in backwards compatibility policy
|
Changes in backwards compatibility policy
|
||||||
|
|
||||||
|
* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
|
||||||
|
(Dawid Weiss, Grzegorz Sobczyk)
|
||||||
|
|
||||||
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
|
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
|
||||||
same position and preserves the position length and the offsets of the
|
same position and preserves the position length and the offsets of the
|
||||||
original token. (Simon Willnauer, Adrien Grand)
|
original token. (Simon Willnauer, Adrien Grand)
|
||||||
|
@ -217,6 +220,9 @@ Optimizations
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
|
* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
|
||||||
|
(Dawid Weiss, Grzegorz Sobczyk)
|
||||||
|
|
||||||
* LUCENE-5064: Added PagedMutable (internal), a paged extension of
|
* LUCENE-5064: Added PagedMutable (internal), a paged extension of
|
||||||
PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
|
PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||||
import org.apache.lucene.util.*;
|
import org.apache.lucene.util.*;
|
||||||
|
@ -44,6 +45,7 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
|
private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
private final CharsRef scratch = new CharsRef(0);
|
private final CharsRef scratch = new CharsRef(0);
|
||||||
private final CharacterUtils charUtils;
|
private final CharacterUtils charUtils;
|
||||||
|
@ -140,7 +142,8 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
popNextLemma();
|
popNextLemma();
|
||||||
return true;
|
return true;
|
||||||
} else if (this.input.incrementToken()) {
|
} else if (this.input.incrementToken()) {
|
||||||
if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
|
if (!keywordAttr.isKeyword() &&
|
||||||
|
(lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
|
||||||
current = captureState();
|
current = captureState();
|
||||||
popNextLemma();
|
popNextLemma();
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -18,11 +18,22 @@ package org.apache.lucene.analysis.morfologik;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.*;
|
import morfologik.stemming.PolishStemmer.DICTIONARY;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
|
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
|
||||||
|
@ -144,6 +155,35 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
||||||
ts.close();
|
ts.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
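/** Tokens marked as keywords must pass through MorfologikFilter unstemmed; other tokens are still expanded to their lemmas. */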
|
||||||
|
public final void testKeywordAttrTokens() throws IOException {
|
||||||
|
final Version version = TEST_VERSION_CURRENT;
|
||||||
|
final DICTIONARY dictionary = DICTIONARY.COMBINED;
|
||||||
|
|
||||||
|
Analyzer a = new MorfologikAnalyzer(version, dictionary) {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String field, Reader reader) {
|
||||||
|
final CharArraySet keywords = new CharArraySet(version, 1, false);
|
||||||
|
keywords.add("liście");
|
||||||
|
|
||||||
|
final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
|
||||||
|
result = new SetKeywordMarkerFilter(result, keywords);
|
||||||
|
result = new MorfologikFilter(result, dictionary, TEST_VERSION_CURRENT);
|
||||||
|
|
||||||
|
return new TokenStreamComponents(src, result);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertAnalyzesToReuse(
|
||||||
|
a,
|
||||||
|
"liście danych",
|
||||||
|
new String[] { "liście", "dany", "dana", "dane", "dać" },
|
||||||
|
new int[] { 0, 7, 7, 7, 7 },
|
||||||
|
new int[] { 6, 13, 13, 13, 13 },
|
||||||
|
new int[] { 1, 1, 0, 0, 0 });
|
||||||
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
public void testRandom() throws Exception {
|
public void testRandom() throws Exception {
|
||||||
checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
|
checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
|
||||||
|
|
Loading…
Reference in New Issue