mirror of https://github.com/apache/lucene.git
SOLR-4565: Extend Norwegian stemmers to handle nynorsk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1497396 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f9a9649f71
commit
3003cfe668
|
@ -257,6 +257,9 @@ New Features
|
|||
* LUCENE-5079: IndexWriter.hasUncommittedChanges() returns true if there are
|
||||
changes that have not been committed. (yonik, Mike McCandless, Uwe Schindler)
|
||||
|
||||
* SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter
|
||||
to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes
|
||||
|
|
|
@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
|||
* </p>
|
||||
*/
|
||||
public final class NorwegianLightStemFilter extends TokenFilter {
|
||||
private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
|
||||
private final NorwegianLightStemmer stemmer;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
/**
|
||||
* Calls {@link #NorwegianLightStemFilter(TokenStream, int)
|
||||
* NorwegianLightStemFilter(input, BOKMAAL)}
|
||||
*/
|
||||
public NorwegianLightStemFilter(TokenStream input) {
|
||||
this(input, NorwegianLightStemmer.BOKMAAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new NorwegianLightStemFilter
|
||||
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
|
||||
* {@link NorwegianLightStemmer#NYNORSK}, or both.
|
||||
*/
|
||||
public NorwegianLightStemFilter(TokenStream input, int flags) {
|
||||
super(input);
|
||||
stemmer = new NorwegianLightStemmer(flags);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||
|
||||
/**
|
||||
* Factory for {@link NorwegianLightStemFilter}.
|
||||
* <pre class="prettyprint">
|
||||
|
@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.NorwegianLightStemFilterFactory"/>
|
||||
* <filter class="solr.NorwegianLightStemFilterFactory" variant="nb"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
|
||||
|
||||
private final int flags;
|
||||
|
||||
/** Creates a new NorwegianLightStemFilterFactory */
|
||||
public NorwegianLightStemFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
String variant = get(args, "variant");
|
||||
if (variant == null || "nb".equals(variant)) {
|
||||
flags = BOKMAAL;
|
||||
} else if ("nn".equals(variant)) {
|
||||
flags = NYNORSK;
|
||||
} else if ("no".equals(variant)) {
|
||||
flags = BOKMAAL | NYNORSK;
|
||||
} else {
|
||||
throw new IllegalArgumentException("invalid variant: " + variant);
|
||||
}
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -46,6 +61,6 @@ public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new NorwegianLightStemFilter(input);
|
||||
return new NorwegianLightStemFilter(input, flags);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -62,6 +62,25 @@ import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
|||
* corpus to validate against whereas the Norwegian one is hand crafted.
|
||||
*/
|
||||
public class NorwegianLightStemmer {
|
||||
/** Constant to remove Bokmål-specific endings */
|
||||
public static final int BOKMAAL = 1;
|
||||
/** Constant to remove Nynorsk-specific endings */
|
||||
public static final int NYNORSK = 2;
|
||||
|
||||
final boolean useBokmaal;
|
||||
final boolean useNynorsk;
|
||||
|
||||
/**
|
||||
* Creates a new NorwegianLightStemmer
|
||||
* @param flags set to {@link #BOKMAAL}, {@link #NYNORSK}, or both.
|
||||
*/
|
||||
public NorwegianLightStemmer(int flags) {
|
||||
if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
|
||||
throw new IllegalArgumentException("invalid flags");
|
||||
}
|
||||
useBokmaal = (flags & BOKMAAL) != 0;
|
||||
useNynorsk = (flags & NYNORSK) != 0;
|
||||
}
|
||||
|
||||
public int stem(char s[], int len) {
|
||||
// Remove possessive -s (bilens -> bilen) and continue checking
|
||||
|
@ -70,39 +89,76 @@ public class NorwegianLightStemmer {
|
|||
|
||||
// Remove common endings, single-pass
|
||||
if (len > 7 &&
|
||||
(endsWith(s, len, "heter") || // general ending (hemmelig-heter -> hemmelig)
|
||||
endsWith(s, len, "heten"))) // general ending (hemmelig-heten -> hemmelig)
|
||||
((endsWith(s, len, "heter") &&
|
||||
useBokmaal) || // general ending (hemmelig-heter -> hemmelig)
|
||||
(endsWith(s, len, "heten") &&
|
||||
useBokmaal) || // general ending (hemmelig-heten -> hemmelig)
|
||||
(endsWith(s, len, "heita") &&
|
||||
useNynorsk))) // general ending (hemmeleg-heita -> hemmeleg)
|
||||
return len - 5;
|
||||
|
||||
// Remove Nynorsk common endings, single-pass
|
||||
if (len > 8 && useNynorsk &&
|
||||
(endsWith(s, len, "heiter") || // general ending (hemmeleg-heiter -> hemmeleg)
|
||||
endsWith(s, len, "leiken") || // general ending (trygg-leiken -> trygg)
|
||||
endsWith(s, len, "leikar"))) // general ending (trygg-leikar -> trygg)
|
||||
return len - 6;
|
||||
|
||||
if (len > 5 &&
|
||||
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
|
||||
endsWith(s, len, "het"))) // general ending (hemmelig-het -> hemmelig)
|
||||
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
|
||||
(endsWith(s, len, "het") &&
|
||||
useBokmaal))) // general ending (hemmelig-het -> hemmelig)
|
||||
return len - 3;
|
||||
|
||||
if (len > 6 && useNynorsk &&
|
||||
(endsWith(s, len, "heit") || // general ending (hemmeleg-heit -> hemmeleg)
|
||||
endsWith(s, len, "semd") || // general ending (verk-semd -> verk)
|
||||
endsWith(s, len, "leik"))) // general ending (trygg-leik -> trygg)
|
||||
return len - 4;
|
||||
|
||||
if (len > 7 &&
|
||||
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
|
||||
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
|
||||
return len - 5;
|
||||
|
||||
if (len > 6 &&
|
||||
(endsWith(s, len, "ende") || // (sov-ende -> sov)
|
||||
((endsWith(s, len, "ende") &&
|
||||
useBokmaal) || // (sov-ende -> sov)
|
||||
(endsWith(s, len, "ande") &&
|
||||
useNynorsk) || // (sov-ande -> sov)
|
||||
endsWith(s, len, "else") || // general ending (føl-else -> føl)
|
||||
endsWith(s, len, "este") || // adj (fin-este -> fin)
|
||||
endsWith(s, len, "eren"))) // masc
|
||||
(endsWith(s, len, "este") &&
|
||||
useBokmaal) || // adj (fin-este -> fin)
|
||||
(endsWith(s, len, "aste") &&
|
||||
useNynorsk) || // adj (fin-aste -> fin)
|
||||
(endsWith(s, len, "eren") &&
|
||||
useBokmaal) || // masc
|
||||
(endsWith(s, len, "aren") &&
|
||||
useNynorsk))) // masc
|
||||
return len - 4;
|
||||
|
||||
if (len > 5 &&
|
||||
(endsWith(s, len, "ere") || // adj (fin-ere -> fin)
|
||||
endsWith(s, len, "est") || // adj (fin-est -> fin)
|
||||
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
|
||||
))
|
||||
((endsWith(s, len, "ere") &&
|
||||
useBokmaal) || // adj (fin-ere -> fin)
|
||||
(endsWith(s, len, "are") &&
|
||||
useNynorsk) || // adj (fin-are -> fin)
|
||||
(endsWith(s, len, "est") &&
|
||||
useBokmaal) || // adj (fin-est -> fin)
|
||||
(endsWith(s, len, "ast") &&
|
||||
useNynorsk) || // adj (fin-ast -> fin)
|
||||
endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
|
||||
(endsWith(s, len, "ane") &&
|
||||
useNynorsk))) // masc pl definite (gut-ane)
|
||||
return len - 3;
|
||||
|
||||
if (len > 4 &&
|
||||
(endsWith(s, len, "er") || // masc/fem indefinite
|
||||
endsWith(s, len, "en") || // masc/fem definite
|
||||
endsWith(s, len, "et") || // neutr definite
|
||||
endsWith(s, len, "st") || // adj (billig-st -> billig)
|
||||
(endsWith(s, len, "ar") &&
|
||||
useNynorsk) || // masc pl indefinite
|
||||
(endsWith(s, len, "st") &&
|
||||
useBokmaal) || // adj (billig-st -> billig)
|
||||
endsWith(s, len, "te")))
|
||||
return len - 2;
|
||||
|
||||
|
|
|
@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
|||
* </p>
|
||||
*/
|
||||
public final class NorwegianMinimalStemFilter extends TokenFilter {
|
||||
private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
|
||||
private final NorwegianMinimalStemmer stemmer;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
/**
|
||||
* Calls {@link #NorwegianMinimalStemFilter(TokenStream, int)
|
||||
* NorwegianMinimalStemFilter(input, BOKMAAL)}
|
||||
*/
|
||||
public NorwegianMinimalStemFilter(TokenStream input) {
|
||||
this(input, NorwegianLightStemmer.BOKMAAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new NorwegianMinimalStemFilter
|
||||
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
|
||||
* {@link NorwegianLightStemmer#NYNORSK}, or both.
|
||||
*/
|
||||
public NorwegianMinimalStemFilter(TokenStream input, int flags) {
|
||||
super(input);
|
||||
this.stemmer = new NorwegianMinimalStemmer(flags);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||
|
||||
/**
|
||||
* Factory for {@link NorwegianMinimalStemFilter}.
|
||||
* <pre class="prettyprint">
|
||||
|
@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.NorwegianMinimalStemFilterFactory"/>
|
||||
* <filter class="solr.NorwegianMinimalStemFilterFactory" variant="nb"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
|
||||
|
||||
private final int flags;
|
||||
|
||||
/** Creates a new NorwegianMinimalStemFilterFactory */
|
||||
public NorwegianMinimalStemFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
String variant = get(args, "variant");
|
||||
if (variant == null || "nb".equals(variant)) {
|
||||
flags = BOKMAAL;
|
||||
} else if ("nn".equals(variant)) {
|
||||
flags = NYNORSK;
|
||||
} else if ("no".equals(variant)) {
|
||||
flags = BOKMAAL | NYNORSK;
|
||||
} else {
|
||||
throw new IllegalArgumentException("invalid variant: " + variant);
|
||||
}
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -46,6 +61,6 @@ public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new NorwegianMinimalStemFilter(input);
|
||||
return new NorwegianMinimalStemFilter(input, flags);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,13 +53,30 @@ package org.apache.lucene.analysis.no;
|
|||
*/
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||
|
||||
/**
|
||||
* Minimal Stemmer for Norwegian bokmål (no-nb)
|
||||
* Minimal Stemmer for Norwegian Bokmål (no-nb) and Nynorsk (no-nn)
|
||||
* <p>
|
||||
* Stems known plural forms for Norwegian nouns only, together with genitive -s
|
||||
*/
|
||||
public class NorwegianMinimalStemmer {
|
||||
final boolean useBokmaal;
|
||||
final boolean useNynorsk;
|
||||
|
||||
/**
|
||||
* Creates a new NorwegianMinimalStemmer
|
||||
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
|
||||
* {@link NorwegianLightStemmer#NYNORSK}, or both.
|
||||
*/
|
||||
public NorwegianMinimalStemmer(int flags) {
|
||||
if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
|
||||
throw new IllegalArgumentException("invalid flags");
|
||||
}
|
||||
useBokmaal = (flags & BOKMAAL) != 0;
|
||||
useNynorsk = (flags & NYNORSK) != 0;
|
||||
}
|
||||
|
||||
public int stem(char s[], int len) {
|
||||
// Remove genitive -s
|
||||
|
@ -67,15 +84,19 @@ public class NorwegianMinimalStemmer {
|
|||
len--;
|
||||
|
||||
if (len > 5 &&
|
||||
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
|
||||
)
|
||||
(endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
|
||||
(endsWith(s, len, "ane") &&
|
||||
useNynorsk // masc pl definite (gut-ane)
|
||||
)))
|
||||
return len - 3;
|
||||
|
||||
if (len > 4 &&
|
||||
(endsWith(s, len, "er") || // masc/fem indefinite
|
||||
endsWith(s, len, "en") || // masc/fem definite
|
||||
endsWith(s, len, "et") // neutr definite
|
||||
))
|
||||
(endsWith(s, len, "er") || // masc/fem indefinite
|
||||
endsWith(s, len, "en") || // masc/fem definite
|
||||
endsWith(s, len, "et") || // neutr definite
|
||||
(endsWith(s, len, "ar") &&
|
||||
useNynorsk // masc pl indefinite
|
||||
)))
|
||||
return len - 2;
|
||||
|
||||
if (len > 3)
|
||||
|
|
|
@ -32,6 +32,9 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
|||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||
|
||||
|
||||
/**
|
||||
* Simple tests for {@link NorwegianLightStemFilter}
|
||||
|
@ -42,7 +45,7 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
|
|||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source));
|
||||
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source, BOKMAAL));
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -51,6 +54,18 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
|
|||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
|
||||
}
|
||||
|
||||
/** Test against a Nynorsk vocabulary file */
|
||||
public void testNynorskVocabulary() throws IOException {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source, NYNORSK));
|
||||
}
|
||||
};
|
||||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nn_light.txt")));
|
||||
}
|
||||
|
||||
public void testKeyword() throws IOException {
|
||||
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
|
||||
Analyzer a = new Analyzer() {
|
||||
|
|
|
@ -35,6 +35,22 @@ public class TestNorwegianLightStemFilterFactory extends BaseTokenStreamFactoryT
|
|||
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
|
||||
}
|
||||
|
||||
/** Test stemming with variant set explicitly to Bokmål */
|
||||
public void testBokmaalStemming() throws Exception {
|
||||
Reader reader = new StringReader("epler eple");
|
||||
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
stream = tokenFilterFactory("NorwegianLightStem", "variant", "nb").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
|
||||
}
|
||||
|
||||
/** Test stemming with variant set explicitly to Nynorsk */
|
||||
public void testNynorskStemming() throws Exception {
|
||||
Reader reader = new StringReader("gutar gutane");
|
||||
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
stream = tokenFilterFactory("NorwegianLightStem", "variant", "nn").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "gut", "gut" });
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
|
|
|
@ -32,6 +32,8 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
|||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||
|
||||
/**
|
||||
* Simple tests for {@link NorwegianMinimalStemFilter}
|
||||
|
@ -42,15 +44,27 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
|
|||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
|
||||
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source, BOKMAAL));
|
||||
}
|
||||
};
|
||||
|
||||
/** Test against a vocabulary file */
|
||||
/** Test against a Bokmål vocabulary file */
|
||||
public void testVocabulary() throws IOException {
|
||||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
|
||||
}
|
||||
|
||||
/** Test against a Nynorsk vocabulary file */
|
||||
public void testNynorskVocabulary() throws IOException {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source, NYNORSK));
|
||||
}
|
||||
};
|
||||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nn_minimal.txt")));
|
||||
}
|
||||
|
||||
public void testKeyword() throws IOException {
|
||||
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
|
||||
Analyzer a = new Analyzer() {
|
||||
|
|
|
@ -35,6 +35,22 @@ public class TestNorwegianMinimalStemFilterFactory extends BaseTokenStreamFactor
|
|||
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
|
||||
}
|
||||
|
||||
/** Test stemming with variant set explicitly to Bokmål */
|
||||
public void testBokmaalStemming() throws Exception {
|
||||
Reader reader = new StringReader("eple eplet epler eplene eplets eplenes");
|
||||
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
stream = tokenFilterFactory("NorwegianMinimalStem", "variant", "nb").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
|
||||
}
|
||||
|
||||
/** Test stemming with variant set explicitly to Nynorsk */
|
||||
public void testNynorskStemming() throws Exception {
|
||||
Reader reader = new StringReader("gut guten gutar gutane gutens gutanes");
|
||||
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
stream = tokenFilterFactory("NorwegianMinimalStem", "variant", "nn").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "gut", "gut", "gut", "gut", "gut", "gut" });
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
#
|
||||
# Tests for Norwegian Nynorsk light stemmer
|
||||
# It should tackle nouns, adjectives, genitive and some general endings
|
||||
#
|
||||
# Nouns masculine
|
||||
gut gut
|
||||
guten gut
|
||||
gutar gut
|
||||
gutane gut
|
||||
gutens gut
|
||||
gutanes gut
|
||||
søknad søknad
|
||||
søknaden søknad
|
||||
søknadar søknad
|
||||
søknadane søknad
|
||||
søknadens søknad
|
||||
søknadanes søknad
|
||||
# Nouns feminine
|
||||
kjole kjol
|
||||
kjola kjol
|
||||
kjoler kjol
|
||||
kjolene kjol
|
||||
kjolas kjol
|
||||
# Nouns neutral
|
||||
dyr dyr
|
||||
dyret dyr
|
||||
dyra dyr
|
||||
dyras dyr
|
||||
prospekt prospekt
|
||||
prospektet prospekt
|
||||
prospekta prospekt
|
||||
prospektas prospekt
|
||||
innhald innhald
|
||||
innhaldet innhald
|
||||
innhalda innhald
|
||||
# General endings
|
||||
hemmeleg hemmeleg
|
||||
hemmelegheit hemmeleg
|
||||
hemmelegheita hemmeleg
|
||||
hemmelegheiter hemmeleg
|
||||
vanskeleg vanskeleg
|
||||
vanskelegheit vanskeleg
|
||||
vanskelegheita vanskeleg
|
||||
vanskelegheiter vanskeleg
|
||||
hevelse hev
|
||||
heva hev
|
||||
hevelsen hev
|
||||
heve hev
|
||||
ærleg ærleg
|
||||
ærlegdom ærleg
|
||||
ærlegdommen ærlegdomm
|
||||
ærlegdommens ærlegdomm
|
||||
alderdom alder
|
||||
alderdommen alderdomm
|
||||
alderdommens alderdomm
|
||||
trygg trygg
|
||||
tryggleik trygg
|
||||
tryggleiken trygg
|
||||
tryggleikens trygg
|
||||
tryggleikar trygg
|
||||
kjærleik kjær
|
||||
kjærleiken kjær
|
||||
kjærleikens kjær
|
||||
kjærleikar kjær
|
||||
verke verk
|
||||
verksemd verk
|
||||
hjelpe hjelp
|
||||
hjelpsemd hjelp
|
||||
# Adjectives
|
||||
billeg billeg
|
||||
billegare billeg
|
||||
billegast billeg
|
||||
smal smal
|
||||
smalare smal
|
||||
smalast smal
|
||||
farleg farleg
|
||||
farlegare farleg
|
||||
farlegast farleg
|
||||
#########################################
|
||||
# Words that should not be stemmed
|
||||
#
|
||||
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||
søner søn
|
||||
sønene søn
|
||||
brør brør
|
||||
brørne brørn
|
||||
# Irregular feminine nouns, not handled
|
||||
dotter dott
|
||||
døtrer døtr
|
||||
døtrene døtr
|
||||
klo klo
|
||||
klørne klørn
|
||||
mor mor
|
||||
mødrer mødr
|
||||
mødrene mødr
|
||||
# Irregular neutral nouns, not handled
|
||||
vedunder vedund
|
||||
# Other words that should not be touched
|
||||
abc abc
|
||||
123 123
|
||||
Jens Jens
|
||||
# Irregular adjectives that should not be stemmed
|
||||
gammal gammal
|
||||
eldre eldr
|
||||
eldst eldst
|
||||
# Verbs, should not be stemmed
|
||||
syngje syngj
|
||||
syng syng
|
||||
song song
|
||||
sunge sung
|
|
@ -0,0 +1,76 @@
|
|||
#
|
||||
# Tests for Norwegian minimal stemmer using Nynorsk as variant
|
||||
# It only tries to stem nouns, i.e. it is deliberately not very aggressive
|
||||
#
|
||||
# Nouns masculine
|
||||
gut gut
|
||||
guten gut
|
||||
gutar gut
|
||||
gutane gut
|
||||
gutens gut
|
||||
gutanes gut
|
||||
søknad søknad
|
||||
søknaden søknad
|
||||
søknadar søknad
|
||||
søknadane søknad
|
||||
søknadens søknad
|
||||
søknadanes søknad
|
||||
# Nouns feminine
|
||||
jente jent
|
||||
jenta jent
|
||||
jenter jent
|
||||
jentene jent
|
||||
jentas jent
|
||||
# Nouns neutral
|
||||
dyr dyr
|
||||
dyret dyr
|
||||
dyra dyr
|
||||
dyras dyr
|
||||
prospekt prospekt
|
||||
prospektet prospekt
|
||||
prospekta prospekt
|
||||
prospektas prospekt
|
||||
innhald innhald
|
||||
innhaldet innhald
|
||||
innhalda innhald
|
||||
#########################################
|
||||
# Words that should not be stemmed
|
||||
#
|
||||
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||
søner søn
|
||||
sønene søn
|
||||
brør brør
|
||||
brørne brørn
|
||||
# Irregular feminine nouns, not handled
|
||||
dotter dott
|
||||
døtrer døtr
|
||||
døtrene døtr
|
||||
klo klo
|
||||
klørne klørn
|
||||
mor mor
|
||||
mødrer mødr
|
||||
mødrene mødr
|
||||
# Irregular neutral nouns, not handled
|
||||
vedunder vedund
|
||||
# Other words that should not be touched
|
||||
abc abc
|
||||
123 123
|
||||
Jens Jens
|
||||
# Adjective, should not be stemmed
|
||||
farleg farleg
|
||||
farlegare farlegar
|
||||
farlegast farlegast
|
||||
stor stor
|
||||
større størr
|
||||
størst størst
|
||||
gammal gammal
|
||||
eldre eldr
|
||||
eldst eldst
|
||||
# General endings, should not be stemmed
|
||||
sanning sanning
|
||||
sanninga sanning
|
||||
# Verbs, should not be stemmed
|
||||
syngje syngj
|
||||
syng syng
|
||||
song song
|
||||
sunge sung
|
Loading…
Reference in New Issue