mirror of https://github.com/apache/lucene.git
SOLR-4565: Extend Norwegian stemmers to handle nynorsk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1497396 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f9a9649f71
commit
3003cfe668
|
@ -257,6 +257,9 @@ New Features
|
||||||
* LUCENE-5079: IndexWriter.hasUncommittedChanges() returns true if there are
|
* LUCENE-5079: IndexWriter.hasUncommittedChanges() returns true if there are
|
||||||
changes that have not been committed. (yonik, Mike McCandless, Uwe Schindler)
|
changes that have not been committed. (yonik, Mike McCandless, Uwe Schindler)
|
||||||
|
|
||||||
|
* SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter
|
||||||
|
to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes
|
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes
|
||||||
|
|
|
@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public final class NorwegianLightStemFilter extends TokenFilter {
|
public final class NorwegianLightStemFilter extends TokenFilter {
|
||||||
private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
|
private final NorwegianLightStemmer stemmer;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #NorwegianLightStemFilter(TokenStream, int)
|
||||||
|
* NorwegianLightStemFilter(input, BOKMAAL)}
|
||||||
|
*/
|
||||||
public NorwegianLightStemFilter(TokenStream input) {
|
public NorwegianLightStemFilter(TokenStream input) {
|
||||||
|
this(input, NorwegianLightStemmer.BOKMAAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new NorwegianLightStemFilter
|
||||||
|
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
|
||||||
|
* {@link NorwegianLightStemmer#NYNORSK}, or both.
|
||||||
|
*/
|
||||||
|
public NorwegianLightStemFilter(TokenStream input, int flags) {
|
||||||
super(input);
|
super(input);
|
||||||
|
stemmer = new NorwegianLightStemmer(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
|
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
|
||||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory for {@link NorwegianLightStemFilter}.
|
* Factory for {@link NorwegianLightStemFilter}.
|
||||||
* <pre class="prettyprint">
|
* <pre class="prettyprint">
|
||||||
|
@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||||
* <filter class="solr.NorwegianLightStemFilterFactory"/>
|
* <filter class="solr.NorwegianLightStemFilterFactory" variant="nb"/>
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
|
public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
private final int flags;
|
||||||
|
|
||||||
/** Creates a new NorwegianLightStemFilterFactory */
|
/** Creates a new NorwegianLightStemFilterFactory */
|
||||||
public NorwegianLightStemFilterFactory(Map<String,String> args) {
|
public NorwegianLightStemFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
|
String variant = get(args, "variant");
|
||||||
|
if (variant == null || "nb".equals(variant)) {
|
||||||
|
flags = BOKMAAL;
|
||||||
|
} else if ("nn".equals(variant)) {
|
||||||
|
flags = NYNORSK;
|
||||||
|
} else if ("no".equals(variant)) {
|
||||||
|
flags = BOKMAAL | NYNORSK;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("invalid variant: " + variant);
|
||||||
|
}
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -46,6 +61,6 @@ public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
return new NorwegianLightStemFilter(input);
|
return new NorwegianLightStemFilter(input, flags);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -62,6 +62,25 @@ import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
* corpus to validate against whereas the Norwegian one is hand crafted.
|
* corpus to validate against whereas the Norwegian one is hand crafted.
|
||||||
*/
|
*/
|
||||||
public class NorwegianLightStemmer {
|
public class NorwegianLightStemmer {
|
||||||
|
/** Constant to remove Bokmål-specific endings */
|
||||||
|
public static final int BOKMAAL = 1;
|
||||||
|
/** Constant to remove Nynorsk-specific endings */
|
||||||
|
public static final int NYNORSK = 2;
|
||||||
|
|
||||||
|
final boolean useBokmaal;
|
||||||
|
final boolean useNynorsk;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new NorwegianLightStemmer
|
||||||
|
* @param flags set to {@link #BOKMAAL}, {@link #NYNORSK}, or both.
|
||||||
|
*/
|
||||||
|
public NorwegianLightStemmer(int flags) {
|
||||||
|
if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
|
||||||
|
throw new IllegalArgumentException("invalid flags");
|
||||||
|
}
|
||||||
|
useBokmaal = (flags & BOKMAAL) != 0;
|
||||||
|
useNynorsk = (flags & NYNORSK) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
public int stem(char s[], int len) {
|
public int stem(char s[], int len) {
|
||||||
// Remove possessive -s (bilens -> bilen) and continue checking
|
// Remove possessive -s (bilens -> bilen) and continue checking
|
||||||
|
@ -70,39 +89,76 @@ public class NorwegianLightStemmer {
|
||||||
|
|
||||||
// Remove common endings, single-pass
|
// Remove common endings, single-pass
|
||||||
if (len > 7 &&
|
if (len > 7 &&
|
||||||
(endsWith(s, len, "heter") || // general ending (hemmelig-heter -> hemmelig)
|
((endsWith(s, len, "heter") &&
|
||||||
endsWith(s, len, "heten"))) // general ending (hemmelig-heten -> hemmelig)
|
useBokmaal) || // general ending (hemmelig-heter -> hemmelig)
|
||||||
|
(endsWith(s, len, "heten") &&
|
||||||
|
useBokmaal) || // general ending (hemmelig-heten -> hemmelig)
|
||||||
|
(endsWith(s, len, "heita") &&
|
||||||
|
useNynorsk))) // general ending (hemmeleg-heita -> hemmeleg)
|
||||||
return len - 5;
|
return len - 5;
|
||||||
|
|
||||||
|
// Remove Nynorsk common endings, single-pass
|
||||||
|
if (len > 8 && useNynorsk &&
|
||||||
|
(endsWith(s, len, "heiter") || // general ending (hemmeleg-heiter -> hemmeleg)
|
||||||
|
endsWith(s, len, "leiken") || // general ending (trygg-leiken -> trygg)
|
||||||
|
endsWith(s, len, "leikar"))) // general ending (trygg-leikar -> trygg)
|
||||||
|
return len - 6;
|
||||||
|
|
||||||
if (len > 5 &&
|
if (len > 5 &&
|
||||||
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
|
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
|
||||||
endsWith(s, len, "het"))) // general ending (hemmelig-het -> hemmelig)
|
(endsWith(s, len, "het") &&
|
||||||
|
useBokmaal))) // general ending (hemmelig-het -> hemmelig)
|
||||||
return len - 3;
|
return len - 3;
|
||||||
|
|
||||||
|
if (len > 6 && useNynorsk &&
|
||||||
|
(endsWith(s, len, "heit") || // general ending (hemmeleg-heit -> hemmeleg)
|
||||||
|
endsWith(s, len, "semd") || // general ending (verk-semd -> verk)
|
||||||
|
endsWith(s, len, "leik"))) // general ending (trygg-leik -> trygg)
|
||||||
|
return len - 4;
|
||||||
|
|
||||||
if (len > 7 &&
|
if (len > 7 &&
|
||||||
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
|
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
|
||||||
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
|
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
|
||||||
return len - 5;
|
return len - 5;
|
||||||
|
|
||||||
if (len > 6 &&
|
if (len > 6 &&
|
||||||
(endsWith(s, len, "ende") || // (sov-ende -> sov)
|
((endsWith(s, len, "ende") &&
|
||||||
|
useBokmaal) || // (sov-ende -> sov)
|
||||||
|
(endsWith(s, len, "ande") &&
|
||||||
|
useNynorsk) || // (sov-ande -> sov)
|
||||||
endsWith(s, len, "else") || // general ending (føl-else -> føl)
|
endsWith(s, len, "else") || // general ending (føl-else -> føl)
|
||||||
endsWith(s, len, "este") || // adj (fin-este -> fin)
|
(endsWith(s, len, "este") &&
|
||||||
endsWith(s, len, "eren"))) // masc
|
useBokmaal) || // adj (fin-este -> fin)
|
||||||
|
(endsWith(s, len, "aste") &&
|
||||||
|
useNynorsk) || // adj (fin-aste -> fin)
|
||||||
|
(endsWith(s, len, "eren") &&
|
||||||
|
useBokmaal) || // masc
|
||||||
|
(endsWith(s, len, "aren") &&
|
||||||
|
useNynorsk))) // masc
|
||||||
return len - 4;
|
return len - 4;
|
||||||
|
|
||||||
if (len > 5 &&
|
if (len > 5 &&
|
||||||
(endsWith(s, len, "ere") || // adj (fin-ere -> fin)
|
((endsWith(s, len, "ere") &&
|
||||||
endsWith(s, len, "est") || // adj (fin-est -> fin)
|
useBokmaal) || // adj (fin-ere -> fin)
|
||||||
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
|
(endsWith(s, len, "are") &&
|
||||||
))
|
useNynorsk) || // adj (fin-are -> fin)
|
||||||
|
(endsWith(s, len, "est") &&
|
||||||
|
useBokmaal) || // adj (fin-est -> fin)
|
||||||
|
(endsWith(s, len, "ast") &&
|
||||||
|
useNynorsk) || // adj (fin-ast -> fin)
|
||||||
|
endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
|
||||||
|
(endsWith(s, len, "ane") &&
|
||||||
|
useNynorsk))) // masc pl definite (gut-ane)
|
||||||
return len - 3;
|
return len - 3;
|
||||||
|
|
||||||
if (len > 4 &&
|
if (len > 4 &&
|
||||||
(endsWith(s, len, "er") || // masc/fem indefinite
|
(endsWith(s, len, "er") || // masc/fem indefinite
|
||||||
endsWith(s, len, "en") || // masc/fem definite
|
endsWith(s, len, "en") || // masc/fem definite
|
||||||
endsWith(s, len, "et") || // neutr definite
|
endsWith(s, len, "et") || // neutr definite
|
||||||
endsWith(s, len, "st") || // adj (billig-st -> billig)
|
(endsWith(s, len, "ar") &&
|
||||||
|
useNynorsk) || // masc pl indefinite
|
||||||
|
(endsWith(s, len, "st") &&
|
||||||
|
useBokmaal) || // adj (billig-st -> billig)
|
||||||
endsWith(s, len, "te")))
|
endsWith(s, len, "te")))
|
||||||
return len - 2;
|
return len - 2;
|
||||||
|
|
||||||
|
|
|
@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public final class NorwegianMinimalStemFilter extends TokenFilter {
|
public final class NorwegianMinimalStemFilter extends TokenFilter {
|
||||||
private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
|
private final NorwegianMinimalStemmer stemmer;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #NorwegianMinimalStemFilter(TokenStream, int)
|
||||||
|
* NorwegianMinimalStemFilter(input, BOKMAAL)}
|
||||||
|
*/
|
||||||
public NorwegianMinimalStemFilter(TokenStream input) {
|
public NorwegianMinimalStemFilter(TokenStream input) {
|
||||||
|
this(input, NorwegianLightStemmer.BOKMAAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new NorwegianMinimalStemFilter
|
||||||
|
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
|
||||||
|
* {@link NorwegianLightStemmer#NYNORSK}, or both.
|
||||||
|
*/
|
||||||
|
public NorwegianMinimalStemFilter(TokenStream input, int flags) {
|
||||||
super(input);
|
super(input);
|
||||||
|
this.stemmer = new NorwegianMinimalStemmer(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
|
import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
|
||||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory for {@link NorwegianMinimalStemFilter}.
|
* Factory for {@link NorwegianMinimalStemFilter}.
|
||||||
* <pre class="prettyprint">
|
* <pre class="prettyprint">
|
||||||
|
@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||||
* <filter class="solr.NorwegianMinimalStemFilterFactory"/>
|
* <filter class="solr.NorwegianMinimalStemFilterFactory" variant="nb"/>
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
|
public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
private final int flags;
|
||||||
|
|
||||||
/** Creates a new NorwegianMinimalStemFilterFactory */
|
/** Creates a new NorwegianMinimalStemFilterFactory */
|
||||||
public NorwegianMinimalStemFilterFactory(Map<String,String> args) {
|
public NorwegianMinimalStemFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
|
String variant = get(args, "variant");
|
||||||
|
if (variant == null || "nb".equals(variant)) {
|
||||||
|
flags = BOKMAAL;
|
||||||
|
} else if ("nn".equals(variant)) {
|
||||||
|
flags = NYNORSK;
|
||||||
|
} else if ("no".equals(variant)) {
|
||||||
|
flags = BOKMAAL | NYNORSK;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("invalid variant: " + variant);
|
||||||
|
}
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -46,6 +61,6 @@ public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
return new NorwegianMinimalStemFilter(input);
|
return new NorwegianMinimalStemFilter(input, flags);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,13 +53,30 @@ package org.apache.lucene.analysis.no;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Minimal Stemmer for Norwegian bokmål (no-nb)
|
* Minimal Stemmer for Norwegian Bokmål (no-nb) and Nynorsk (no-nn)
|
||||||
* <p>
|
* <p>
|
||||||
* Stems known plural forms for Norwegian nouns only, together with genitive -s
|
* Stems known plural forms for Norwegian nouns only, together with genitive -s
|
||||||
*/
|
*/
|
||||||
public class NorwegianMinimalStemmer {
|
public class NorwegianMinimalStemmer {
|
||||||
|
final boolean useBokmaal;
|
||||||
|
final boolean useNynorsk;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new NorwegianMinimalStemmer
|
||||||
|
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
|
||||||
|
* {@link NorwegianLightStemmer#NYNORSK}, or both.
|
||||||
|
*/
|
||||||
|
public NorwegianMinimalStemmer(int flags) {
|
||||||
|
if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
|
||||||
|
throw new IllegalArgumentException("invalid flags");
|
||||||
|
}
|
||||||
|
useBokmaal = (flags & BOKMAAL) != 0;
|
||||||
|
useNynorsk = (flags & NYNORSK) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
public int stem(char s[], int len) {
|
public int stem(char s[], int len) {
|
||||||
// Remove genitive -s
|
// Remove genitive -s
|
||||||
|
@ -67,15 +84,19 @@ public class NorwegianMinimalStemmer {
|
||||||
len--;
|
len--;
|
||||||
|
|
||||||
if (len > 5 &&
|
if (len > 5 &&
|
||||||
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
|
(endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
|
||||||
)
|
(endsWith(s, len, "ane") &&
|
||||||
|
useNynorsk // masc pl definite (gut-ane)
|
||||||
|
)))
|
||||||
return len - 3;
|
return len - 3;
|
||||||
|
|
||||||
if (len > 4 &&
|
if (len > 4 &&
|
||||||
(endsWith(s, len, "er") || // masc/fem indefinite
|
(endsWith(s, len, "er") || // masc/fem indefinite
|
||||||
endsWith(s, len, "en") || // masc/fem definite
|
endsWith(s, len, "en") || // masc/fem definite
|
||||||
endsWith(s, len, "et") // neutr definite
|
endsWith(s, len, "et") || // neutr definite
|
||||||
))
|
(endsWith(s, len, "ar") &&
|
||||||
|
useNynorsk // masc pl indefinite
|
||||||
|
)))
|
||||||
return len - 2;
|
return len - 2;
|
||||||
|
|
||||||
if (len > 3)
|
if (len > 3)
|
||||||
|
|
|
@ -32,6 +32,9 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Simple tests for {@link NorwegianLightStemFilter}
|
* Simple tests for {@link NorwegianLightStemFilter}
|
||||||
|
@ -42,7 +45,7 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader reader) {
|
Reader reader) {
|
||||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source));
|
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source, BOKMAAL));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -51,6 +54,18 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
|
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test against a Nynorsk vocabulary file */
|
||||||
|
public void testNynorskVocabulary() throws IOException {
|
||||||
|
Analyzer analyzer = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source, NYNORSK));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assertVocabulary(analyzer, new FileInputStream(getDataFile("nn_light.txt")));
|
||||||
|
}
|
||||||
|
|
||||||
public void testKeyword() throws IOException {
|
public void testKeyword() throws IOException {
|
||||||
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
|
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
|
||||||
Analyzer a = new Analyzer() {
|
Analyzer a = new Analyzer() {
|
||||||
|
|
|
@ -35,6 +35,22 @@ public class TestNorwegianLightStemFilterFactory extends BaseTokenStreamFactoryT
|
||||||
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
|
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test stemming with variant set explicitly to Bokmål */
|
||||||
|
public void testBokmaalStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("epler eple");
|
||||||
|
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
stream = tokenFilterFactory("NorwegianLightStem", "variant", "nb").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test stemming with variant set explicitly to Nynorsk */
|
||||||
|
public void testNynorskStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("gutar gutane");
|
||||||
|
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
stream = tokenFilterFactory("NorwegianLightStem", "variant", "nn").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "gut", "gut" });
|
||||||
|
}
|
||||||
|
|
||||||
/** Test that bogus arguments result in exception */
|
/** Test that bogus arguments result in exception */
|
||||||
public void testBogusArguments() throws Exception {
|
public void testBogusArguments() throws Exception {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -32,6 +32,8 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
|
||||||
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
|
||||||
|
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Simple tests for {@link NorwegianMinimalStemFilter}
|
* Simple tests for {@link NorwegianMinimalStemFilter}
|
||||||
|
@ -42,15 +44,27 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader reader) {
|
Reader reader) {
|
||||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
|
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source, BOKMAAL));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Test against a vocabulary file */
|
/** Test against a Bokmål vocabulary file */
|
||||||
public void testVocabulary() throws IOException {
|
public void testVocabulary() throws IOException {
|
||||||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
|
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test against a Nynorsk vocabulary file */
|
||||||
|
public void testNynorskVocabulary() throws IOException {
|
||||||
|
Analyzer analyzer = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source, NYNORSK));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assertVocabulary(analyzer, new FileInputStream(getDataFile("nn_minimal.txt")));
|
||||||
|
}
|
||||||
|
|
||||||
public void testKeyword() throws IOException {
|
public void testKeyword() throws IOException {
|
||||||
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
|
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
|
||||||
Analyzer a = new Analyzer() {
|
Analyzer a = new Analyzer() {
|
||||||
|
|
|
@ -35,6 +35,22 @@ public class TestNorwegianMinimalStemFilterFactory extends BaseTokenStreamFactor
|
||||||
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
|
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test stemming with variant set explicitly to Bokmål */
|
||||||
|
public void testBokmaalStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("eple eplet epler eplene eplets eplenes");
|
||||||
|
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
stream = tokenFilterFactory("NorwegianMinimalStem", "variant", "nb").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test stemming with variant set explicitly to Nynorsk */
|
||||||
|
public void testNynorskStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("gut guten gutar gutane gutens gutanes");
|
||||||
|
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
stream = tokenFilterFactory("NorwegianMinimalStem", "variant", "nn").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "gut", "gut", "gut", "gut", "gut", "gut" });
|
||||||
|
}
|
||||||
|
|
||||||
/** Test that bogus arguments result in exception */
|
/** Test that bogus arguments result in exception */
|
||||||
public void testBogusArguments() throws Exception {
|
public void testBogusArguments() throws Exception {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -0,0 +1,110 @@
|
||||||
|
#
|
||||||
|
# Tests for Norwegian Nynorsk light stemmer
|
||||||
|
# It should tackle nouns, adjectives, genitive and some general endings
|
||||||
|
#
|
||||||
|
# Nouns masculine
|
||||||
|
gut gut
|
||||||
|
guten gut
|
||||||
|
gutar gut
|
||||||
|
gutane gut
|
||||||
|
gutens gut
|
||||||
|
gutanes gut
|
||||||
|
søknad søknad
|
||||||
|
søknaden søknad
|
||||||
|
søknadar søknad
|
||||||
|
søknadane søknad
|
||||||
|
søknadens søknad
|
||||||
|
søknadanes søknad
|
||||||
|
# Nouns feminine
|
||||||
|
kjole kjol
|
||||||
|
kjola kjol
|
||||||
|
kjoler kjol
|
||||||
|
kjolene kjol
|
||||||
|
kjolas kjol
|
||||||
|
# Nouns neutral
|
||||||
|
dyr dyr
|
||||||
|
dyret dyr
|
||||||
|
dyra dyr
|
||||||
|
dyras dyr
|
||||||
|
prospekt prospekt
|
||||||
|
prospektet prospekt
|
||||||
|
prospekta prospekt
|
||||||
|
prospektas prospekt
|
||||||
|
innhald innhald
|
||||||
|
innhaldet innhald
|
||||||
|
innhalda innhald
|
||||||
|
# General endings
|
||||||
|
hemmeleg hemmeleg
|
||||||
|
hemmelegheit hemmeleg
|
||||||
|
hemmelegheita hemmeleg
|
||||||
|
hemmelegheiter hemmeleg
|
||||||
|
vanskeleg vanskeleg
|
||||||
|
vanskelegheit vanskeleg
|
||||||
|
vanskelegheita vanskeleg
|
||||||
|
vanskelegheiter vanskeleg
|
||||||
|
hevelse hev
|
||||||
|
heva hev
|
||||||
|
hevelsen hev
|
||||||
|
heve hev
|
||||||
|
ærleg ærleg
|
||||||
|
ærlegdom ærleg
|
||||||
|
ærlegdommen ærlegdomm
|
||||||
|
ærlegdommens ærlegdomm
|
||||||
|
alderdom alder
|
||||||
|
alderdommen alderdomm
|
||||||
|
alderdommens alderdomm
|
||||||
|
trygg trygg
|
||||||
|
tryggleik trygg
|
||||||
|
tryggleiken trygg
|
||||||
|
tryggleikens trygg
|
||||||
|
tryggleikar trygg
|
||||||
|
kjærleik kjær
|
||||||
|
kjærleiken kjær
|
||||||
|
kjærleikens kjær
|
||||||
|
kjærleikar kjær
|
||||||
|
verke verk
|
||||||
|
verksemd verk
|
||||||
|
hjelpe hjelp
|
||||||
|
hjelpsemd hjelp
|
||||||
|
# Adjectives
|
||||||
|
billeg billeg
|
||||||
|
billegare billeg
|
||||||
|
billegast billeg
|
||||||
|
smal smal
|
||||||
|
smalare smal
|
||||||
|
smalast smal
|
||||||
|
farleg farleg
|
||||||
|
farlegare farleg
|
||||||
|
farlegast farleg
|
||||||
|
#########################################
|
||||||
|
# Words that should not be stemmed
|
||||||
|
#
|
||||||
|
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||||
|
søner søn
|
||||||
|
sønene søn
|
||||||
|
brør brør
|
||||||
|
brørne brørn
|
||||||
|
# Irregular feminine nouns, not handled
|
||||||
|
dotter dott
|
||||||
|
døtrer døtr
|
||||||
|
døtrene døtr
|
||||||
|
klo klo
|
||||||
|
klørne klørn
|
||||||
|
mor mor
|
||||||
|
mødrer mødr
|
||||||
|
mødrene mødr
|
||||||
|
# Irregular neutral nouns, not handled
|
||||||
|
vedunder vedund
|
||||||
|
# Other words that should not be touched
|
||||||
|
abc abc
|
||||||
|
123 123
|
||||||
|
Jens Jens
|
||||||
|
# Irregular adjectives that should not be stemmed
|
||||||
|
gammal gammal
|
||||||
|
eldre eldr
|
||||||
|
eldst eldst
|
||||||
|
# Verbs, should not be stemmed
|
||||||
|
syngje syngj
|
||||||
|
syng syng
|
||||||
|
song song
|
||||||
|
sunge sung
|
|
@ -0,0 +1,76 @@
|
||||||
|
#
|
||||||
|
# Tests for Norwegian minimal stemmer using Nynorsk as variant
|
||||||
|
# It only tries to stem nouns, i.e. it is intentionally not very aggressive
|
||||||
|
#
|
||||||
|
# Nouns masculine
|
||||||
|
gut gut
|
||||||
|
guten gut
|
||||||
|
gutar gut
|
||||||
|
gutane gut
|
||||||
|
gutens gut
|
||||||
|
gutanes gut
|
||||||
|
søknad søknad
|
||||||
|
søknaden søknad
|
||||||
|
søknadar søknad
|
||||||
|
søknadane søknad
|
||||||
|
søknadens søknad
|
||||||
|
søknadanes søknad
|
||||||
|
# Nouns feminine
|
||||||
|
jente jent
|
||||||
|
jenta jent
|
||||||
|
jenter jent
|
||||||
|
jentene jent
|
||||||
|
jentas jent
|
||||||
|
# Nouns neutral
|
||||||
|
dyr dyr
|
||||||
|
dyret dyr
|
||||||
|
dyra dyr
|
||||||
|
dyras dyr
|
||||||
|
prospekt prospekt
|
||||||
|
prospektet prospekt
|
||||||
|
prospekta prospekt
|
||||||
|
prospektas prospekt
|
||||||
|
innhald innhald
|
||||||
|
innhaldet innhald
|
||||||
|
innhalda innhald
|
||||||
|
#########################################
|
||||||
|
# Words that should not be stemmed
|
||||||
|
#
|
||||||
|
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||||
|
søner søn
|
||||||
|
sønene søn
|
||||||
|
brør brør
|
||||||
|
brørne brørn
|
||||||
|
# Irregular feminine nouns, not handled
|
||||||
|
dotter dott
|
||||||
|
døtrer døtr
|
||||||
|
døtrene døtr
|
||||||
|
klo klo
|
||||||
|
klørne klørn
|
||||||
|
mor mor
|
||||||
|
mødrer mødr
|
||||||
|
mødrene mødr
|
||||||
|
# Irregular neutral nouns, not handled
|
||||||
|
vedunder vedund
|
||||||
|
# Other words that should not be touched
|
||||||
|
abc abc
|
||||||
|
123 123
|
||||||
|
Jens Jens
|
||||||
|
# Adjective, should not be stemmed
|
||||||
|
farleg farleg
|
||||||
|
farlegare farlegar
|
||||||
|
farlegast farlegast
|
||||||
|
stor stor
|
||||||
|
større størr
|
||||||
|
størst størst
|
||||||
|
gammal gammal
|
||||||
|
eldre eldr
|
||||||
|
eldst eldst
|
||||||
|
# General endings, should not be stemmed
|
||||||
|
sanning sanning
|
||||||
|
sanninga sanning
|
||||||
|
# Verbs, should not be stemmed
|
||||||
|
syngje syngj
|
||||||
|
syng syng
|
||||||
|
song song
|
||||||
|
sunge sung
|
Loading…
Reference in New Issue