SOLR-4565: Extend Norwegian stemmers to handle nynorsk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1497396 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-06-27 15:28:45 +00:00
parent f9a9649f71
commit 3003cfe668
13 changed files with 420 additions and 35 deletions

View File

@ -257,6 +257,9 @@ New Features
* LUCENE-5079: IndexWriter.hasUncommittedChanges() returns true if there are
changes that have not been committed. (yonik, Mike McCandless, Uwe Schindler)
* SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter
to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
API Changes
* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes

View File

@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* </p>
*/
public final class NorwegianLightStemFilter extends TokenFilter {
private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
private final NorwegianLightStemmer stemmer;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
/**
* Calls {@link #NorwegianLightStemFilter(TokenStream, int)
* NorwegianLightStemFilter(input, BOKMAAL)}
*/
public NorwegianLightStemFilter(TokenStream input) {
this(input, NorwegianLightStemmer.BOKMAAL);
}
/**
* Creates a new NorwegianLightStemFilter
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
* {@link NorwegianLightStemmer#NYNORSK}, or both.
*/
public NorwegianLightStemFilter(TokenStream input, int flags) {
super(input);
stemmer = new NorwegianLightStemmer(flags);
}
@Override

View File

@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
/**
* Factory for {@link NorwegianLightStemFilter}.
* <pre class="prettyprint">
@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.NorwegianLightStemFilterFactory"/&gt;
* &lt;filter class="solr.NorwegianLightStemFilterFactory" variant="nb"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
private final int flags;
/** Creates a new NorwegianLightStemFilterFactory */
public NorwegianLightStemFilterFactory(Map<String,String> args) {
super(args);
String variant = get(args, "variant");
if (variant == null || "nb".equals(variant)) {
flags = BOKMAAL;
} else if ("nn".equals(variant)) {
flags = NYNORSK;
} else if ("no".equals(variant)) {
flags = BOKMAAL | NYNORSK;
} else {
throw new IllegalArgumentException("invalid variant: " + variant);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -46,6 +61,6 @@ public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
@Override
public TokenStream create(TokenStream input) {
return new NorwegianLightStemFilter(input);
return new NorwegianLightStemFilter(input, flags);
}
}

View File

@ -62,6 +62,25 @@ import static org.apache.lucene.analysis.util.StemmerUtil.*;
* corpus to validate against whereas the Norwegian one is hand crafted.
*/
public class NorwegianLightStemmer {
/** Constant to remove Bokmål-specific endings */
public static final int BOKMAAL = 1;
/** Constant to remove Nynorsk-specific endings */
public static final int NYNORSK = 2;
final boolean useBokmaal;
final boolean useNynorsk;
/**
* Creates a new NorwegianLightStemmer
* @param flags set to {@link #BOKMAAL}, {@link #NYNORSK}, or both.
*/
public NorwegianLightStemmer(int flags) {
if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
throw new IllegalArgumentException("invalid flags");
}
useBokmaal = (flags & BOKMAAL) != 0;
useNynorsk = (flags & NYNORSK) != 0;
}
public int stem(char s[], int len) {
// Remove posessive -s (bilens -> bilen) and continue checking
@ -70,39 +89,76 @@ public class NorwegianLightStemmer {
// Remove common endings, single-pass
if (len > 7 &&
(endsWith(s, len, "heter") || // general ending (hemmelig-heter -> hemmelig)
endsWith(s, len, "heten"))) // general ending (hemmelig-heten -> hemmelig)
((endsWith(s, len, "heter") &&
useBokmaal) || // general ending (hemmelig-heter -> hemmelig)
(endsWith(s, len, "heten") &&
useBokmaal) || // general ending (hemmelig-heten -> hemmelig)
(endsWith(s, len, "heita") &&
useNynorsk))) // general ending (hemmeleg-heita -> hemmeleg)
return len - 5;
// Remove Nynorsk common endings, single-pass
if (len > 8 && useNynorsk &&
(endsWith(s, len, "heiter") || // general ending (hemmeleg-heiter -> hemmeleg)
endsWith(s, len, "leiken") || // general ending (trygg-leiken -> trygg)
endsWith(s, len, "leikar"))) // general ending (trygg-leikar -> trygg)
return len - 6;
if (len > 5 &&
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
endsWith(s, len, "het"))) // general ending (hemmelig-het -> hemmelig)
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
(endsWith(s, len, "het") &&
useBokmaal))) // general ending (hemmelig-het -> hemmelig)
return len - 3;
if (len > 6 && useNynorsk &&
(endsWith(s, len, "heit") || // general ending (hemmeleg-heit -> hemmeleg)
endsWith(s, len, "semd") || // general ending (verk-semd -> verk)
endsWith(s, len, "leik"))) // general ending (trygg-leik -> trygg)
return len - 4;
if (len > 7 &&
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
return len - 5;
if (len > 6 &&
(endsWith(s, len, "ende") || // (sov-ende -> sov)
((endsWith(s, len, "ende") &&
useBokmaal) || // (sov-ende -> sov)
(endsWith(s, len, "ande") &&
useNynorsk) || // (sov-ande -> sov)
endsWith(s, len, "else") || // general ending (føl-else -> føl)
endsWith(s, len, "este") || // adj (fin-este -> fin)
endsWith(s, len, "eren"))) // masc
(endsWith(s, len, "este") &&
useBokmaal) || // adj (fin-este -> fin)
(endsWith(s, len, "aste") &&
useNynorsk) || // adj (fin-aste -> fin)
(endsWith(s, len, "eren") &&
useBokmaal) || // masc
(endsWith(s, len, "aren") &&
useNynorsk))) // masc
return len - 4;
if (len > 5 &&
(endsWith(s, len, "ere") || // adj (fin-ere -> fin)
endsWith(s, len, "est") || // adj (fin-est -> fin)
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
))
((endsWith(s, len, "ere") &&
useBokmaal) || // adj (fin-ere -> fin)
(endsWith(s, len, "are") &&
useNynorsk) || // adj (fin-are -> fin)
(endsWith(s, len, "est") &&
useBokmaal) || // adj (fin-est -> fin)
(endsWith(s, len, "ast") &&
useNynorsk) || // adj (fin-ast -> fin)
endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
(endsWith(s, len, "ane") &&
useNynorsk))) // masc pl definite (gut-ane)
return len - 3;
if (len > 4 &&
(endsWith(s, len, "er") || // masc/fem indefinite
endsWith(s, len, "en") || // masc/fem definite
endsWith(s, len, "et") || // neutr definite
endsWith(s, len, "st") || // adj (billig-st -> billig)
(endsWith(s, len, "ar") &&
useNynorsk) || // masc pl indefinite
(endsWith(s, len, "st") &&
useBokmaal) || // adj (billig-st -> billig)
endsWith(s, len, "te")))
return len - 2;

View File

@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
* </p>
*/
public final class NorwegianMinimalStemFilter extends TokenFilter {
private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
private final NorwegianMinimalStemmer stemmer;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
/**
* Calls {@link #NorwegianMinimalStemFilter(TokenStream, int)
* NorwegianMinimalStemFilter(input, BOKMAAL)}
*/
public NorwegianMinimalStemFilter(TokenStream input) {
this(input, NorwegianLightStemmer.BOKMAAL);
}
/**
* Creates a new NorwegianLightStemFilter
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
* {@link NorwegianLightStemmer#NYNORSK}, or both.
*/
public NorwegianMinimalStemFilter(TokenStream input, int flags) {
super(input);
this.stemmer = new NorwegianMinimalStemmer(flags);
}
@Override

View File

@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
/**
* Factory for {@link NorwegianMinimalStemFilter}.
* <pre class="prettyprint">
@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.NorwegianMinimalStemFilterFactory"/&gt;
* &lt;filter class="solr.NorwegianMinimalStemFilterFactory" variant="nb"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
private final int flags;
/** Creates a new NorwegianMinimalStemFilterFactory */
public NorwegianMinimalStemFilterFactory(Map<String,String> args) {
super(args);
String variant = get(args, "variant");
if (variant == null || "nb".equals(variant)) {
flags = BOKMAAL;
} else if ("nn".equals(variant)) {
flags = NYNORSK;
} else if ("no".equals(variant)) {
flags = BOKMAAL | NYNORSK;
} else {
throw new IllegalArgumentException("invalid variant: " + variant);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -46,6 +61,6 @@ public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory {
@Override
public TokenStream create(TokenStream input) {
return new NorwegianMinimalStemFilter(input);
return new NorwegianMinimalStemFilter(input, flags);
}
}

View File

@ -53,13 +53,30 @@ package org.apache.lucene.analysis.no;
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
/**
* Minimal Stemmer for Norwegian bokmål (no-nb)
* Minimal Stemmer for Norwegian Bokmål (no-nb) and Nynorsk (no-nn)
* <p>
* Stems known plural forms for Norwegian nouns only, together with genitiv -s
*/
public class NorwegianMinimalStemmer {
final boolean useBokmaal;
final boolean useNynorsk;
/**
* Creates a new NorwegianMinimalStemmer
* @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
* {@link NorwegianLightStemmer#NYNORSK}, or both.
*/
public NorwegianMinimalStemmer(int flags) {
if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
throw new IllegalArgumentException("invalid flags");
}
useBokmaal = (flags & BOKMAAL) != 0;
useNynorsk = (flags & NYNORSK) != 0;
}
public int stem(char s[], int len) {
// Remove genitiv s
@ -67,15 +84,19 @@ public class NorwegianMinimalStemmer {
len--;
if (len > 5 &&
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
)
(endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
(endsWith(s, len, "ane") &&
useNynorsk // masc pl definite (gut-ane)
)))
return len - 3;
if (len > 4 &&
(endsWith(s, len, "er") || // masc/fem indefinite
endsWith(s, len, "en") || // masc/fem definite
endsWith(s, len, "et") // neutr definite
))
(endsWith(s, len, "er") || // masc/fem indefinite
endsWith(s, len, "en") || // masc/fem definite
endsWith(s, len, "et") || // neutr definite
(endsWith(s, len, "ar") &&
useNynorsk // masc pl indefinite
)))
return len - 2;
if (len > 3)

View File

@ -32,6 +32,9 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
/**
* Simple tests for {@link NorwegianLightStemFilter}
@ -42,7 +45,7 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source));
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source, BOKMAAL));
}
};
@ -51,6 +54,18 @@ public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
}
/** Test against a Nynorsk vocabulary file */
public void testNynorskVocabulary() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source, NYNORSK));
}
};
assertVocabulary(analyzer, new FileInputStream(getDataFile("nn_light.txt")));
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
Analyzer a = new Analyzer() {

View File

@ -35,6 +35,22 @@ public class TestNorwegianLightStemFilterFactory extends BaseTokenStreamFactoryT
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
}
/** Test stemming with variant set explicitly to Bokmål */
public void testBokmaalStemming() throws Exception {
Reader reader = new StringReader("epler eple");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("NorwegianLightStem", "variant", "nb").create(stream);
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
}
/** Test stemming with variant set explicitly to Nynorsk */
public void testNynorskStemming() throws Exception {
Reader reader = new StringReader("gutar gutane");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("NorwegianLightStem", "variant", "nn").create(stream);
assertTokenStreamContents(stream, new String[] { "gut", "gut" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {

View File

@ -32,6 +32,8 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
/**
* Simple tests for {@link NorwegianMinimalStemFilter}
@ -42,15 +44,27 @@ public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source, BOKMAAL));
}
};
/** Test against a vocabulary file */
/** Test against a Bokmål vocabulary file */
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
}
/** Test against a Nynorsk vocabulary file */
public void testNynorskVocabulary() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source, NYNORSK));
}
};
assertVocabulary(analyzer, new FileInputStream(getDataFile("nn_minimal.txt")));
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
Analyzer a = new Analyzer() {

View File

@ -35,6 +35,22 @@ public class TestNorwegianMinimalStemFilterFactory extends BaseTokenStreamFactor
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
}
/** Test stemming with variant set explicitly to Bokmål */
public void testBokmaalStemming() throws Exception {
Reader reader = new StringReader("eple eplet epler eplene eplets eplenes");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("NorwegianMinimalStem", "variant", "nb").create(stream);
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
}
/** Test stemming with variant set explicitly to Nynorsk */
public void testNynorskStemming() throws Exception {
Reader reader = new StringReader("gut guten gutar gutane gutens gutanes");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("NorwegianMinimalStem", "variant", "nn").create(stream);
assertTokenStreamContents(stream, new String[] { "gut", "gut", "gut", "gut", "gut", "gut" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {

View File

@ -0,0 +1,110 @@
#
# Tests for Norwegian Nynorsk light stemmer
# It should tackle nouns, adjectives, genitive and some general endings
#
# Nouns masculine
gut gut
guten gut
gutar gut
gutane gut
gutens gut
gutanes gut
søknad søknad
søknaden søknad
søknadar søknad
søknadane søknad
søknadens søknad
søknadanes søknad
# Nouns feminine
kjole kjol
kjola kjol
kjoler kjol
kjolene kjol
kjolas kjol
# Nouns neutral
dyr dyr
dyret dyr
dyra dyr
dyras dyr
prospekt prospekt
prospektet prospekt
prospekta prospekt
prospektas prospekt
innhald innhald
innhaldet innhald
innhalda innhald
# General endings
hemmeleg hemmeleg
hemmelegheit hemmeleg
hemmelegheita hemmeleg
hemmelegheiter hemmeleg
vanskeleg vanskeleg
vanskelegheit vanskeleg
vanskelegheita vanskeleg
vanskelegheiter vanskeleg
hevelse hev
heva hev
hevelsen hev
heve hev
ærleg ærleg
ærlegdom ærleg
ærlegdommen ærlegdomm
ærlegdommens ærlegdomm
alderdom alder
alderdommen alderdomm
alderdommens alderdomm
trygg trygg
tryggleik trygg
tryggleiken trygg
tryggleikens trygg
tryggleikar trygg
kjærleik kjær
kjærleiken kjær
kjærleikens kjær
kjærleikar kjær
verke verk
verksemd verk
hjelpe hjelp
hjelpsemd hjelp
# Adjectives
billeg billeg
billegare billeg
billegast billeg
smal smal
smalare smal
smalast smal
farleg farleg
farlegare farleg
farlegast farleg
#########################################
# Words that should not be stemmed
#
# Irregular masculine nouns (not supposed to be handled correctly)
søner søn
sønene søn
brør brør
brørne brørn
# Irregular feminine nouns, not handled
dotter dott
døtrer døtr
døtrene døtr
klo klo
klørne klørn
mor mor
mødrer mødr
mødrene mødr
# Irregular neutral nouns, not handled
vedunder vedund
# Other words that should not be touched
abc abc
123 123
Jens Jens
# Irregular adjectives that should not be stemmed
gammal gammal
eldre eldr
eldst eldst
# Verbs, should not be stemmed
syngje syngj
syng syng
song song
sunge sung

View File

@ -0,0 +1,76 @@
#
# Tests for Norwegian minimal stemmer using Nynorsk as variant
# It only tries to stem nouns, i.e. being very little aggressive
#
# Nouns masculine
gut gut
guten gut
gutar gut
gutane gut
gutens gut
gutanes gut
søknad søknad
søknaden søknad
søknadar søknad
søknadane søknad
søknadens søknad
søknadanes søknad
# Nouns feminine
jente jent
jenta jent
jenter jent
jentene jent
jentas jent
# Nouns neutral
dyr dyr
dyret dyr
dyra dyr
dyras dyr
prospekt prospekt
prospektet prospekt
prospekta prospekt
prospektas prospekt
innhald innhald
innhaldet innhald
innhalda innhald
#########################################
# Words that should not be stemmed
#
# Irregular masculine nouns (not supposed to be handled correctly)
søner søn
sønene søn
brør brør
brørne brørn
# Irregular feminine nouns, not handled
dotter dott
døtrer døtr
døtrene døtr
klo klo
klørne klørn
mor mor
mødrer mødr
mødrene mødr
# Irregular neutral nouns, not handled
vedunder vedund
# Other words that should not be touched
abc abc
123 123
Jens Jens
# Adjective, should not be stemmed
farleg farleg
farlegare farlegar
farlegast farlegast
stor stor
større størr
størst størst
gammal gammal
eldre eldr
eldst eldst
# General endings, should not be stemmed
sanning sanning
sanninga sanning
# Verbs, should not be stemmed
syngje syngj
syng syng
song song
sunge sung