mirror of
https://github.com/apache/lucene.git
synced 2025-02-20 17:07:09 +00:00
SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1302833 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7230d78fe9
commit
54d48eb98b
@ -106,3 +106,6 @@ New Features
|
||||
All analyzers in contrib/analyzers and contrib/icu were moved to the
|
||||
analysis module. The 'smartcn' and 'stempel' components now depend on 'common'.
|
||||
(Chris Male, Robert Muir)
|
||||
|
||||
* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)
|
||||
|
@ -0,0 +1,58 @@
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link NorwegianLightStemmer} to stem Norwegian
|
||||
* words.
|
||||
* <p>
|
||||
* To prevent terms from being stemmed use an instance of
|
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
*/
|
||||
public final class NorwegianLightStemFilter extends TokenFilter {
|
||||
private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public NorwegianLightStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,119 @@
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This algorithm is updated based on code located at:
|
||||
* http://members.unine.ch/jacques.savoy/clef/
|
||||
*
|
||||
* Full copyright for that code follows:
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2005, Jacques Savoy
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer. Redistributions in binary
|
||||
* form must reproduce the above copyright notice, this list of conditions and
|
||||
* the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution. Neither the name of the author nor the names
|
||||
* of its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||
|
||||
/**
|
||||
* Light Stemmer for Norwegian.
|
||||
* <p>
|
||||
* Parts of this stemmer are adapted from SwedishLightStemFilter, except
|
||||
* that while the Swedish one has a pre-defined rule set and a corresponding
|
||||
* corpus to validate against, the Norwegian one is hand crafted.
|
||||
*/
|
||||
public class NorwegianLightStemmer {
|
||||
|
||||
public int stem(char s[], int len) {
|
||||
// Remove posessive -s (bilens -> bilen) and continue checking
|
||||
if (len > 4 && s[len-1] == 's')
|
||||
len--;
|
||||
|
||||
// Remove common endings, single-pass
|
||||
if (len > 7 &&
|
||||
(endsWith(s, len, "heter") || // general ending (hemmelig-heter -> hemmelig)
|
||||
endsWith(s, len, "heten"))) // general ending (hemmelig-heten -> hemmelig)
|
||||
return len - 5;
|
||||
|
||||
if (len > 5 &&
|
||||
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
|
||||
endsWith(s, len, "het"))) // general ending (hemmelig-het -> hemmelig)
|
||||
return len - 3;
|
||||
|
||||
if (len > 7 &&
|
||||
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
|
||||
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
|
||||
return len - 5;
|
||||
|
||||
if (len > 6 &&
|
||||
(endsWith(s, len, "ende") || // (sov-ende -> sov)
|
||||
endsWith(s, len, "else") || // general ending (føl-else -> føl)
|
||||
endsWith(s, len, "este") || // adj (fin-este -> fin)
|
||||
endsWith(s, len, "eren"))) // masc
|
||||
return len - 4;
|
||||
|
||||
if (len > 5 &&
|
||||
(endsWith(s, len, "ere") || // adj (fin-ere -> fin)
|
||||
endsWith(s, len, "est") || // adj (fin-est -> fin)
|
||||
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
|
||||
))
|
||||
return len - 3;
|
||||
|
||||
if (len > 4 &&
|
||||
(endsWith(s, len, "er") || // masc/fem indefinite
|
||||
endsWith(s, len, "en") || // masc/fem definite
|
||||
endsWith(s, len, "et") || // neutr definite
|
||||
endsWith(s, len, "st") || // adj (billig-st -> billig)
|
||||
endsWith(s, len, "te")))
|
||||
return len - 2;
|
||||
|
||||
if (len > 3)
|
||||
switch(s[len-1]) {
|
||||
case 'a': // fem definite
|
||||
case 'e': // to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
|
||||
case 'n':
|
||||
return len - 1;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
}
|
@ -0,0 +1,58 @@
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link NorwegianMinimalStemmer} to stem Norwegian
|
||||
* words.
|
||||
* <p>
|
||||
* To prevent terms from being stemmed use an instance of
|
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
*/
|
||||
public final class NorwegianMinimalStemFilter extends TokenFilter {
|
||||
private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public NorwegianMinimalStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,90 @@
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This algorithm is updated based on code located at:
|
||||
* http://members.unine.ch/jacques.savoy/clef/
|
||||
*
|
||||
* Full copyright for that code follows:
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2005, Jacques Savoy
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer. Redistributions in binary
|
||||
* form must reproduce the above copyright notice, this list of conditions and
|
||||
* the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution. Neither the name of the author nor the names
|
||||
* of its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||
|
||||
/**
|
||||
* Minimal Stemmer for Norwegian bokmål (no-nb)
|
||||
* <p>
|
||||
* Stems known plural forms for Norwegian nouns only, together with genitive -s
|
||||
*/
|
||||
public class NorwegianMinimalStemmer {
|
||||
|
||||
public int stem(char s[], int len) {
|
||||
// Remove genitiv s
|
||||
if (len > 4 && s[len-1] == 's')
|
||||
len--;
|
||||
|
||||
if (len > 5 &&
|
||||
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
|
||||
)
|
||||
return len - 3;
|
||||
|
||||
if (len > 4 &&
|
||||
(endsWith(s, len, "er") || // masc/fem indefinite
|
||||
endsWith(s, len, "en") || // masc/fem definite
|
||||
endsWith(s, len, "et") // neutr definite
|
||||
))
|
||||
return len - 2;
|
||||
|
||||
if (len > 3)
|
||||
switch(s[len-1]) {
|
||||
case 'a': // fem definite
|
||||
case 'e': // to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
|
||||
return len - 1;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
||||
|
||||
/**
|
||||
* Simple tests for {@link NorwegianLightStemFilter}
|
||||
*/
|
||||
public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new NorwegianLightStemFilter(source));
|
||||
}
|
||||
};
|
||||
|
||||
/** Test against a vocabulary file */
|
||||
public void testVocabulary() throws IOException {
|
||||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
package org.apache.lucene.analysis.no;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import static org.apache.lucene.analysis.VocabularyAssert.*;
|
||||
|
||||
/**
|
||||
* Simple tests for {@link NorwegianMinimalStemFilter}
|
||||
*/
|
||||
public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
|
||||
}
|
||||
};
|
||||
|
||||
/** Test against a vocabulary file */
|
||||
public void testVocabulary() throws IOException {
|
||||
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
@ -0,0 +1,144 @@
|
||||
#
|
||||
# Tests for norwegian Bokmål light stemmer
|
||||
# It should tackle nouns, adjectives, genitive and some general endings
|
||||
#
|
||||
# Nouns masculine
|
||||
bil bil
|
||||
bilen bil
|
||||
biler bil
|
||||
bilene bil
|
||||
bilens bil
|
||||
bilenes bil
|
||||
sekretæren sekretær
|
||||
sekretær sekretær
|
||||
sekretærene sekretær
|
||||
kaker kak
|
||||
kaken kak
|
||||
kakene kak
|
||||
kakenes kak
|
||||
bibliotekar bibliotekar
|
||||
bibliotekarer bibliotekar
|
||||
bibliotekaren bibliotekar
|
||||
bibliotekarens bibliotekar
|
||||
bibliotekarene bibliotekar
|
||||
bibliotekarenes bibliotekar
|
||||
# Nouns feminine
|
||||
veske vesk
|
||||
veska vesk
|
||||
vesken vesk
|
||||
veskene vesk
|
||||
veskas vesk
|
||||
# Nouns neutral
|
||||
huset hus
|
||||
husene hus
|
||||
husets hus
|
||||
hus hus
|
||||
huset hus
|
||||
husene hus
|
||||
husenes hus
|
||||
flagg flagg
|
||||
flagga flagg
|
||||
flaggene flagg
|
||||
flaggets flagg
|
||||
flaggenes flagg
|
||||
politi politi
|
||||
politiet politi
|
||||
politiets politi
|
||||
politienes politi
|
||||
# General endings
|
||||
god god
|
||||
godhet god
|
||||
godheten god
|
||||
forelskelse forelsk
|
||||
forelsket forelsk
|
||||
forelskelsen forelsk
|
||||
forelske forelsk
|
||||
kristen krist
|
||||
kristendom kristen
|
||||
kristendommen kristendomm
|
||||
kristendommens kristendomm
|
||||
fattig fattig
|
||||
fattigdom fattig
|
||||
fattigdommen fattigdomm
|
||||
fattigdommens fattigdomm
|
||||
# -het (see http://no.wiktionary.org/wiki/Kategori:Ord_som_ender_p%C3%A5_%C2%AB-het%C2%BB)
|
||||
hemmelig hemmelig
|
||||
hemmelighet hemmelig
|
||||
hemmelighets hemmelig
|
||||
hemmeligheter hemmelig
|
||||
hemmeligheten hemmelig
|
||||
hemmelighetens hemmelig
|
||||
kjærlig kjærlig
|
||||
kjærlighet kjærlig
|
||||
kjærligheter kjærlig
|
||||
kjærligheten kjærlig
|
||||
forlegen forleg
|
||||
forlegenhet forlegen
|
||||
forlegenheten forlegen
|
||||
forlegenhetens forlegen
|
||||
tvetydig tvetydig
|
||||
tvetydighet tvetydig
|
||||
tvetydigheter tvetydig
|
||||
tvetydigheten tvetydig
|
||||
tvetydighetens tvetydig
|
||||
virkelig virkelig
|
||||
virkelighet virkelig
|
||||
virkeligheten virkelig
|
||||
virkelighetens virkelig
|
||||
# Adjectives
|
||||
billig billig
|
||||
billigere billig
|
||||
billigst billig
|
||||
billige billig
|
||||
frisk frisk
|
||||
friskere frisk
|
||||
friskest frisk
|
||||
syk syk
|
||||
sykere syk
|
||||
sykest syk
|
||||
#########################################
|
||||
# Words that should not be stemmed
|
||||
#
|
||||
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||
# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
|
||||
vaffel vaffel
|
||||
vafler vafl
|
||||
vaflene vafl
|
||||
tittel tittel
|
||||
titler titl
|
||||
titlene titl
|
||||
kam kam
|
||||
kammer kamm
|
||||
kammene kamm
|
||||
kamrene kamr
|
||||
# Irregular feminine nouns, not handled
|
||||
ku ku
|
||||
ku ku
|
||||
kyr kyr
|
||||
kuer kuer
|
||||
kyrne kyrn
|
||||
kuene kuen
|
||||
datter datt
|
||||
døtre døtr
|
||||
døtrene døtr
|
||||
# Other words that should not be touched
|
||||
abc abc
|
||||
123 123
|
||||
Jens Jens
|
||||
# Adjectives
|
||||
billig billig
|
||||
billigere billig
|
||||
billigst billig
|
||||
billige billig
|
||||
frisk frisk
|
||||
friskere frisk
|
||||
friskest frisk
|
||||
# Irregular adjectives that should not be stemmed
|
||||
god god
|
||||
bedre bedr
|
||||
best best
|
||||
# Verbs, should not be stemmed
|
||||
føle føl
|
||||
følte føl
|
||||
følt følt
|
||||
|
@ -0,0 +1,99 @@
|
||||
#
|
||||
# Tests for norwegian Bokmål minimal stemmer
|
||||
# It only tries to stem nouns, i.e. it is not very aggressive
|
||||
#
|
||||
# Nouns masculine
|
||||
bil bil
|
||||
bilen bil
|
||||
biler bil
|
||||
bilene bil
|
||||
bilens bil
|
||||
bilenes bil
|
||||
sekretæren sekretær
|
||||
sekretær sekretær
|
||||
sekretærene sekretær
|
||||
kaker kak
|
||||
kaken kak
|
||||
kakene kak
|
||||
kakenes kak
|
||||
bibliotekar bibliotekar
|
||||
bibliotekarer bibliotekar
|
||||
bibliotekaren bibliotekar
|
||||
bibliotekarens bibliotekar
|
||||
bibliotekarene bibliotekar
|
||||
bibliotekarenes bibliotekar
|
||||
# Nouns feminine
|
||||
veske vesk
|
||||
veska vesk
|
||||
vesken vesk
|
||||
veskene vesk
|
||||
veskas vesk
|
||||
# Nouns neutral
|
||||
huset hus
|
||||
husene hus
|
||||
husets hus
|
||||
hus hus
|
||||
huset hus
|
||||
husene hus
|
||||
husenes hus
|
||||
flagg flagg
|
||||
flagga flagg
|
||||
flaggene flagg
|
||||
flaggets flagg
|
||||
flaggenes flagg
|
||||
politi politi
|
||||
politiet politi
|
||||
politiets politi
|
||||
politienes politi
|
||||
#########################################
|
||||
# Words that should not be stemmed
|
||||
#
|
||||
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||
# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
|
||||
vaffel vaffel
|
||||
vafler vafl
|
||||
vaflene vafl
|
||||
tittel tittel
|
||||
titler titl
|
||||
titlene titl
|
||||
kam kam
|
||||
kammer kamm
|
||||
kammene kamm
|
||||
kamrene kamr
|
||||
# Irregular feminine nouns, not handled
|
||||
ku ku
|
||||
ku ku
|
||||
kyr kyr
|
||||
kuer kuer
|
||||
kyrne kyrn
|
||||
kuene kuen
|
||||
datter datt
|
||||
døtre døtr
|
||||
døtrene døtr
|
||||
# Other words that should not be touched
|
||||
abc abc
|
||||
123 123
|
||||
Jens Jens
|
||||
# Adjective, should not be stemmed
|
||||
billig billig
|
||||
billigere billiger
|
||||
billigst billigst
|
||||
billige billig
|
||||
god god
|
||||
bedre bedr
|
||||
best best
|
||||
# General endings, should not be stemmed
|
||||
god god
|
||||
godhet godh
|
||||
forelskelse forelskels
|
||||
kristendom kristendom
|
||||
# Verbs, should not be stemmed
|
||||
føle føl
|
||||
følte følt
|
||||
følt følt
|
||||
hemmelig hemmelig
|
||||
hemmelighet hemmeligh
|
||||
hemmeligheten hemmelighet
|
||||
kjærlig kjærlig
|
||||
kjærlighet kjærligh
|
||||
kjærligheten kjærlighet
|
@ -561,6 +561,8 @@ New Features
|
||||
|
||||
* SOLR-2826: URLClassify Update Processor (janhoy)
|
||||
|
||||
* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
|
||||
|
3903
solr/CHANGES.txt.orig
Normal file
3903
solr/CHANGES.txt.orig
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,39 @@
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
|
||||
|
||||
/**
|
||||
* Factory for {@link NorwegianLightStemFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_nolgtstem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.NorwegianLightStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class NorwegianLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new NorwegianLightStemFilter(input);
|
||||
}
|
||||
}
|
@ -0,0 +1,39 @@
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
|
||||
|
||||
/**
|
||||
* Factory for {@link NorwegianMinimalStemFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_nominstem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.NorwegianMinimalStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class NorwegianMinimalStemFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new NorwegianMinimalStemFilter(input);
|
||||
}
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Norwegian Light stem factory is working.
|
||||
*/
|
||||
public class TestNorwegianLightStemFilterFactory extends BaseTokenTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("epler eple");
|
||||
NorwegianLightStemFilterFactory factory = new NorwegianLightStemFilterFactory();
|
||||
TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
|
||||
assertTokenStreamContents(stream, new String[] { "epl", "epl" });
|
||||
}
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Norwegian Minimal stem factory is working.
|
||||
*/
|
||||
public class TestNorwegianMinimalStemFilterFactory extends BaseTokenTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("eple eplet epler eplene eplets eplenes");
|
||||
NorwegianMinimalStemFilterFactory factory = new NorwegianMinimalStemFilterFactory();
|
||||
TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
|
||||
assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
|
||||
}
|
||||
}
|
@ -753,6 +753,8 @@
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
|
||||
<!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
|
||||
<!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user