SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1302833 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jan Høydahl 2012-03-20 10:57:50 +00:00
parent 7230d78fe9
commit 54d48eb98b
16 changed files with 4734 additions and 0 deletions

View File

@ -106,3 +106,6 @@ New Features
All analyzers in contrib/analyzers and contrib/icu were moved to the
analysis module. The 'smartcn' and 'stempel' components now depend on 'common'.
(Chris Male, Robert Muir)
* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.no;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link NorwegianLightStemmer} to stem Norwegian
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class NorwegianLightStemFilter extends TokenFilter {
  /** Computes the stemmed length of each term that is not flagged as a keyword. */
  private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a filter that stems the terms produced by {@code input}.
   *
   * @param input the upstream token stream
   */
  public NorwegianLightStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token and, unless the token is
   * flagged as a keyword, shortens its term to the length returned by the
   * stemmer.
   *
   * @return {@code true} when a token was produced, {@code false} when the
   *         wrapped stream has no more tokens
   * @throws IOException propagated from the wrapped stream
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAttr.isKeyword()) {
      // The stemmer only reports a new length; the characters stay in place.
      termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    }
    return true;
  }
}

View File

@ -0,0 +1,119 @@
package org.apache.lucene.analysis.no;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Light Stemmer for Norwegian.
* <p>
* Parts of this stemmer are adapted from SwedishLightStemFilter. The
* difference is that the Swedish one has a pre-defined rule set and a corresponding
* corpus to validate against, whereas the Norwegian one is hand-crafted.
*/
public class NorwegianLightStemmer {
public int stem(char s[], int len) {
// Remove posessive -s (bilens -> bilen) and continue checking
if (len > 4 && s[len-1] == 's')
len--;
// Remove common endings, single-pass
if (len > 7 &&
(endsWith(s, len, "heter") || // general ending (hemmelig-heter -> hemmelig)
endsWith(s, len, "heten"))) // general ending (hemmelig-heten -> hemmelig)
return len - 5;
if (len > 5 &&
(endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
endsWith(s, len, "het"))) // general ending (hemmelig-het -> hemmelig)
return len - 3;
if (len > 7 &&
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
return len - 5;
if (len > 6 &&
(endsWith(s, len, "ende") || // (sov-ende -> sov)
endsWith(s, len, "else") || // general ending (føl-else -> føl)
endsWith(s, len, "este") || // adj (fin-este -> fin)
endsWith(s, len, "eren"))) // masc
return len - 4;
if (len > 5 &&
(endsWith(s, len, "ere") || // adj (fin-ere -> fin)
endsWith(s, len, "est") || // adj (fin-est -> fin)
endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
))
return len - 3;
if (len > 4 &&
(endsWith(s, len, "er") || // masc/fem indefinite
endsWith(s, len, "en") || // masc/fem definite
endsWith(s, len, "et") || // neutr definite
endsWith(s, len, "st") || // adj (billig-st -> billig)
endsWith(s, len, "te")))
return len - 2;
if (len > 3)
switch(s[len-1]) {
case 'a': // fem definite
case 'e': // to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
case 'n':
return len - 1;
}
return len;
}
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.no;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link NorwegianMinimalStemmer} to stem Norwegian
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class NorwegianMinimalStemFilter extends TokenFilter {
  /** Computes the stemmed length of each term that is not flagged as a keyword. */
  private final NorwegianMinimalStemmer stemmer = new NorwegianMinimalStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a filter that stems the terms produced by {@code input}.
   *
   * @param input the upstream token stream
   */
  public NorwegianMinimalStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token and, unless the token is
   * flagged as a keyword, shortens its term to the length returned by the
   * stemmer.
   *
   * @return {@code true} when a token was produced, {@code false} when the
   *         wrapped stream has no more tokens
   * @throws IOException propagated from the wrapped stream
   */
  @Override
  public boolean incrementToken() throws IOException {
    final boolean hasToken = input.incrementToken();
    if (hasToken && !keywordAttr.isKeyword()) {
      // The stemmer only reports a new length; the characters stay in place.
      final int stemLength = stemmer.stem(termAtt.buffer(), termAtt.length());
      termAtt.setLength(stemLength);
    }
    return hasToken;
  }
}

View File

@ -0,0 +1,90 @@
package org.apache.lucene.analysis.no;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Minimal Stemmer for Norwegian bokmål (no-nb)
* <p>
* Stems known plural forms for Norwegian nouns only, together with genitive -s
*/
public class NorwegianMinimalStemmer {

  /**
   * Computes the stem length of the word held in the first {@code len}
   * characters of {@code s}. The buffer itself is not modified.
   *
   * @param s   buffer holding the word
   * @param len number of valid characters in {@code s}
   * @return the length of the stem, never greater than {@code len}
   */
  public int stem(char[] s, int len) {
    int n = len;

    // Drop a genitive -s and keep checking.
    if (n > 4 && s[n - 1] == 's') {
      n--;
    }

    // masc/fem/neutr pl definite (hus-ene)
    if (n > 5 && endsWith(s, n, "ene")) {
      return n - 3;
    }

    if (n > 4) {
      final boolean nounEnding =
          endsWith(s, n, "er")     // masc/fem indefinite
          || endsWith(s, n, "en")  // masc/fem definite
          || endsWith(s, n, "et"); // neutr definite
      if (nounEnding) {
        return n - 2;
      }
    }

    if (n > 3) {
      final char last = s[n - 1];
      // 'a': fem definite
      // 'e': to get correct stem for nouns ending in -e (kake -> kak, kaker -> kak)
      if (last == 'a' || last == 'e') {
        return n - 1;
      }
    }
    return n;
  }
}

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.no;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
/**
* Simple tests for {@link NorwegianLightStemFilter}
*/
public class TestNorwegianLightStemFilter extends BaseTokenStreamTestCase {
  /** Whitespace tokenizer (no lowercasing) followed by the filter under test. */
  private final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new NorwegianLightStemFilter(source));
    }
  };

  /**
   * Test against a vocabulary file.
   *
   * @throws IOException if the vocabulary file cannot be opened, read or closed
   */
  public void testVocabulary() throws IOException {
    FileInputStream vocabulary = new FileInputStream(getDataFile("nb_light.txt"));
    try {
      assertVocabulary(analyzer, vocabulary);
    } finally {
      // Release the file handle even when an assertion fails part-way through.
      vocabulary.close();
    }
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
  }
}

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.no;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import static org.apache.lucene.analysis.VocabularyAssert.*;
/**
* Simple tests for {@link NorwegianMinimalStemFilter}
*/
public class TestNorwegianMinimalStemFilter extends BaseTokenStreamTestCase {
  /** Whitespace tokenizer (no lowercasing) followed by the filter under test. */
  private final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(source));
    }
  };

  /**
   * Test against a vocabulary file.
   *
   * @throws IOException if the vocabulary file cannot be opened, read or closed
   */
  public void testVocabulary() throws IOException {
    FileInputStream vocabulary = new FileInputStream(getDataFile("nb_minimal.txt"));
    try {
      assertVocabulary(analyzer, vocabulary);
    } finally {
      // Release the file handle even when an assertion fails part-way through.
      vocabulary.close();
    }
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
  }
}

View File

@ -0,0 +1,144 @@
#
# Tests for Norwegian Bokmål light stemmer
# It should tackle nouns, adjectives, genitive and some general endings
#
# Nouns masculine
bil bil
bilen bil
biler bil
bilene bil
bilens bil
bilenes bil
sekretæren sekretær
sekretær sekretær
sekretærene sekretær
kaker kak
kaken kak
kakene kak
kakenes kak
bibliotekar bibliotekar
bibliotekarer bibliotekar
bibliotekaren bibliotekar
bibliotekarens bibliotekar
bibliotekarene bibliotekar
bibliotekarenes bibliotekar
# Nouns feminine
veske vesk
veska vesk
vesken vesk
veskene vesk
veskas vesk
# Nouns neutral
huset hus
husene hus
husets hus
hus hus
huset hus
husene hus
husenes hus
flagg flagg
flagga flagg
flaggene flagg
flaggets flagg
flaggenes flagg
politi politi
politiet politi
politiets politi
politienes politi
# General endings
god god
godhet god
godheten god
forelskelse forelsk
forelsket forelsk
forelskelsen forelsk
forelske forelsk
kristen krist
kristendom kristen
kristendommen kristendomm
kristendommens kristendomm
fattig fattig
fattigdom fattig
fattigdommen fattigdomm
fattigdommens fattigdomm
# -het (see http://no.wiktionary.org/wiki/Kategori:Ord_som_ender_p%C3%A5_%C2%AB-het%C2%BB)
hemmelig hemmelig
hemmelighet hemmelig
hemmelighets hemmelig
hemmeligheter hemmelig
hemmeligheten hemmelig
hemmelighetens hemmelig
kjærlig kjærlig
kjærlighet kjærlig
kjærligheter kjærlig
kjærligheten kjærlig
forlegen forleg
forlegenhet forlegen
forlegenheten forlegen
forlegenhetens forlegen
tvetydig tvetydig
tvetydighet tvetydig
tvetydigheter tvetydig
tvetydigheten tvetydig
tvetydighetens tvetydig
virkelig virkelig
virkelighet virkelig
virkeligheten virkelig
virkelighetens virkelig
# Adjectives
billig billig
billigere billig
billigst billig
billige billig
frisk frisk
friskere frisk
friskest frisk
syk syk
sykere syk
sykest syk
#########################################
# Words that should not be stemmed
#
# Irregular masculine nouns (not supposed to be handled correctly)
# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
vaffel vaffel
vafler vafl
vaflene vafl
tittel tittel
titler titl
titlene titl
kam kam
kammer kamm
kammene kamm
kamrene kamr
# Irregular feminine nouns, not handled
ku ku
ku ku
kyr kyr
kuer kuer
kyrne kyrn
kuene kuen
datter datt
døtre døtr
døtrene døtr
# Other words that should not be touched
abc abc
123 123
Jens Jens
# Adjectives
billig billig
billigere billig
billigst billig
billige billig
frisk frisk
friskere frisk
friskest frisk
# Irregular adjectives that should not be stemmed
god god
bedre bedr
best best
# Verbs, should not be stemmed
føle føl
følte føl
følt følt

View File

@ -0,0 +1,99 @@
#
# Tests for Norwegian Bokmål minimal stemmer
# It only tries to stem nouns, i.e. it is deliberately not very aggressive
#
# Nouns masculine
bil bil
bilen bil
biler bil
bilene bil
bilens bil
bilenes bil
sekretæren sekretær
sekretær sekretær
sekretærene sekretær
kaker kak
kaken kak
kakene kak
kakenes kak
bibliotekar bibliotekar
bibliotekarer bibliotekar
bibliotekaren bibliotekar
bibliotekarens bibliotekar
bibliotekarene bibliotekar
bibliotekarenes bibliotekar
# Nouns feminine
veske vesk
veska vesk
vesken vesk
veskene vesk
veskas vesk
# Nouns neutral
huset hus
husene hus
husets hus
hus hus
huset hus
husene hus
husenes hus
flagg flagg
flagga flagg
flaggene flagg
flaggets flagg
flaggenes flagg
politi politi
politiet politi
politiets politi
politienes politi
#########################################
# Words that should not be stemmed
#
# Irregular masculine nouns (not supposed to be handled correctly)
# Fetched from http://no.wiktionary.org/wiki/Kategori:Substantiv_i_norsk_med_uregelrett_flertallsb%C3%B8yning
vaffel vaffel
vafler vafl
vaflene vafl
tittel tittel
titler titl
titlene titl
kam kam
kammer kamm
kammene kamm
kamrene kamr
# Irregular feminine nouns, not handled
ku ku
ku ku
kyr kyr
kuer kuer
kyrne kyrn
kuene kuen
datter datt
døtre døtr
døtrene døtr
# Other words that should not be touched
abc abc
123 123
Jens Jens
# Adjective, should not be stemmed
billig billig
billigere billiger
billigst billigst
billige billig
god god
bedre bedr
best best
# General endings, should not be stemmed
god god
godhet godh
forelskelse forelskels
kristendom kristendom
# Verbs, should not be stemmed
føle føl
følte følt
følt følt
hemmelig hemmelig
hemmelighet hemmeligh
hemmeligheten hemmelighet
kjærlig kjærlig
kjærlighet kjærligh
kjærligheten kjærlighet

View File

@ -561,6 +561,8 @@ New Features
* SOLR-2826: URLClassify Update Processor (janhoy)
* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter

3903
solr/CHANGES.txt.orig Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,39 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
/**
* Factory for {@link NorwegianLightStemFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_nolgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.NorwegianLightStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @version $Id$
*/
public class NorwegianLightStemFilterFactory extends BaseTokenFilterFactory {
  /**
   * Wraps the given stream in a new {@link NorwegianLightStemFilter}.
   *
   * @param input the stream to be stemmed
   * @return the stemming filter reading from {@code input}
   */
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new NorwegianLightStemFilter(input);
    return stemmed;
  }
}

View File

@ -0,0 +1,39 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
/**
* Factory for {@link NorwegianMinimalStemFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_nominstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.NorwegianMinimalStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @version $Id$
*/
public class NorwegianMinimalStemFilterFactory extends BaseTokenFilterFactory {
  /**
   * Wraps the given stream in a new {@link NorwegianMinimalStemFilter}.
   *
   * @param input the stream to be stemmed
   * @return the stemming filter reading from {@code input}
   */
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new NorwegianMinimalStemFilter(input);
    return stemmed;
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the Norwegian Light stem factory is working.
*/
public class TestNorwegianLightStemFilterFactory extends BaseTokenTestCase {
  /**
   * Feeds two whitespace-separated forms of the same noun through a filter
   * created by the factory and checks that both come out as the same stem.
   */
  public void testStemming() throws Exception {
    NorwegianLightStemFilterFactory factory = new NorwegianLightStemFilterFactory();
    Reader reader = new StringReader("epler eple");
    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenStream stream = factory.create(tokenizer);
    String[] expected = { "epl", "epl" };
    assertTokenStreamContents(stream, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the Norwegian Minimal stem factory is working.
*/
public class TestNorwegianMinimalStemFilterFactory extends BaseTokenTestCase {
  /**
   * Feeds six whitespace-separated forms of the same noun through a filter
   * created by the factory and checks that all come out as the same stem.
   */
  public void testStemming() throws Exception {
    NorwegianMinimalStemFilterFactory factory = new NorwegianMinimalStemFilterFactory();
    Reader reader = new StringReader("eple eplet epler eplene eplets eplenes");
    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenStream stream = factory.create(tokenizer);
    String[] expected = { "epl", "epl", "epl", "epl", "epl", "epl" };
    assertTokenStreamContents(stream, expected);
  }
}

View File

@ -753,6 +753,8 @@
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
<!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
<!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
</analyzer>
</fieldType>