mirror of
https://github.com/apache/lucene.git
synced 2025-02-09 03:25:15 +00:00
LUCENE-9589 Swedish Minimal Stemmer (#136)
This commit is contained in:
parent
0a316b2495
commit
5fdff6eabb
@ -19,6 +19,8 @@ New Features
|
|||||||
* LUCENE-9507: Custom order for leaves in IndexReader and IndexWriter
|
* LUCENE-9507: Custom order for leaves in IndexReader and IndexWriter
|
||||||
(Mayya Sharipova, Mike McCandless, Jim Ferenczi)
|
(Mayya Sharipova, Mike McCandless, Jim Ferenczi)
|
||||||
|
|
||||||
|
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
|
||||||
|
|
||||||
System Requirements
|
System Requirements
|
||||||
|
|
||||||
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
||||||
|
@ -0,0 +1,56 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link SwedishMinimalStemmer} to stem Swedish words.
|
||||||
|
*
|
||||||
|
* <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
|
||||||
|
* custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
|
||||||
|
* TokenStream}.
|
||||||
|
*
|
||||||
|
* @since 9.0.0
|
||||||
|
*/
|
||||||
|
public final class SwedishMinimalStemFilter extends TokenFilter {
|
||||||
|
private final SwedishMinimalStemmer stemmer = new SwedishMinimalStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public SwedishMinimalStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link SwedishMinimalStemFilter}.
|
||||||
|
*
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
* <fieldType name="text_svminstem" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
* <filter class="solr.SwedishMinimalStemFilterFactory"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
*
|
||||||
|
* @since 9.0.0
|
||||||
|
* @lucene.spi {@value #NAME}
|
||||||
|
*/
|
||||||
|
public class SwedishMinimalStemFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
/** SPI name */
|
||||||
|
public static final String NAME = "swedishMinimalStem";
|
||||||
|
|
||||||
|
/** Creates a new SwedishMinimalStemFilterFactory */
|
||||||
|
public SwedishMinimalStemFilterFactory(Map<String, String> args) {
|
||||||
|
super(args);
|
||||||
|
if (!args.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Default ctor for compatibility with SPI */
|
||||||
|
public SwedishMinimalStemFilterFactory() {
|
||||||
|
throw defaultCtorException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new SwedishMinimalStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,95 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The code is inspired by the original code located at: http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimal Stemmer for Swedish. The algorithm is an adapted version of the SwedishLightStemmer, but
|
||||||
|
* only stripping the most common plural suffixes for nouns: -ar/arne/arna/aren, -at, -er/erna, -et,
|
||||||
|
* -or/orna, -en. We do not strip -an or -ans suffixes, since that would require a large dictionary
|
||||||
|
* of exceptions.
|
||||||
|
*
|
||||||
|
* @since 9.0.0
|
||||||
|
*/
|
||||||
|
public class SwedishMinimalStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len > 4 && s[len - 1] == 's') len--;
|
||||||
|
|
||||||
|
if (len > 6
|
||||||
|
&& (endsWith(s, len, "arne")
|
||||||
|
|| endsWith(s, len, "erna")
|
||||||
|
|| endsWith(s, len, "arna")
|
||||||
|
|| endsWith(s, len, "orna")
|
||||||
|
|| endsWith(s, len, "aren"))) return len - 4;
|
||||||
|
|
||||||
|
if (len > 5 && (endsWith(s, len, "are"))) return len - 3;
|
||||||
|
|
||||||
|
if (len > 4
|
||||||
|
&& (endsWith(s, len, "ar")
|
||||||
|
|| endsWith(s, len, "at")
|
||||||
|
|| endsWith(s, len, "er")
|
||||||
|
|| endsWith(s, len, "et")
|
||||||
|
|| endsWith(s, len, "or")
|
||||||
|
|| endsWith(s, len, "en"))) return len - 2;
|
||||||
|
|
||||||
|
if (len > 3)
|
||||||
|
switch (s[len - 1]) {
|
||||||
|
case 'a':
|
||||||
|
case 'e':
|
||||||
|
case 'n':
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
@ -113,6 +113,7 @@ org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
|
|||||||
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
|
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
|
||||||
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
|
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
|
||||||
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
|
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
|
||||||
|
org.apache.lucene.analysis.sv.SwedishMinimalStemFilterFactory
|
||||||
org.apache.lucene.analysis.synonym.SynonymFilterFactory
|
org.apache.lucene.analysis.synonym.SynonymFilterFactory
|
||||||
org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory
|
org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory
|
||||||
org.apache.lucene.analysis.core.FlattenGraphFilterFactory
|
org.apache.lucene.analysis.core.FlattenGraphFilterFactory
|
||||||
|
@ -0,0 +1,87 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import org.apache.lucene.analysis.*;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
|
||||||
|
/** Simple tests for {@link SwedishMinimalStemFilter} */
|
||||||
|
public class TestSwedishMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
analyzer =
|
||||||
|
new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(source, new SwedishMinimalStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
analyzer.close();
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test against vocabulary file */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, Files.newInputStream(getDataPath("minimal.txt")));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKeyword() throws IOException {
|
||||||
|
final CharArraySet exclusionSet = new CharArraySet(asSet("jaktkarlens"), false);
|
||||||
|
Analyzer a =
|
||||||
|
new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
|
||||||
|
return new TokenStreamComponents(source, new SwedishMinimalStemFilter(sink));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
checkOneTerm(a, "jaktkarlens", "jaktkarlens");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomStrings() throws Exception {
|
||||||
|
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmptyTerm() throws IOException {
|
||||||
|
Analyzer a =
|
||||||
|
new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
|
return new TokenStreamComponents(tokenizer, new SwedishMinimalStemFilter(tokenizer));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
checkOneTerm(a, "", "");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,83 @@
|
|||||||
|
#
|
||||||
|
# Tests for Swedish minimal stemmer
|
||||||
|
# It only tries to stem nouns, i.e. it is not very aggressive
|
||||||
|
#
|
||||||
|
bil bil
|
||||||
|
bilen bil
|
||||||
|
biler bil
|
||||||
|
bilar bil
|
||||||
|
bilarna bil
|
||||||
|
bilens bil
|
||||||
|
bilarnas bil
|
||||||
|
pojke pojk
|
||||||
|
pojken pojk
|
||||||
|
pojkar pojk
|
||||||
|
pojkarna pojk
|
||||||
|
flaska flask
|
||||||
|
flaskor flask
|
||||||
|
flaskorna flask
|
||||||
|
stol stol
|
||||||
|
stolen stol
|
||||||
|
stolar stol
|
||||||
|
stolarna stol
|
||||||
|
gubbe gubb
|
||||||
|
gubbar gubb
|
||||||
|
gubben gubb
|
||||||
|
gubbarna gubb
|
||||||
|
sak sak
|
||||||
|
saker sak
|
||||||
|
saken sak
|
||||||
|
sakerna sak
|
||||||
|
bakelse bakels
|
||||||
|
bakelser bakels
|
||||||
|
bakelsen bakels
|
||||||
|
bakelserna bakels
|
||||||
|
parti parti
|
||||||
|
partier parti
|
||||||
|
partiet parti
|
||||||
|
partierna parti
|
||||||
|
# Expected mismatch for short noun
|
||||||
|
horn hor
|
||||||
|
hornet horn
|
||||||
|
hornen horn
|
||||||
|
bagar bag
|
||||||
|
bagare bag
|
||||||
|
bagaren bag
|
||||||
|
bagare bag
|
||||||
|
bagarna bag
|
||||||
|
# The -a vs -an, -ana endings are not explicitly handled
|
||||||
|
hjärta hjärt
|
||||||
|
hjärtat hjärt
|
||||||
|
hjärtan hjärta
|
||||||
|
hjärtana hjärtan
|
||||||
|
#########################################
|
||||||
|
# Words that should not be stemmed
|
||||||
|
#
|
||||||
|
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||||
|
abc abc
|
||||||
|
123 123
|
||||||
|
Jens Jens
|
||||||
|
# Too short words should not be stemmed
|
||||||
|
ba ba
|
||||||
|
nnn nnn
|
||||||
|
ttt ttt
|
||||||
|
eee eee
|
||||||
|
# Some common examples that SwedishLightStemmer stems but this one does not
|
||||||
|
åre åre
|
||||||
|
årets året
|
||||||
|
grann gran
|
||||||
|
gran gra
|
||||||
|
starar star
|
||||||
|
start start
|
||||||
|
måsar mås
|
||||||
|
måste måst
|
||||||
|
# Some examples that will still clash and must be handled with e.g. protwords
|
||||||
|
villa vill
|
||||||
|
vill vill
|
||||||
|
timmer timm
|
||||||
|
timme timm
|
||||||
|
timmar timm
|
||||||
|
tomter tomt
|
||||||
|
tomtar tomt
|
||||||
|
änderna änd
|
||||||
|
ändar änd
|
Loading…
x
Reference in New Issue
Block a user