mirror of https://github.com/apache/lucene.git
LUCENE-9589 Swedish Minimal Stemmer (#136)
This commit is contained in:
parent
0a316b2495
commit
5fdff6eabb
|
@ -19,6 +19,8 @@ New Features
|
|||
* LUCENE-9507: Custom order for leaves in IndexReader and IndexWriter
|
||||
(Mayya Sharipova, Mike McCandless, Jim Ferenczi)
|
||||
|
||||
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
|
||||
|
||||
System Requirements
|
||||
|
||||
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.sv;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link SwedishMinimalStemmer} to stem Swedish words.
|
||||
*
|
||||
* <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
|
||||
* custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
|
||||
* TokenStream}.
|
||||
*
|
||||
* @since 9.0.0
|
||||
*/
|
||||
public final class SwedishMinimalStemFilter extends TokenFilter {
|
||||
private final SwedishMinimalStemmer stemmer = new SwedishMinimalStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public SwedishMinimalStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.sv;
|
||||
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Factory for {@link SwedishMinimalStemFilter}.
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_svminstem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.SwedishMinimalStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* @since 9.0.0
|
||||
* @lucene.spi {@value #NAME}
|
||||
*/
|
||||
public class SwedishMinimalStemFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** SPI name */
|
||||
public static final String NAME = "swedishMinimalStem";
|
||||
|
||||
/** Creates a new SwedishMinimalStemFilterFactory */
|
||||
public SwedishMinimalStemFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
/** Default ctor for compatibility with SPI */
|
||||
public SwedishMinimalStemFilterFactory() {
|
||||
throw defaultCtorException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new SwedishMinimalStemFilter(input);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.sv;
|
||||
|
||||
/*
|
||||
* The code is inspired by the original code located at: http://members.unine.ch/jacques.savoy/clef/
|
||||
*
|
||||
* Full copyright for that code follows:
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2005, Jacques Savoy
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer. Redistributions in binary
|
||||
* form must reproduce the above copyright notice, this list of conditions and
|
||||
* the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution. Neither the name of the author nor the names
|
||||
* of its contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
|
||||
|
||||
/**
|
||||
* Minimal Stemmer for Swedish. The algorithm is an adapted version of the SwedishLightStemmer, but
|
||||
* only stripping the most common plural suffixes for nouns: -ar/arne/arna/aren, -at, -er/erna, -et,
|
||||
* -or/orna, -en. We do not strip -an or -ans suffixes, since that would require a large dictionary
|
||||
* of exceptions.
|
||||
*
|
||||
* @since 9.0.0
|
||||
*/
|
||||
public class SwedishMinimalStemmer {
|
||||
|
||||
public int stem(char s[], int len) {
|
||||
if (len > 4 && s[len - 1] == 's') len--;
|
||||
|
||||
if (len > 6
|
||||
&& (endsWith(s, len, "arne")
|
||||
|| endsWith(s, len, "erna")
|
||||
|| endsWith(s, len, "arna")
|
||||
|| endsWith(s, len, "orna")
|
||||
|| endsWith(s, len, "aren"))) return len - 4;
|
||||
|
||||
if (len > 5 && (endsWith(s, len, "are"))) return len - 3;
|
||||
|
||||
if (len > 4
|
||||
&& (endsWith(s, len, "ar")
|
||||
|| endsWith(s, len, "at")
|
||||
|| endsWith(s, len, "er")
|
||||
|| endsWith(s, len, "et")
|
||||
|| endsWith(s, len, "or")
|
||||
|| endsWith(s, len, "en"))) return len - 2;
|
||||
|
||||
if (len > 3)
|
||||
switch (s[len - 1]) {
|
||||
case 'a':
|
||||
case 'e':
|
||||
case 'n':
|
||||
return len - 1;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
}
|
|
@ -113,6 +113,7 @@ org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
|
|||
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
|
||||
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
|
||||
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.sv.SwedishMinimalStemFilterFactory
|
||||
org.apache.lucene.analysis.synonym.SynonymFilterFactory
|
||||
org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory
|
||||
org.apache.lucene.analysis.core.FlattenGraphFilterFactory
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.sv;
|
||||
|
||||
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
|
||||
/** Simple tests for {@link SwedishMinimalStemFilter} */
|
||||
public class TestSwedishMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
analyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new SwedishMinimalStemFilter(source));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
analyzer.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
/** Test against vocabulary file */
|
||||
public void testVocabulary() throws IOException {
|
||||
assertVocabulary(analyzer, Files.newInputStream(getDataPath("minimal.txt")));
|
||||
}
|
||||
|
||||
public void testKeyword() throws IOException {
|
||||
final CharArraySet exclusionSet = new CharArraySet(asSet("jaktkarlens"), false);
|
||||
Analyzer a =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
|
||||
return new TokenStreamComponents(source, new SwedishMinimalStemFilter(sink));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "jaktkarlens", "jaktkarlens");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
Analyzer a =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new SwedishMinimalStemFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
#
|
||||
# Tests for Swedish minimal stemmer
|
||||
# It only tries to stem nouns, i.e. it is deliberately not very aggressive
|
||||
#
|
||||
bil bil
|
||||
bilen bil
|
||||
biler bil
|
||||
bilar bil
|
||||
bilarna bil
|
||||
bilens bil
|
||||
bilarnas bil
|
||||
pojke pojk
|
||||
pojken pojk
|
||||
pojkar pojk
|
||||
pojkarna pojk
|
||||
flaska flask
|
||||
flaskor flask
|
||||
flaskorna flask
|
||||
stol stol
|
||||
stolen stol
|
||||
stolar stol
|
||||
stolarna stol
|
||||
gubbe gubb
|
||||
gubbar gubb
|
||||
gubben gubb
|
||||
gubbarna gubb
|
||||
sak sak
|
||||
saker sak
|
||||
saken sak
|
||||
sakerna sak
|
||||
bakelse bakels
|
||||
bakelser bakels
|
||||
bakelsen bakels
|
||||
bakelserna bakels
|
||||
parti parti
|
||||
partier parti
|
||||
partiet parti
|
||||
partierna parti
|
||||
# Expected mismatch for short noun
|
||||
horn hor
|
||||
hornet horn
|
||||
hornen horn
|
||||
bagar bag
|
||||
bagare bag
|
||||
bagaren bag
|
||||
bagare bag
|
||||
bagarna bag
|
||||
# The -a vs -an, -ana endings are not explicitly handled
|
||||
hjärta hjärt
|
||||
hjärtat hjärt
|
||||
hjärtan hjärta
|
||||
hjärtana hjärtan
|
||||
#########################################
|
||||
# Words that should not be stemmed
|
||||
#
|
||||
# Irregular masculine nouns (not supposed to be handled correctly)
|
||||
abc abc
|
||||
123 123
|
||||
Jens Jens
|
||||
# Too short words should not be stemmed
|
||||
ba ba
|
||||
nnn nnn
|
||||
ttt ttt
|
||||
eee eee
|
||||
# Some common examples that SwedishLightStemmer does stem but this one doesn't
|
||||
åre åre
|
||||
årets året
|
||||
grann gran
|
||||
gran gra
|
||||
starar star
|
||||
start start
|
||||
måsar mås
|
||||
måste måst
|
||||
# Some examples that will still clash and must be handled with e.g. protwords
|
||||
villa vill
|
||||
vill vill
|
||||
timmer timm
|
||||
timme timm
|
||||
timmar timm
|
||||
tomter tomt
|
||||
tomtar tomt
|
||||
änderna änd
|
||||
ändar änd
|
Loading…
Reference in New Issue