From 5fdff6eabb1e6452320800805355938ce8b903ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=B8ydahl?=
Date: Fri, 28 May 2021 14:20:11 +0200
Subject: [PATCH] LUCENE-9589 Swedish Minimal Stemmer (#136)

---
 lucene/CHANGES.txt                            |  2 +
 .../analysis/sv/SwedishMinimalStemFilter.java | 56 +++++++++++
 .../sv/SwedishMinimalStemFilterFactory.java   | 60 ++++++++++++
 .../analysis/sv/SwedishMinimalStemmer.java    | 95 +++++++++++++++++++
 ....apache.lucene.analysis.TokenFilterFactory |  1 +
 .../sv/TestSwedishMinimalStemFilter.java      | 87 +++++++++++++++++
 .../org/apache/lucene/analysis/sv/minimal.txt | 83 ++++++++++++++++
 7 files changed, 384 insertions(+)
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilter.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilterFactory.java
 create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemmer.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishMinimalStemFilter.java
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/minimal.txt

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f1d3cef82ab..a9545776616 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -19,6 +19,8 @@ New Features
 * LUCENE-9507: Custom order for leaves in IndexReader and IndexWriter
   (Mayya Sharipova, Mike McCandless, Jim Ferenczi)
 
+* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
+
 System Requirements
 
 * LUCENE-8738: Move to Java 11 as minimum Java version.
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilter.java
new file mode 100644
index 00000000000..0df5ec82bd1
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilter.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link SwedishMinimalStemmer} to stem Swedish words.
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
+ *
+ * @since 9.0.0
+ */
+public final class SwedishMinimalStemFilter extends TokenFilter {
+  private final SwedishMinimalStemmer stemmer = new SwedishMinimalStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public SwedishMinimalStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    // Terms flagged as keywords pass through untouched; all others are cut to the stem length.
+    if (!keywordAttr.isKeyword()) {
+      final int stemLength = stemmer.stem(termAtt.buffer(), termAtt.length());
+      termAtt.setLength(stemLength);
+    }
+    return true;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilterFactory.java
new file mode 100644
index 00000000000..49edfa4e8b6
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilterFactory.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link SwedishMinimalStemFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_svminstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SwedishMinimalStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * @since 9.0.0
+ * @lucene.spi {@value #NAME}
+ */
+public class SwedishMinimalStemFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "swedishMinimalStem";
+
+  /** Creates a new SwedishMinimalStemFilterFactory; parameters left in {@code args} are rejected */
+  public SwedishMinimalStemFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Default ctor for compatibility with SPI; always throws */
+  public SwedishMinimalStemFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SwedishMinimalStemFilter(input);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemmer.java
new file mode 100644
index 00000000000..c564a9d5a7b
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemmer.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+/*
+ * The code is inspired from original code located at: http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
+
+/**
+ * Minimal Stemmer for Swedish. The algorithm is an adapted version of the SwedishLightStemmer, but
+ * only stripping a trailing -s and the most common noun suffixes: -ar/arne/arna/aren/are, -at,
+ * -er/erna, -et, -or/orna, -en and a final -a/-e/-n. We do not strip -an or -ans suffixes, since
+ * that would require a large dictionary of exceptions.
+ *
+ * @since 9.0.0
+ */
+public class SwedishMinimalStemmer {
+
+  /**
+   * Computes the length of the stem of a term; the buffer itself is never modified.
+   *
+   * @param s buffer holding the term
+   * @param len number of valid chars in {@code s}
+   * @return length of the stem, which is at most {@code len}
+   */
+  public int stem(char[] s, int len) {
+    if (len > 4 && s[len - 1] == 's') len--;
+
+    if (len > 6
+        && (endsWith(s, len, "arne")
+            || endsWith(s, len, "erna")
+            || endsWith(s, len, "arna")
+            || endsWith(s, len, "orna")
+            || endsWith(s, len, "aren"))) return len - 4;
+
+    if (len > 5 && endsWith(s, len, "are")) return len - 3;
+
+    if (len > 4
+        && (endsWith(s, len, "ar")
+            || endsWith(s, len, "at")
+            || endsWith(s, len, "er")
+            || endsWith(s, len, "et")
+            || endsWith(s, len, "or")
+            || endsWith(s, len, "en"))) return len - 2;
+
+    if (len > 3 && (s[len - 1] == 'a' || s[len - 1] == 'e' || s[len - 1] == 'n')) return len - 1;
+    return len;
+  }
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index e08399d7cc4..2899fd516b9 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -113,6 +113,7 @@ org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
 org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
 org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
 org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
+org.apache.lucene.analysis.sv.SwedishMinimalStemFilterFactory
 org.apache.lucene.analysis.synonym.SynonymFilterFactory
 org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory
 org.apache.lucene.analysis.core.FlattenGraphFilterFactory
diff --git
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishMinimalStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishMinimalStemFilter.java
new file mode 100644
index 00000000000..ca337fee37a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishMinimalStemFilter.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+
+/** Simple tests for {@link SwedishMinimalStemFilter} */
+public class TestSwedishMinimalStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            return new TokenStreamComponents(source, new SwedishMinimalStemFilter(source));
+          }
+        };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    analyzer.close();
+    super.tearDown();
+  }
+
+  /** Test against vocabulary file */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, Files.newInputStream(getDataPath("minimal.txt")));
+  }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(asSet("jaktkarlens"), false);
+    try (Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
+            return new TokenStreamComponents(source, new SwedishMinimalStemFilter(sink));
+          }
+        }) {
+      checkOneTerm(a, "jaktkarlens", "jaktkarlens"); // marked as keyword, so not stemmed
+    }
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+  }
+
+  public void testEmptyTerm() throws IOException {
+    try (Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new KeywordTokenizer();
+            return new TokenStreamComponents(tokenizer, new SwedishMinimalStemFilter(tokenizer));
+          }
+        }) {
+      checkOneTerm(a, "", ""); // the empty term must survive the filter unchanged
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/minimal.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/minimal.txt
new file mode 100644
index 00000000000..e2e73661f3a
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/minimal.txt
@@ -0,0 +1,83 @@
+#
+# Tests for Swedish minimal stemmer
+# It only tries to stem nouns, i.e. it is not very aggressive
+#
+bil bil
+bilen bil
+biler bil
+bilar bil
+bilarna bil
+bilens bil
+bilarnas bil
+pojke pojk
+pojken pojk
+pojkar pojk
+pojkarna pojk
+flaska flask
+flaskor flask
+flaskorna flask
+stol stol
+stolen stol
+stolar stol
+stolarna stol
+gubbe gubb
+gubbar gubb
+gubben gubb
+gubbarna gubb
+sak sak
+saker sak
+saken sak
+sakerna sak
+bakelse bakels
+bakelser bakels
+bakelsen bakels
+bakelserna bakels
+parti parti
+partier parti
+partiet parti
+partierna parti
+# Expected mismatch for short noun
+horn hor
+hornet horn
+hornen horn
+bagar bag
+bagare bag
+bagaren bag
+bagare bag
+bagarna bag
+# The -a vs -an, -ana endings are not explicitly handled
+hjärta hjärt
+hjärtat hjärt
+hjärtan hjärta
+hjärtana hjärtan
+#########################################
+# Words that should not be stemmed
+#
+# Irregular masculine nouns (not supposed to be handled correctly)
+abc abc
+123 123
+Jens Jens
+# Too short words should not be stemmed
+ba ba
+nnn nnn
+ttt ttt
+eee eee
+# Some common examples that SwedishLightStemmer does stem but this one doesn't
+åre åre
+årets året
+grann gran
+gran gra
+starar star
+start start
+måsar mås
+måste måst
+# Some examples that will still clash and must be handled with e.g.
protwords
+villa vill
+vill vill
+timmer timm
+timme timm
+timmar timm
+tomter tomt
+tomtar tomt
+änderna änd
+ändar änd
\ No newline at end of file