LUCENE-9589 Swedish Minimal Stemmer (#136)

2021-05-28 14:20:11 +02:00 · 2021-05-28 14:20:11 +02:00 · 5fdff6eabb
parent 0a316b2495
commit 5fdff6eabb
7 changed files with 384 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -19,6 +19,8 @@ New Features
 * LUCENE-9507: Custom order for leaves in IndexReader and IndexWriter
  (Mayya Sharipova, Mike McCandless, Jim Ferenczi)

+* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
+
 System Requirements

 * LUCENE-8738: Move to Java 11 as minimum Java version.
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilter.java
@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link SwedishMinimalStemmer} to stem Swedish words.
+ *
+ * <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
+ * TokenStream}.
+ *
+ * <p>For every token that is not marked as a keyword, the stemmer is handed the term buffer and
+ * the current term length, and the term is then truncated to the length the stemmer returns.
+ * Keyword-marked tokens are passed through unchanged.
+ *
+ * @since 9.0.0
+ */
+public final class SwedishMinimalStemFilter extends TokenFilter {
+  private final SwedishMinimalStemmer stemmer = new SwedishMinimalStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public SwedishMinimalStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) { // keyword-marked terms are left untouched
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen); // truncate the term to the stem length
+      }
+      return true;
+    } else {
+      return false; // wrapped stream has no more tokens
+    }
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemFilterFactory.java
@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link SwedishMinimalStemFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_svminstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SwedishMinimalStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * <p>Each call to {@link #create(TokenStream)} wraps the given stream in a new {@link
+ * SwedishMinimalStemFilter}.
+ *
+ * @since 9.0.0
+ * @lucene.spi {@value #NAME}
+ */
+public class SwedishMinimalStemFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "swedishMinimalStem";
+
+  /**
+   * Creates a new SwedishMinimalStemFilterFactory.
+   *
+   * @param args factory arguments; any entry still present after the superclass constructor has
+   *     run is rejected
+   * @throws IllegalArgumentException if {@code args} is not empty after the superclass constructor
+   */
+  public SwedishMinimalStemFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /**
+   * Default ctor for compatibility with SPI. It always throws the exception obtained from {@code
+   * defaultCtorException()}.
+   */
+  public SwedishMinimalStemFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SwedishMinimalStemFilter(input);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishMinimalStemmer.java
@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+/*
+ * The code is inspired by original code located at: http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+/*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
+
+/**
+ * Minimal Stemmer for Swedish. The algorithm is an adapted version of the SwedishLightStemmer, but
+ * only strips the most common noun endings. {@link #stem(char[], int)} applies the following steps
+ * in order; after the genitive step, the first rule that matches decides the result:
+ *
+ * <ol>
+ *   <li>a trailing -s is dropped from terms longer than 4 chars, and the remaining rules see the
+ *       term without it;
+ *   <li>-arne, -erna, -arna, -orna, -aren are removed from terms longer than 6 chars;
+ *   <li>-are is removed from terms longer than 5 chars;
+ *   <li>-ar, -at, -er, -et, -or, -en are removed from terms longer than 4 chars;
+ *   <li>a single trailing -a, -e or -n is removed from terms longer than 3 chars.
+ * </ol>
+ *
+ * <p>Because of the length limits, a suffix is only removed when at least three chars remain. The
+ * -an and -ans suffixes are not stripped as a unit, since that would require a large dictionary of
+ * exceptions; only the final -n (after any -s) is removed by the last rule.
+ *
+ * <p>{@code stem} takes the term buffer and the number of valid chars in it and returns the length
+ * of the stem. The buffer itself is never modified; the caller is expected to truncate the term to
+ * the returned length.
+ *
+ * @since 9.0.0
+ */
+public class SwedishMinimalStemmer {
+
+  public int stem(char s[], int len) {
+    if (len > 4 && s[len - 1] == 's') len--; // drop a trailing -s, then keep matching
+
+    if (len > 6
+        && (endsWith(s, len, "arne")
+            || endsWith(s, len, "erna")
+            || endsWith(s, len, "arna")
+            || endsWith(s, len, "orna")
+            || endsWith(s, len, "aren"))) return len - 4; // four-letter endings
+
+    if (len > 5 && (endsWith(s, len, "are"))) return len - 3; // three-letter ending
+
+    if (len > 4
+        && (endsWith(s, len, "ar")
+            || endsWith(s, len, "at")
+            || endsWith(s, len, "er")
+            || endsWith(s, len, "et")
+            || endsWith(s, len, "or")
+            || endsWith(s, len, "en"))) return len - 2; // two-letter endings
+
+    if (len > 3) // one-letter endings; any other final char falls through to "return len"
+      switch (s[len - 1]) {
+        case 'a':
+        case 'e':
+        case 'n':
+          return len - 1;
+      }
+
+    return len; // nothing to strip (apart from a possible trailing -s removed above)
+  }
+}
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@ -113,6 +113,7 @@ org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
 org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
 org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
 org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
+org.apache.lucene.analysis.sv.SwedishMinimalStemFilterFactory
 org.apache.lucene.analysis.synonym.SynonymFilterFactory
 org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory
 org.apache.lucene.analysis.core.FlattenGraphFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishMinimalStemFilter.java
@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.sv;
+
+import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+
+/**
+ * Simple tests for {@link SwedishMinimalStemFilter}. The shared analyzer built in {@link #setUp()}
+ * feeds a whitespace {@link MockTokenizer} straight into the filter.
+ */
+public class TestSwedishMinimalStemFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzer =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            return new TokenStreamComponents(source, new SwedishMinimalStemFilter(source));
+          }
+        };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    analyzer.close(); // release the shared analyzer before the base class tears down
+    super.tearDown();
+  }
+
+  /** Test against the vocabulary file {@code minimal.txt} from the test data path */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, Files.newInputStream(getDataPath("minimal.txt")));
+  }
+
+  public void testKeyword() throws IOException {
+    final CharArraySet exclusionSet = new CharArraySet(asSet("jaktkarlens"), false);
+    Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
+            return new TokenStreamComponents(source, new SwedishMinimalStemFilter(sink));
+          }
+        };
+    checkOneTerm(a, "jaktkarlens", "jaktkarlens"); // excluded term must come through unstemmed
+    a.close();
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
+  }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new KeywordTokenizer();
+            return new TokenStreamComponents(tokenizer, new SwedishMinimalStemFilter(tokenizer));
+          }
+        };
+    checkOneTerm(a, "", ""); // an empty term must pass through the filter unchanged
+    a.close();
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/minimal.txt
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/minimal.txt
@ -0,0 +1,83 @@
+#
+# Tests for Swedish minimal stemmer
+# It only tries to stem nouns, i.e. it is not very aggressive
+#
+bil	bil
+bilen	bil
+biler	bil
+bilar	bil
+bilarna	bil
+bilens	bil
+bilarnas	bil
+pojke	pojk
+pojken	pojk
+pojkar	pojk
+pojkarna	pojk
+flaska	flask
+flaskor	flask
+flaskorna	flask
+stol	stol
+stolen	stol
+stolar	stol
+stolarna	stol
+gubbe	gubb
+gubbar	gubb
+gubben	gubb
+gubbarna	gubb
+sak	sak
+saker	sak
+saken	sak
+sakerna	sak
+bakelse	bakels
+bakelser	bakels
+bakelsen	bakels
+bakelserna	bakels
+parti	parti
+partier	parti
+partiet	parti
+partierna	parti
+# Expected mismatch for short noun
+horn	hor
+hornet	horn
+hornen	horn
+bagar	bag
+bagare	bag
+bagaren	bag
+bagare	bag
+bagarna	bag
+# The -a vs -an, -ana endings are not explicitly handled
+hjärta	hjärt
+hjärtat	hjärt
+hjärtan	hjärta
+hjärtana	hjärtan
+#########################################
+# Words that should not be stemmed
+#
+# Irregular masculine nouns (not supposed to be handled correctly)
+abc	abc
+123	123
+Jens	Jens
+# Too short words should not be stemmed
+ba	ba
+nnn	nnn
+ttt	ttt
+eee	eee
+# Some common examples that SwedishLightStemmer does stem but this one doesn't
+åre	åre
+årets	året
+grann	gran
+gran	gra
+starar	star
+start	start
+måsar	mås
+måste	måst
+# Some examples that will still clash and must be handled with e.g. protwords
+villa	vill
+vill	vill
+timmer	timm
+timme	timm
+timmar	timm
+tomter	tomt
+tomtar	tomt
+änderna	änd
+ändar	änd