LUCENE-8936: Add SpanishMinimalStemFilter

Signed-off-by: Tomoko Uchida <tomoko@apache.org>
2019-07-28 23:36:56 +09:00 · 2019-07-28 23:36:56 +09:00 · 8c8d8abddc
parent 4050ddc59b
commit 8c8d8abddc
7 changed files with 322 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -56,6 +56,10 @@ API Changes
 * LUCENE-8909: IndexWriter#getFieldNames() method is used to get fields present in index. After LUCENE-8316, this
  method is no longer required. Hence, deprecate IndexWriter#getFieldNames() method. (Adrien Grand, Munendra S N)

+New Features
+
+* LUCENE-8936: Add SpanishMinimalStemFilter (vinod kumar via Tomoko Uchida)
+
 Improvements

 * LUCENE-8874: Show SPI names instead of class names in Luke Analysis tab. (Tomoko Uchida)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilter.java
@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.es;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that stems Spanish words by passing each term through
+ * {@link SpanishMinimalStemmer}.
+ * <p>
+ * Tokens whose {@link KeywordAttribute} is set are emitted untouched. To protect
+ * specific terms from stemming, place a {@link SetKeywordMarkerFilter} (or a
+ * custom {@link TokenFilter} that sets the {@link KeywordAttribute}) ahead of
+ * this filter in the {@link TokenStream} chain.
+ * </p>
+ */
+public final class SpanishMinimalStemFilter extends TokenFilter {
+  private final SpanishMinimalStemmer stemmer = new SpanishMinimalStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+  /**
+   * Creates a stemming filter over the given stream.
+   *
+   * @param input the token stream whose terms are stemmed
+   */
+  public SpanishMinimalStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAtt.isKeyword()) {
+      // keyword tokens are passed along exactly as received
+      return true;
+    }
+    // the stemmer works on the term buffer directly and reports the new length
+    final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
+    termAtt.setLength(stemmedLength);
+    return true;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilterFactory.java
@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.es;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link SpanishMinimalStemFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_esminstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SpanishMinimalStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * @lucene.spi {@value #NAME}
+ */
+public class SpanishMinimalStemFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "spanishMinimalStem";
+
+  /**
+   * Creates a new SpanishMinimalStemFilterFactory.
+   *
+   * @param args factory arguments; this factory defines none of its own
+   * @throws IllegalArgumentException if {@code args} is not empty once the
+   *         superclass constructor has returned
+   */
+  public SpanishMinimalStemFilterFactory(Map<String,String> args) {
+    super(args);
+    final boolean hasLeftovers = !args.isEmpty();
+    if (hasLeftovers) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Wraps {@code input} in a new {@link SpanishMinimalStemFilter}. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SpanishMinimalStemFilter(input);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemmer.java
@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.es;
+
+/**
+ * Minimal plural stemmer for Spanish.
+ * <p>
+ * This stemmer implements the "plurals" stemmer for the Spanish language:
+ * it folds accented vowels and {@code ñ} to their plain ASCII letters and
+ * removes a trailing plural {@code -s} / {@code -es}.
+ */
+public class SpanishMinimalStemmer {
+
+  /**
+   * Stems the term stored in the first {@code len} characters of {@code s}.
+   * <p>
+   * Terms shorter than four characters are returned untouched. For longer terms
+   * the accented vowels and {@code ñ} are folded in place first, so that an
+   * accented singular and its plural end up as the same term (for example
+   * {@code niño} and {@code niños} both become {@code nino}). Then, if the term
+   * ends in {@code s}:
+   * <ul>
+   *   <li>a trailing {@code es} is removed, and a {@code c} directly before it
+   *       is rewritten to {@code z} ({@code actrices} becomes {@code actriz});</li>
+   *   <li>otherwise only the trailing {@code s} is removed.</li>
+   * </ul>
+   *
+   * @param s   term buffer; modified in place
+   * @param len number of valid characters in {@code s}
+   * @return the length of the stemmed term within {@code s}
+   */
+  public int stem(char[] s, int len) {
+    if (len < 4) {
+      return len;
+    }
+
+    // Fold before looking at the ending: folding only plurals would leave an
+    // accented singular and its plural as two different terms.
+    for (int i = 0; i < len; i++) {
+      s[i] = fold(s[i]);
+    }
+
+    if (s[len - 1] != 's') {
+      return len;
+    }
+    if (s[len - 2] != 'e') {
+      // plain "-s" plural (chicas -> chica)
+      return len - 1;
+    }
+    if (s[len - 3] == 'c') {
+      // "-ces" plural of a "-z" singular (actrices -> actriz)
+      s[len - 3] = 'z';
+    }
+    return len - 2;
+  }
+
+  /** Maps an accented vowel or {@code ñ} to its plain letter; returns any other character unchanged. */
+  private static char fold(char c) {
+    switch (c) {
+      case 'à':
+      case 'á':
+      case 'â':
+      case 'ä': return 'a';
+      case 'ò':
+      case 'ó':
+      case 'ô':
+      case 'ö': return 'o';
+      case 'è':
+      case 'é':
+      case 'ê':
+      case 'ë': return 'e';
+      case 'ù':
+      case 'ú':
+      case 'û':
+      case 'ü': return 'u';
+      case 'ì':
+      case 'í':
+      case 'î':
+      case 'ï': return 'i';
+      case 'ñ': return 'n';
+      default: return c;
+    }
+  }
+}
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -45,6 +45,7 @@ org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
 org.apache.lucene.analysis.en.KStemFilterFactory
 org.apache.lucene.analysis.en.PorterStemFilterFactory
 org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
+org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
 org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
 org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
 org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilter.java
@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.es;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+
+/**
+ * Simple tests for {@link SpanishMinimalStemFilter}
+ */
+public class TestSpanishMinimalStemFilter extends BaseTokenStreamTestCase {
+  /** Whitespace tokenizer followed by {@link SpanishMinimalStemFilter}; rebuilt for every test. */
+  private Analyzer analyzer;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(source, new SpanishMinimalStemFilter(source));
+      }
+    };
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    analyzer.close();
+    super.tearDown();
+  }
+
+  /** Test some examples */
+  public void testExamples() throws IOException {
+    checkOneTerm(analyzer, "actrices", "actriz");
+    checkOneTerm(analyzer, "niños", "nino");
+    checkOneTerm(analyzer, "países", "pais");
+    checkOneTerm(analyzer, "caragodor", "caragodor");
+    checkOneTerm(analyzer, "móviles", "movil");
+    checkOneTerm(analyzer, "chicas", "chica");
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 50*RANDOM_MULTIPLIER);
+  }
+
+  /** The empty term must pass through the Spanish filter unchanged. */
+  public void testEmptyTerm() throws IOException {
+    // try-with-resources closes the analyzer even when the check fails
+    try (Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        // must be the filter under test, not the English one
+        return new TokenStreamComponents(tokenizer, new SpanishMinimalStemFilter(tokenizer));
+      }
+    }) {
+      checkOneTerm(a, "", "");
+    }
+  }
+
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilterFactory.java
@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.es;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the Spanish minimal stem factory is working.
+ */
+public class TestSpanishMinimalStemFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  /** The filter created through the SPI name strips the plural from a single term. */
+  public void testStemming() throws Exception {
+    final Reader text = new StringReader("camisetas");
+    final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    tokenizer.setReader(text);
+    final TokenStream stemmed = tokenFilterFactory("spanishMinimalStem").create(tokenizer);
+    assertTokenStreamContents(stemmed, new String[] { "camiseta" });
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException thrown = expectThrows(
+        IllegalArgumentException.class,
+        () -> tokenFilterFactory("spanishMinimalStem", "bogusArg", "bogusValue"));
+    final String message = thrown.getMessage();
+    assertTrue(message.contains("Unknown parameters"));
+  }
+}