LUCENE-8936: Add SpanishMinimalStemFilter

Signed-off-by: Tomoko Uchida <tomoko@apache.org>
2019-07-28 23:36:56 +09:00 · 2019-07-28 23:36:56 +09:00 · 8c8d8abddc
parent 4050ddc59b
commit 8c8d8abddc
7 changed files with 322 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -56,6 +56,10 @@ API Changes
 * LUCENE-8909: IndexWriter#getFieldNames() method is used to get fields present in index. After LUCENE-8316, this
  method is no longer required. Hence, deprecate IndexWriter#getFieldNames() method. (Adrien Grand, Munendra S N)
 New Features
 * LUCENE-8936: Add SpanishMinimalStemFilter (vinod kumar via Tomoko Uchida)
 Improvements
 * LUCENE-8874: Show SPI names instead of class names in Luke Analysis tab. (Tomoko Uchida)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilter.java
@ -0,0 +1,58 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.es;
 import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 /**
 * A {@link TokenFilter} that applies {@link SpanishMinimalStemmer} to stem Spanish
 * words.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 */
 public final class SpanishMinimalStemFilter extends TokenFilter {
  private final SpanishMinimalStemmer stemmer = new SpanishMinimalStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
  public SpanishMinimalStemFilter(TokenStream input) {
    super(input);
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (!keywordAttr.isKeyword()) {
        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
        termAtt.setLength(newlen);
      }
      return true;
    } else {
      return false;
    }
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemFilterFactory.java
@ -0,0 +1,52 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.es;
 import java.util.Map;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 /**
 * Factory for {@link SpanishMinimalStemFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.SpanishMinimalStemFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 * @lucene.spi {@value #NAME}
 */
 public class SpanishMinimalStemFilterFactory extends TokenFilterFactory {
  /** SPI name */
  public static final String NAME = "spanishMinimalStem";
  /** Creates a new SpanishMinimalStemFilterFactory */
  public SpanishMinimalStemFilterFactory(Map<String,String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }
  @Override
  public TokenStream create(TokenStream input) { return new SpanishMinimalStemFilter(input); }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishMinimalStemmer.java
@ -0,0 +1,80 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.es;
 /**
 * Minimal plural stemmer for Spanish.
 * <p>
 * This stemmer implements the "plurals" stemmer for
 * spanish lanugage.
 *
 */
 public class SpanishMinimalStemmer {
  public int stem(char s[], int len) {
    if (len < 4 || s[len-1] != 's')
      return len;
    for (int i = 0; i < len; i++)
      switch(s[i]) {
        case 'à':
        case 'á':
        case 'â':
        case 'ä': s[i] = 'a'; break;
        case 'ò':
        case 'ó':
        case 'ô':
        case 'ö': s[i] = 'o'; break;
        case 'è':
        case 'é':
        case 'ê':
        case 'ë': s[i] = 'e'; break;
        case 'ù':
        case 'ú':
        case 'û':
        case 'ü': s[i] = 'u'; break;
        case 'ì':
        case 'í':
        case 'î':
        case 'ï': s[i] = 'i'; break;
        case 'ñ': s[i] = 'n'; break;
      }
    switch(s[len-1]) {
      case 's':
        if (s[len-2] == 'a' || s[len-2] == 'o') {
          return len-1;
        }
        if (s[len-2] == 'e') {
          if (s[len-3] == 's' && s[len-4] == 'e') {
            return len-2;
          }
          if (s[len-3] == 'c') {
            s[len-3] = 'z';
            return len-2;
          } else {
            return len-2;
          }
        } else {
          return len-1;
        }
    }
    return len;
  }
 }
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -45,6 +45,7 @@ org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
 org.apache.lucene.analysis.en.KStemFilterFactory
 org.apache.lucene.analysis.en.PorterStemFilterFactory
 org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
 org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
 org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
 org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
 org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilter.java
@ -0,0 +1,80 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.es;
 import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
 /**
 * Simple tests for {@link SpanishMinimalStemFilter}
 */
 public class TestSpanishMinimalStemFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer;
  @Override
  public void setUp() throws Exception {
    super.setUp();
    analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(source, new SpanishMinimalStemFilter(source));
      }
    };
  }
  @Override
  public void tearDown() throws Exception {
    analyzer.close();
    super.tearDown();
  }
  /** Test some examples */
  public void testExamples() throws IOException {
    checkOneTerm(analyzer, "actrices", "actriz");
    checkOneTerm(analyzer, "niños", "nino");
    checkOneTerm(analyzer, "países", "pais");
    checkOneTerm(analyzer, "caragodor", "caragodor");
    checkOneTerm(analyzer, "móviles", "movil");
    checkOneTerm(analyzer, "chicas", "chica");
  }
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 50*RANDOM_MULTIPLIER);
  }
  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new EnglishMinimalStemFilter(tokenizer));
      }
    };
    checkOneTerm(a, "", "");
    a.close();
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishMinimalStemFilterFactory.java
@ -0,0 +1,47 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.es;
 import java.io.Reader;
 import java.io.StringReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
 /**
 * Simple tests to ensure the spanish minimal stem factory is working.
 */
 public class TestSpanishMinimalStemFilterFactory extends BaseTokenStreamFactoryTestCase {
  public void testStemming() throws Exception {
    Reader reader = new StringReader("camisetas");
    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    ((Tokenizer)stream).setReader(reader);
    stream = tokenFilterFactory("spanishMinimalStem").create(stream);
    assertTokenStreamContents(stream, new String[] { "camiseta" });
  }
  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      tokenFilterFactory("spanishMinimalStem", "bogusArg", "bogusValue");
    });
    assertTrue(expected.getMessage().contains("Unknown parameters"));
  }
 }