LUCENE-10248: Spanish Plural Stemmer (#461)

Adds a new Spanish stemmer just for stemming plural to singular whilst maintaining gender: the SpanishPluralStemmer. The goal is to provide a lightweight algorithmic approach with better precision and recall than current approaches.

See blog post for more details: https://medium.com/inside-wallapop/spanish-plural-stemmer-matching-plural-and-singular-forms-in-spanish-using-lucene-93e005e38373

This approach is based on the rules specified in Wikilengua: http://www.wikilengua.org/index.php/Plural_(formaci%C3%B3n)

Some characteristics:

* Designed to stem just plural to singular form
* Distinguishes between masculine and feminine forms
* It will increase recall but precision can be reduced depending on the use case/information need
* Stems plural words of foreign origin, e.g. complots, bits, punks, robots
* Support for invariant words (same plural and singular form, or words for which a plural does not make sense), e.g. crisis, jueves, lapsus, abrebotellas, etc.
* Support for special cases, e.g. yoes, clubes, itemes, faralaes
* Use it when the distinction between singular and plural is not relevant but gender is relevant
* Produces meaningful tokens in form of singular
* No strange stems like “amig”: it’s true that stemmers are not required to generate grammatically correct tokens, but if we generate correct stems we decrease the possibility of collisions with other words
This commit is contained in:
Xavier Sanchez Loro 2021-11-30 21:51:10 +01:00 committed by GitHub
parent f48a430f35
commit edb936f090
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 517 additions and 0 deletions

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that rewrites each term with {@link SpanishPluralStemmer}, reducing
 * Spanish plural forms to their singular form.
 *
 * <p>Terms whose {@link KeywordAttribute} is set are passed through unchanged. To protect terms
 * from stemming, place a {@link SetKeywordMarkerFilter} (or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute}) before this {@link TokenStream} in the analysis chain.
 */
public final class SpanishPluralStemFilter extends TokenFilter {
  private final SpanishPluralStemmer stemmer = new SpanishPluralStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Wraps {@code input} so that its terms are stemmed as they are consumed.
   *
   * @param input the upstream token stream
   */
  public SpanishPluralStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token and, unless the token is marked as a keyword, stems
   * its term in place.
   *
   * @return {@code true} if a token was produced, {@code false} once the input is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // Keyword-marked terms are deliberately left untouched.
      return true;
    }
    // The stemmer edits the term buffer directly and reports the new term length.
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    return true;
  }
}

View File

@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
/**
* Factory for {@link SpanishPluralStemFilter}.
*
* <pre class="prettyprint">
* &lt;fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.SpanishPluralStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* @lucene.spi {@value #NAME}
*/
public class SpanishPluralStemFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "spanishPluralStem";

  /**
   * Creates a new factory from the given configuration.
   *
   * @param args configuration arguments; any entry still present after the superclass constructor
   *     has run is reported as unknown
   * @throws IllegalArgumentException if {@code args} is not empty after the superclass constructor
   *     has run
   */
  public SpanishPluralStemFilterFactory(Map<String, String> args) {
    super(args);
    final boolean hasLeftovers = !args.isEmpty();
    if (hasLeftovers) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /** Default ctor for compatibility with SPI; always throws. */
  public SpanishPluralStemFilterFactory() {
    throw defaultCtorException();
  }

  /**
   * Wraps the given stream in a {@link SpanishPluralStemFilter}.
   *
   * @param input the stream to decorate
   * @return a new {@link SpanishPluralStemFilter} reading from {@code input}
   */
  @Override
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new SpanishPluralStemFilter(input);
    return stemmed;
  }
}

View File

@ -0,0 +1,285 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.CharArraySet;
/**
 * Plural Stemmer for Spanish
 *
 * <p>This stemmer implements the rules described in:
 * <i>http://www.wikilengua.org/index.php/Plural_(formación)</i>
 *
 * <p>The stemmer works in place on a {@code char[]} term buffer: accented vowels are replaced by
 * their plain counterparts and, for some rules, a character of the stem is rewritten. All suffix
 * rules compare against lower-case letters.
 */
public class SpanishPluralStemmer {

  /** Words that are returned with their length unchanged (looked up after accent removal). */
  private static final CharArraySet invariants;

  /** Plural forms that are stemmed by dropping exactly their last two characters. */
  private static final CharArraySet specialCases;

  static {
    final List<String> invariantsList =
        Arrays.asList(
            "abrebotellas",
            "abrecartas",
            "abrelatas",
            "afueras",
            "albatros",
            "albricias",
            "aledaños",
            "alexis",
            "aries",
            "alicates",
            "analisis",
            "andurriales",
            "antitesis",
            "añicos",
            "apendicitis",
            "apocalipsis",
            "arcoiris",
            "bilis",
            "boletus",
            "boris",
            "brindis",
            "cactus",
            "canutas",
            "caries",
            "cascanueces",
            "cascarrabias",
            "ciempies",
            "cifosis",
            "cortaplumas",
            "corpus",
            "cosmos",
            "cosquillas",
            "creces",
            "crisis",
            "cuatrocientas",
            "cuatrocientos",
            "cuelgacapas",
            "cuentacuentos",
            "cuentapasos",
            "cumpleaños",
            "doscientas",
            "doscientos",
            "dosis",
            "enseres",
            "entonces",
            "esponsales",
            "estatus",
            "exequias",
            "fauces",
            "forceps",
            "fotosintesis",
            "gafas",
            "gafotas",
            "gargaras",
            "gris",
            "honorarios",
            "ictus",
            "jueves",
            "lapsus",
            "lavacoches",
            "lavaplatos",
            "limpiabotas",
            "lunes",
            "maitines",
            "martes",
            "mondadientes",
            "novecientas",
            "novecientos",
            "nupcias",
            "ochocientas",
            "ochocientos",
            "pais",
            "paris",
            "parabrisas",
            "paracaidas",
            "parachoques",
            "paraguas",
            "pararrayos",
            "pisapapeles",
            "piscis",
            "portaaviones",
            "portamaletas",
            "portamantas",
            "quinientas",
            "quinientos",
            "quitamanchas",
            "recogepelotas",
            "rictus",
            "rompeolas",
            "sacacorchos",
            "sacapuntas",
            "saltamontes",
            "salvavidas",
            "seis",
            "seiscientas",
            "seiscientos",
            "setecientas",
            "setecientos",
            "sintesis",
            "tenis",
            "tifus",
            "trabalenguas",
            "vacaciones",
            "venus",
            "versus",
            "viacrucis",
            "virus",
            "viveres",
            "volandas");
    invariants = CharArraySet.unmodifiableSet(new CharArraySet(invariantsList, true));

    // Every entry must be a plural ending in "es": a match is stemmed by cutting two characters.
    final List<String> specialCasesList =
        Arrays.asList(
            "yoes",
            "noes",
            "sies",
            "clubes",
            "faralaes",
            "albalaes",
            "itemes",
            "albumes",
            "sandwiches",
            "relojes",
            "bojes",
            "contrarrelojes",
            "carcajes");
    specialCases = CharArraySet.unmodifiableSet(new CharArraySet(specialCasesList, true));
  }

  /**
   * Stems a Spanish plural to its singular form, editing the buffer in place.
   *
   * <p>Terms shorter than four characters are returned untouched. For longer terms the accented
   * vowels in {@code s[0..len)} are always replaced by plain vowels, even when no suffix is
   * removed.
   *
   * @param s buffer holding the term; may be modified
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed term, always {@code <= len}
   */
  public int stem(char[] s, int len) {
    if (len < 4) {
      return len; // plural have at least 4 letters (ases,eses,etc.)
    }
    removeAccents(s, len);
    if (invariant(s, len)) {
      return len;
    }
    if (special(s, len)) {
      return len - 2;
    }
    if (s[len - 1] != 's') {
      return len; // only words ending in 's' are treated as plurals
    }

    final char fourthLast = s[len - 4];
    final char thirdLast = s[len - 3];
    final char secondLast = s[len - 2];

    if (!isVowel(secondLast)) {
      // consonant + 's': the singular ends with that consonant (robots, bits)
      return len - 1;
    }
    // From here on secondLast is known to be a vowel. The rules are order-sensitive.
    if ((fourthLast == 'q' || fourthLast == 'g')
        && thirdLast == 'u'
        && (secondLast == 'i' || secondLast == 'e')) {
      // "qu"/"gu" + e/i + 's' (maniquis, caquis, parques): both alternatives require the 'u'
      return len - 1;
    }
    if (isVowel(fourthLast) && thirdLast == 'r' && secondLast == 'e') {
      // vowel + "res": escaneres, alfileres, amores
      return len - 2;
    }
    if (isVowel(fourthLast)
        && (thirdLast == 'd' || thirdLast == 'l' || thirdLast == 'n' || thirdLast == 'x')
        && secondLast == 'e') {
      // vowel + d/l/n/x + "es": abades, comerciales, faxes
      return len - 2;
    }
    if ((thirdLast == 'y' || thirdLast == 'u') && secondLast == 'e') {
      // "yes"/"ues": leyes, bambues
      return len - 2;
    }
    if ((fourthLast == 'u'
            || fourthLast == 'l'
            || fourthLast == 'r'
            || fourthLast == 't'
            || fourthLast == 'n')
        && thirdLast == 'i'
        && secondLast == 'e') {
      // u/l/r/t/n + "ies": jabalies, israelies, maniquies
      return len - 2;
    }
    if (thirdLast == 's' && secondLast == 'e') {
      // "ses": reses
      return len - 2;
    }
    if (isVowel(thirdLast) && secondLast == 'i') {
      // vowel + "is": jerseis --> jersey
      s[len - 2] = 'y';
      return len - 1;
    }
    if (thirdLast == 'd' && secondLast == 'i') {
      // "dis": brandis --> brandy
      s[len - 2] = 'y';
      return len - 1;
    }
    if (secondLast == 'e' && thirdLast == 'c') {
      // "ces": voces --> voz
      s[len - 3] = 'z';
      return len - 2;
    }
    // Remaining vowel + 's' endings: just drop the 's' (jabalis, casas, coches).
    return len - 1;
  }

  /** Returns true for the five plain lower-case vowels. */
  private static boolean isVowel(char c) {
    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
  }

  /** Returns true if {@code s[0..len)} is listed as an invariant word. */
  private static boolean invariant(char[] s, int len) {
    return invariants.contains(s, 0, len);
  }

  /** Returns true if {@code s[0..len)} is listed as a special-case plural. */
  private static boolean special(char[] s, int len) {
    return specialCases.contains(s, 0, len);
  }

  /**
   * Replaces lower-case vowels carrying a grave, acute, circumflex or diaeresis mark in {@code
   * s[0..len)} with the plain vowel. All other characters, including 'ñ', are left as they are.
   */
  private static void removeAccents(char[] s, int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        case 'à':
        case 'á':
        case 'â':
        case 'ä':
          s[i] = 'a';
          break;
        case 'ò':
        case 'ó':
        case 'ô':
        case 'ö':
          s[i] = 'o';
          break;
        case 'è':
        case 'é':
        case 'ê':
        case 'ë':
          s[i] = 'e';
          break;
        case 'ù':
        case 'ú':
        case 'û':
        case 'ü':
          s[i] = 'u';
          break;
        case 'ì':
        case 'í':
        case 'î':
        case 'ï':
          s[i] = 'i';
          break;
        default:
          break;
      }
    }
  }
}

View File

@ -48,6 +48,7 @@ org.apache.lucene.analysis.en.KStemFilterFactory
org.apache.lucene.analysis.en.PorterStemFilterFactory
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory

View File

@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
 * Simple tests for {@link SpanishPluralStemFilter}: a reference vocabulary, random input and the
 * empty term.
 */
public class TestSpanishPluralStemFilter extends BaseTokenStreamTestCase {

  /** Whitespace tokenizer followed by the plural stem filter; rebuilt for every test. */
  private Analyzer analyzer;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer whitespace = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            final SpanishPluralStemFilter stemmed = new SpanishPluralStemFilter(whitespace);
            return new TokenStreamComponents(whitespace, stemmed);
          }
        };
  }

  @Override
  public void tearDown() throws Exception {
    analyzer.close();
    super.tearDown();
  }

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataPath("espluraltestdata.zip"), "esplural.txt");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    final int iterations = 200 * RANDOM_MULTIPLIER;
    checkRandomData(random(), analyzer, iterations);
  }

  /** An empty term must pass through the filter unchanged. */
  public void testEmptyTerm() throws IOException {
    Analyzer keywordAnalyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer wholeInput = new KeywordTokenizer();
            final SpanishPluralStemFilter stemmed = new SpanishPluralStemFilter(wholeInput);
            return new TokenStreamComponents(wholeInput, stemmed);
          }
        };
    checkOneTerm(keywordAnalyzer, "", "");
    keywordAnalyzer.close();
  }
}

View File

@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/** Simple tests to ensure the Spanish Plural stem factory is working. */
public class TestSpanishPluralStemFilterFactory extends BaseTokenStreamFactoryTestCase {

  /** A filter obtained through the factory lookup stems a plural term. */
  public void testStemming() throws Exception {
    final Reader text = new StringReader("sociedades");
    final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(text);
    final TokenStream stemmed = tokenFilterFactory("SpanishPluralStem").create(tokenizer);
    assertTokenStreamContents(stemmed, new String[] {"sociedad"});
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException failure =
        expectThrows(
            IllegalArgumentException.class,
            () -> tokenFilterFactory("SpanishPluralStem", "bogusArg", "bogusValue"));
    final String message = failure.getMessage();
    assertTrue(message.contains("Unknown parameters"));
  }
}