mirror of https://github.com/apache/lucene.git
LUCENE-10248: Spanish Plural Stemmer (#461)
Adds a new Spanish stemmer just for stemming plural to singular whilst maintaining gender: the SpanishPluralStemmer. The goal is to provide a lightweight algorithmic approach with better precision and recall than current approaches. See blog post for more details: https://medium.com/inside-wallapop/spanish-plural-stemmer-matching-plural-and-singular-forms-in-spanish-using-lucene-93e005e38373 This approach is based on rules specified in Wikilengua: http://www.wikilengua.org/index.php/Plural_(formaci%C3%B3n) Some characteristics: * Designed to stem just plural to singular form * Distinguishes between masculine and feminine forms * It will increase recall but precision can be reduced depending on the use case/information need * Stems plural words of foreign origin, e.g. complots, bits, punks, robots * Support for invariant words (same plural and singular form, or a plural that does not make sense), e.g. crisis, jueves, lapsus, abrebotellas, etc. * Support for special cases, e.g. yoes, clubes, itemes, faralaes * Use it when the distinction between singular and plural is not relevant but gender is relevant * Produces meaningful tokens in the form of the singular * No strange stems like “amig”: it’s true that stemmers are not required to generate grammatically correct tokens, but if we generate correct stems we decrease the possibility of collisions with other words
This commit is contained in:
parent
f48a430f35
commit
edb936f090
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link SpanishPluralStemmer} to stem Spanish words.
|
||||
*
|
||||
* <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a
|
||||
* custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link
|
||||
* TokenStream}.
|
||||
*/
|
||||
public final class SpanishPluralStemFilter extends TokenFilter {
|
||||
private final SpanishPluralStemmer stemmer = new SpanishPluralStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public SpanishPluralStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.analysis.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Factory for {@link SpanishPluralStemFilterFactory}.
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.SpanishPluralStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* @lucene.spi {@value #NAME}
|
||||
*/
|
||||
public class SpanishPluralStemFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** SPI name */
|
||||
public static final String NAME = "spanishPluralStem";
|
||||
|
||||
/** Default ctor for compatibility with SPI */
|
||||
public SpanishPluralStemFilterFactory() {
|
||||
throw defaultCtorException();
|
||||
}
|
||||
|
||||
public SpanishPluralStemFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new SpanishPluralStemFilter(input);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,285 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
/**
|
||||
* Plural Stemmer for Spanish
|
||||
*
|
||||
* <p>This stemmer implements the rules described in:
|
||||
* <i>http://www.wikilengua.org/index.php/Plural_(formación)</i>
|
||||
*/
|
||||
public class SpanishPluralStemmer {
|
||||
|
||||
private static final CharArraySet invariants;
|
||||
private static final CharArraySet specialCases;
|
||||
|
||||
private static final List<String> invariantsList =
|
||||
Arrays.asList(
|
||||
"abrebotellas",
|
||||
"abrecartas",
|
||||
"abrelatas",
|
||||
"afueras",
|
||||
"albatros",
|
||||
"albricias",
|
||||
"aledaños",
|
||||
"alexis",
|
||||
"aries",
|
||||
"alicates",
|
||||
"analisis",
|
||||
"andurriales",
|
||||
"antitesis",
|
||||
"añicos",
|
||||
"apendicitis",
|
||||
"apocalipsis",
|
||||
"arcoiris",
|
||||
"aries",
|
||||
"bilis",
|
||||
"boletus",
|
||||
"boris",
|
||||
"brindis",
|
||||
"cactus",
|
||||
"canutas",
|
||||
"caries",
|
||||
"cascanueces",
|
||||
"cascarrabias",
|
||||
"ciempies",
|
||||
"cifosis",
|
||||
"cortaplumas",
|
||||
"corpus",
|
||||
"cosmos",
|
||||
"cosquillas",
|
||||
"creces",
|
||||
"crisis",
|
||||
"cuatrocientas",
|
||||
"cuatrocientos",
|
||||
"cuelgacapas",
|
||||
"cuentacuentos",
|
||||
"cuentapasos",
|
||||
"cumpleaños",
|
||||
"doscientas",
|
||||
"doscientos",
|
||||
"dosis",
|
||||
"enseres",
|
||||
"entonces",
|
||||
"esponsales",
|
||||
"estatus",
|
||||
"exequias",
|
||||
"fauces",
|
||||
"forceps",
|
||||
"fotosintesis",
|
||||
"gafas",
|
||||
"gafotas",
|
||||
"gargaras",
|
||||
"gris",
|
||||
"honorarios",
|
||||
"ictus",
|
||||
"jueves",
|
||||
"lapsus",
|
||||
"lavacoches",
|
||||
"lavaplatos",
|
||||
"limpiabotas",
|
||||
"lunes",
|
||||
"maitines",
|
||||
"martes",
|
||||
"mondadientes",
|
||||
"novecientas",
|
||||
"novecientos",
|
||||
"nupcias",
|
||||
"ochocientas",
|
||||
"ochocientos",
|
||||
"pais",
|
||||
"paris",
|
||||
"parabrisas",
|
||||
"paracaidas",
|
||||
"parachoques",
|
||||
"paraguas",
|
||||
"pararrayos",
|
||||
"pisapapeles",
|
||||
"piscis",
|
||||
"portaaviones",
|
||||
"portamaletas",
|
||||
"portamantas",
|
||||
"quinientas",
|
||||
"quinientos",
|
||||
"quinientos",
|
||||
"quitamanchas",
|
||||
"recogepelotas",
|
||||
"rictus",
|
||||
"rompeolas",
|
||||
"sacacorchos",
|
||||
"sacapuntas",
|
||||
"saltamontes",
|
||||
"salvavidas",
|
||||
"seis",
|
||||
"seiscientas",
|
||||
"seiscientos",
|
||||
"setecientas",
|
||||
"setecientos",
|
||||
"sintesis",
|
||||
"tenis",
|
||||
"tifus",
|
||||
"trabalenguas",
|
||||
"vacaciones",
|
||||
"venus",
|
||||
"versus",
|
||||
"viacrucis",
|
||||
"virus",
|
||||
"viveres",
|
||||
"volandas");
|
||||
|
||||
static {
|
||||
final CharArraySet invariantSet = new CharArraySet(invariantsList, true);
|
||||
invariants = CharArraySet.unmodifiableSet(invariantSet);
|
||||
|
||||
final List<String> specialCasesList =
|
||||
Arrays.asList(
|
||||
"yoes",
|
||||
"noes",
|
||||
"sies",
|
||||
"clubes",
|
||||
"faralaes",
|
||||
"albalaes",
|
||||
"itemes",
|
||||
"albumes",
|
||||
"sandwiches",
|
||||
"relojes",
|
||||
"bojes",
|
||||
"contrarreloj",
|
||||
"carcajes");
|
||||
final CharArraySet sepecialSet = new CharArraySet(specialCasesList, true);
|
||||
specialCases = CharArraySet.unmodifiableSet(sepecialSet);
|
||||
}
|
||||
|
||||
public int stem(char s[], int len) {
|
||||
if (len < 4) return len; // plural have at least 4 letters (ases,eses,etc.)
|
||||
removeAccents(s, len);
|
||||
if (invariant(s, len)) return len;
|
||||
if (special(s, len)) return len - 2;
|
||||
switch (s[len - 1]) {
|
||||
case 's':
|
||||
if (!isVowel(s[len - 2])) { // no vocals, singular words ending with consonant
|
||||
return len - 1;
|
||||
}
|
||||
if ((s[len - 4] == 'q'
|
||||
|| (s[len - 4] == 'g')
|
||||
&& s[len - 3] == 'u'
|
||||
&& (s[len - 2] == 'i' || s[len - 2] == 'e'))) { // maniquis,caquis, parques
|
||||
return len - 1;
|
||||
}
|
||||
if (isVowel(s[len - 4])
|
||||
&& (s[len - 3] == 'r')
|
||||
&& s[len - 2] == 'e') { // escaneres, alfileres, amores, cables
|
||||
return len - 2;
|
||||
}
|
||||
if (isVowel(s[len - 4])
|
||||
&& (s[len - 3] == 'd' || s[len - 3] == 'l' || s[len - 3] == 'n' || s[len - 3] == 'x')
|
||||
&& s[len - 2] == 'e') { // abades, comerciales, faxes, relojes,
|
||||
return len - 2;
|
||||
}
|
||||
if ((s[len - 3] == 'y' || s[len - 3] == 'u') && s[len - 2] == 'e') { // bambues,leyes
|
||||
return len - 2;
|
||||
}
|
||||
if ((s[len - 4] == 'u'
|
||||
|| s[len - 4] == 'l'
|
||||
|| s[len - 4] == 'r'
|
||||
|| s[len - 4] == 't'
|
||||
|| s[len - 4] == 'n')
|
||||
&& (s[len - 3] == 'i')
|
||||
&& s[len - 2] == 'e') { // jabalies,israelies, maniquies
|
||||
return len - 2;
|
||||
}
|
||||
if ((s[len - 3] == 's' && s[len - 2] == 'e')) { // reses
|
||||
return len - 2;
|
||||
}
|
||||
if (isVowel(s[len - 3]) && s[len - 2] == 'i') { // jerseis
|
||||
s[len - 2] = 'y';
|
||||
return len - 1;
|
||||
}
|
||||
if (s[len - 3] == 'd' && s[len - 2] == 'i') { // brandis
|
||||
s[len - 2] = 'y';
|
||||
return len - 1;
|
||||
}
|
||||
if (s[len - 2] == 'e' && s[len - 3] == 'c') { // voces-->voz
|
||||
s[len - 3] = 'z';
|
||||
return len - 2;
|
||||
}
|
||||
if (isVowel(s[len - 2])) // remove last 's': jabalís, casas, coches, etc.
|
||||
{
|
||||
return len - 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private boolean isVowel(char c) {
|
||||
boolean res = false;
|
||||
if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') {
|
||||
res = true;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private boolean invariant(char[] s, int len) {
|
||||
return invariants.contains(s, 0, len);
|
||||
}
|
||||
|
||||
private boolean special(char[] s, int len) {
|
||||
return specialCases.contains(s, 0, len);
|
||||
}
|
||||
|
||||
private void removeAccents(char[] s, int len) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
case 'à':
|
||||
case 'á':
|
||||
case 'â':
|
||||
case 'ä':
|
||||
s[i] = 'a';
|
||||
break;
|
||||
case 'ò':
|
||||
case 'ó':
|
||||
case 'ô':
|
||||
case 'ö':
|
||||
s[i] = 'o';
|
||||
break;
|
||||
case 'è':
|
||||
case 'é':
|
||||
case 'ê':
|
||||
case 'ë':
|
||||
s[i] = 'e';
|
||||
break;
|
||||
case 'ù':
|
||||
case 'ú':
|
||||
case 'û':
|
||||
case 'ü':
|
||||
s[i] = 'u';
|
||||
break;
|
||||
case 'ì':
|
||||
case 'í':
|
||||
case 'î':
|
||||
case 'ï':
|
||||
s[i] = 'i';
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -48,6 +48,7 @@ org.apache.lucene.analysis.en.KStemFilterFactory
|
|||
org.apache.lucene.analysis.en.PorterStemFilterFactory
|
||||
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
|
||||
org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory
|
||||
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
|
||||
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import static org.apache.lucene.analysis.VocabularyAssert.assertVocabulary;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
|
||||
/** Simple tests for {@link SpanishPluralStemFilter} */
|
||||
public class TestSpanishPluralStemFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
analyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new SpanishPluralStemFilter(source));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
analyzer.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
/** Test against a vocabulary from the reference impl */
|
||||
public void testVocabulary() throws IOException {
|
||||
assertVocabulary(analyzer, getDataPath("espluraltestdata.zip"), "esplural.txt");
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
Analyzer a =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new SpanishPluralStemFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/** Simple tests to ensure the Spanish Plural stem factory is working. */
|
||||
public class TestSpanishPluralStemFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("sociedades");
|
||||
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
((Tokenizer) stream).setReader(reader);
|
||||
stream = tokenFilterFactory("SpanishPluralStem").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"sociedad"});
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
IllegalArgumentException expected =
|
||||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> {
|
||||
tokenFilterFactory("SpanishPluralStem", "bogusArg", "bogusValue");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
Binary file not shown.
Loading…
Reference in New Issue