mirror of https://github.com/apache/lucene.git
LUCENE-8936: Add SpanishMinimalStemFilter
Signed-off-by: Tomoko Uchida <tomoko@apache.org>
This commit is contained in:
parent
4050ddc59b
commit
8c8d8abddc
|
@ -56,6 +56,10 @@ API Changes
|
||||||
* LUCENE-8909: IndexWriter#getFieldNames() method is used to get fields present in index. After LUCENE-8316, this
|
* LUCENE-8909: IndexWriter#getFieldNames() method is used to get fields present in index. After LUCENE-8316, this
|
||||||
method is no longer required. Hence, deprecate IndexWriter#getFieldNames() method. (Adrien Grand, Munendra S N)
|
method is no longer required. Hence, deprecate IndexWriter#getFieldNames() method. (Adrien Grand, Munendra S N)
|
||||||
|
|
||||||
|
New Features
|
||||||
|
|
||||||
|
* LUCENE-8936: Add SpanishMinimalStemFilter (vinod kumar via Tomoko Uchida)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-8874: Show SPI names instead of class names in Luke Analysis tab. (Tomoko Uchida)
|
* LUCENE-8874: Show SPI names instead of class names in Luke Analysis tab. (Tomoko Uchida)
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link SpanishMinimalStemmer} to stem Spanish
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class SpanishMinimalStemFilter extends TokenFilter {
|
||||||
|
private final SpanishMinimalStemmer stemmer = new SpanishMinimalStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public SpanishMinimalStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link SpanishMinimalStemFilter}.
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
* <fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
* <filter class="solr.SpanishMinimalStemFilterFactory"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
* @lucene.spi {@value #NAME}
|
||||||
|
*/
|
||||||
|
public class SpanishMinimalStemFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
/** SPI name */
|
||||||
|
public static final String NAME = "spanishMinimalStem";
|
||||||
|
|
||||||
|
/** Creates a new SpanishMinimalStemFilterFactory */
|
||||||
|
public SpanishMinimalStemFilterFactory(Map<String,String> args) {
|
||||||
|
super(args);
|
||||||
|
if (!args.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) { return new SpanishMinimalStemFilter(input); }
|
||||||
|
}
|
|
@ -0,0 +1,80 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
/**
 * Minimal plural stemmer for Spanish.
 * <p>
 * This stemmer implements the "plurals" stemmer for the Spanish language.
 * Terms of at least four characters that end in {@code s} have their accented
 * vowels and n-with-tilde folded to the plain letter, and then lose their plural
 * suffix ({@code -s} or {@code -es}). Every other term is left untouched.
 */
public class SpanishMinimalStemmer {

  /**
   * Stems, in place, the term stored in the first {@code len} characters of
   * {@code s}.
   *
   * @param s   buffer holding the term; its characters may be rewritten (accent
   *            folding, and {@code c} to {@code z} for a {@code -ces} ending)
   * @param len number of characters of {@code s} that belong to the term
   * @return the length of the stemmed term: {@code len} when nothing was
   *         stripped, otherwise {@code len - 1} or {@code len - 2}
   */
  public int stem(char[] s, int len) {
    // Short terms and terms not ending in 's' are returned untouched: no
    // folding and no suffix removal. The length check also keeps the
    // s[len - 3] access below in range.
    if (len < 4 || s[len - 1] != 's') {
      return len;
    }

    for (int i = 0; i < len; i++) {
      s[i] = foldAccent(s[i]);
    }

    // From here on the term is known to end in 's'. The suffix tests run on
    // the folded characters, so an accented 'e' before the 's' counts as 'e'.
    if (s[len - 2] != 'e') {
      // "-as", "-os" and any other "-s" ending: drop the final 's'.
      return len - 1;
    }

    // "-es" ending. A 'c' in front of it becomes 'z' (actrices -> actriz);
    // any other letter is kept as is (paises -> pais, meses -> mes).
    if (s[len - 3] == 'c') {
      s[len - 3] = 'z';
    }
    return len - 2;
  }

  /**
   * Maps an accented lower-case vowel or n-with-tilde to its plain ASCII letter
   * and returns every other character unchanged.
   * <p>
   * The case labels are written as Unicode escapes so that this file compiles
   * the same way whatever source encoding the compiler assumes.
   */
  private static char foldAccent(char c) {
    switch (c) {
      case '\u00e0': // a with grave
      case '\u00e1': // a with acute
      case '\u00e2': // a with circumflex
      case '\u00e4': // a with diaeresis
        return 'a';
      case '\u00f2': // o with grave
      case '\u00f3': // o with acute
      case '\u00f4': // o with circumflex
      case '\u00f6': // o with diaeresis
        return 'o';
      case '\u00e8': // e with grave
      case '\u00e9': // e with acute
      case '\u00ea': // e with circumflex
      case '\u00eb': // e with diaeresis
        return 'e';
      case '\u00f9': // u with grave
      case '\u00fa': // u with acute
      case '\u00fb': // u with circumflex
      case '\u00fc': // u with diaeresis
        return 'u';
      case '\u00ec': // i with grave
      case '\u00ed': // i with acute
      case '\u00ee': // i with circumflex
      case '\u00ef': // i with diaeresis
        return 'i';
      case '\u00f1': // n with tilde
        return 'n';
      default:
        return c;
    }
  }
}
|
|
@ -45,6 +45,7 @@ org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
|
||||||
org.apache.lucene.analysis.en.KStemFilterFactory
|
org.apache.lucene.analysis.en.KStemFilterFactory
|
||||||
org.apache.lucene.analysis.en.PorterStemFilterFactory
|
org.apache.lucene.analysis.en.PorterStemFilterFactory
|
||||||
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
|
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
|
||||||
|
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
|
||||||
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
|
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
|
||||||
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
|
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
|
||||||
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
|
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link SpanishMinimalStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestSpanishMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
analyzer = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(source, new SpanishMinimalStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
analyzer.close();
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test some examples */
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
checkOneTerm(analyzer, "actrices", "actriz");
|
||||||
|
checkOneTerm(analyzer, "niños", "nino");
|
||||||
|
checkOneTerm(analyzer, "países", "pais");
|
||||||
|
checkOneTerm(analyzer, "caragodor", "caragodor");
|
||||||
|
checkOneTerm(analyzer, "móviles", "movil");
|
||||||
|
checkOneTerm(analyzer, "chicas", "chica");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomStrings() throws Exception {
|
||||||
|
checkRandomData(random(), analyzer, 50*RANDOM_MULTIPLIER);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmptyTerm() throws IOException {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
|
return new TokenStreamComponents(tokenizer, new EnglishMinimalStemFilter(tokenizer));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
checkOneTerm(a, "", "");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,47 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the spanish minimal stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestSpanishMinimalStemFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("camisetas");
|
||||||
|
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
((Tokenizer)stream).setReader(reader);
|
||||||
|
stream = tokenFilterFactory("spanishMinimalStem").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "camiseta" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test that bogus arguments result in exception */
|
||||||
|
public void testBogusArguments() throws Exception {
|
||||||
|
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
|
tokenFilterFactory("spanishMinimalStem", "bogusArg", "bogusValue");
|
||||||
|
});
|
||||||
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue