mirror of https://github.com/apache/lucene.git
LUCENE-8936: Add SpanishMinimalStemFilter
Signed-off-by: Tomoko Uchida <tomoko@apache.org>
This commit is contained in:
parent
4050ddc59b
commit
8c8d8abddc
|
@ -56,6 +56,10 @@ API Changes
|
|||
* LUCENE-8909: IndexWriter#getFieldNames() method is used to get fields present in index. After LUCENE-8316, this
|
||||
method is no longer required. Hence, deprecate IndexWriter#getFieldNames() method. (Adrien Grand, Munendra S N)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-8936: Add SpanishMinimalStemFilter (vinod kumar via Tomoko Uchida)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-8874: Show SPI names instead of class names in Luke Analysis tab. (Tomoko Uchida)
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link SpanishMinimalStemmer} to stem Spanish
|
||||
* words.
|
||||
* <p>
|
||||
* To prevent terms from being stemmed use an instance of
|
||||
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
*/
|
||||
public final class SpanishMinimalStemFilter extends TokenFilter {
|
||||
private final SpanishMinimalStemmer stemmer = new SpanishMinimalStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public SpanishMinimalStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link SpanishMinimalStemFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.SpanishMinimalStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* @lucene.spi {@value #NAME}
|
||||
*/
|
||||
public class SpanishMinimalStemFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** SPI name */
|
||||
public static final String NAME = "spanishMinimalStem";
|
||||
|
||||
/** Creates a new SpanishMinimalStemFilterFactory */
|
||||
public SpanishMinimalStemFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) { return new SpanishMinimalStemFilter(input); }
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
/**
 * Minimal plural stemmer for Spanish.
 * <p>
 * This stemmer implements the "plurals" stemmer for the Spanish language: it only
 * touches terms that end in {@code 's'}, folding accented vowels and {@code n-tilde}
 * to their plain ASCII letters and then removing the plural ending.
 * </p>
 */
public class SpanishMinimalStemmer {

  /** Terms shorter than this many chars are never stemmed. */
  private static final int MIN_STEM_LENGTH = 4;

  /**
   * Stems the term held in the first {@code len} chars of {@code s}, in place.
   * <p>
   * Terms shorter than four chars, or not ending in {@code 's'}, are returned
   * untouched (no accent folding is applied to them either). For every other term the
   * accented letters are folded first, then:
   * </p>
   * <ul>
   *   <li>a term ending in {@code "es"} loses both chars, and a {@code 'c'} directly
   *       before that ending becomes {@code 'z'} (e.g. {@code actrices -> actriz});</li>
   *   <li>any other term loses only its final {@code 's'}.</li>
   * </ul>
   *
   * @param s   buffer holding the term; may be modified
   * @param len number of valid chars in {@code s}
   * @return the length of the stemmed term, always {@code <= len}
   */
  public int stem(char[] s, int len) {
    if (len < MIN_STEM_LENGTH || s[len - 1] != 's') {
      return len;
    }

    for (int i = 0; i < len; i++) {
      s[i] = foldAccent(s[i]);
    }

    // Folding never produces or removes an 's', so the term still ends in 's' here.
    if (s[len - 2] != 'e') {
      return len - 1;
    }

    // "-es" plural: "-ces" goes back to the singular "-z".
    if (s[len - 3] == 'c') {
      s[len - 3] = 'z';
    }
    return len - 2;
  }

  /**
   * Maps an accented lowercase vowel or n-tilde to its unaccented ASCII letter and
   * returns every other char unchanged. Case labels use Unicode escapes so the source
   * compiles identically under any file encoding.
   */
  private static char foldAccent(char c) {
    switch (c) {
      case '\u00E0': // a with grave
      case '\u00E1': // a with acute
      case '\u00E2': // a with circumflex
      case '\u00E4': // a with diaeresis
        return 'a';
      case '\u00F2': // o with grave
      case '\u00F3': // o with acute
      case '\u00F4': // o with circumflex
      case '\u00F6': // o with diaeresis
        return 'o';
      case '\u00E8': // e with grave
      case '\u00E9': // e with acute
      case '\u00EA': // e with circumflex
      case '\u00EB': // e with diaeresis
        return 'e';
      case '\u00F9': // u with grave
      case '\u00FA': // u with acute
      case '\u00FB': // u with circumflex
      case '\u00FC': // u with diaeresis
        return 'u';
      case '\u00EC': // i with grave
      case '\u00ED': // i with acute
      case '\u00EE': // i with circumflex
      case '\u00EF': // i with diaeresis
        return 'i';
      case '\u00F1': // n with tilde
        return 'n';
      default:
        return c;
    }
  }
}
|
|
@ -45,6 +45,7 @@ org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
|
|||
org.apache.lucene.analysis.en.KStemFilterFactory
|
||||
org.apache.lucene.analysis.en.PorterStemFilterFactory
|
||||
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
|
||||
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
|
||||
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
|
||||
|
||||
/**
|
||||
* Simple tests for {@link SpanishMinimalStemFilter}
|
||||
*/
|
||||
public class TestSpanishMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(source, new SpanishMinimalStemFilter(source));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
analyzer.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
/** Test some examples */
|
||||
public void testExamples() throws IOException {
|
||||
checkOneTerm(analyzer, "actrices", "actriz");
|
||||
checkOneTerm(analyzer, "niños", "nino");
|
||||
checkOneTerm(analyzer, "países", "pais");
|
||||
checkOneTerm(analyzer, "caragodor", "caragodor");
|
||||
checkOneTerm(analyzer, "móviles", "movil");
|
||||
checkOneTerm(analyzer, "chicas", "chica");
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), analyzer, 50*RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new EnglishMinimalStemFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.es;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the spanish minimal stem factory is working.
|
||||
*/
|
||||
public class TestSpanishMinimalStemFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("camisetas");
|
||||
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
((Tokenizer)stream).setReader(reader);
|
||||
stream = tokenFilterFactory("spanishMinimalStem").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "camiseta" });
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
tokenFilterFactory("spanishMinimalStem", "bogusArg", "bogusValue");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue