LUCENE-8936: Add SpanishMinimalStemFilter

Signed-off-by: Tomoko Uchida <tomoko@apache.org>
This commit is contained in:
vinod kumar 2019-07-28 23:36:56 +09:00 committed by Tomoko Uchida
parent 4050ddc59b
commit 8c8d8abddc
7 changed files with 322 additions and 0 deletions

View File

@ -56,6 +56,10 @@ API Changes
* LUCENE-8909: IndexWriter#getFieldNames() method is used to get fields present in index. After LUCENE-8316, this
method is no longer required. Hence, deprecate IndexWriter#getFieldNames() method. (Adrien Grand, Munendra S N)
New Features
* LUCENE-8936: Add SpanishMinimalStemFilter (vinod kumar via Tomoko Uchida)
Improvements
* LUCENE-8874: Show SPI names instead of class names in Luke Analysis tab. (Tomoko Uchida)

View File

@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that stems Spanish words by running each term through
 * {@link SpanishMinimalStemmer}.
 * <p>
 * Terms whose {@link KeywordAttribute} is set are passed through unchanged. To
 * protect a term from stemming, place a {@link SetKeywordMarkerFilter} (or any
 * custom {@link TokenFilter} that sets the {@link KeywordAttribute}) before
 * this {@link TokenStream}.
 * </p>
 */
public final class SpanishMinimalStemFilter extends TokenFilter {
  private final SpanishMinimalStemmer stemmer = new SpanishMinimalStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a stem filter that consumes the given stream.
   *
   * @param input the stream whose terms are stemmed
   */
  public SpanishMinimalStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and, unless the term is flagged as a keyword,
   * stems the term in place and shortens it to the stemmed length.
   *
   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword terms must reach the consumer untouched
      return true;
    }
    final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
    termAtt.setLength(stemmedLength);
    return true;
  }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory for {@link SpanishMinimalStemFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_esminstem" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.SpanishMinimalStemFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 * @lucene.spi {@value #NAME}
 */
public class SpanishMinimalStemFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "spanishMinimalStem";

  /**
   * Creates a new SpanishMinimalStemFilterFactory.
   *
   * @param args factory arguments; any entry still present after the superclass
   *        constructor has run is rejected
   * @throws IllegalArgumentException if unknown parameters remain in {@code args}
   */
  public SpanishMinimalStemFilterFactory(Map<String,String> args) {
    super(args);
    if (args.isEmpty()) {
      return;
    }
    // this factory defines no parameters of its own, so leftovers are unknown
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }

  /** Wraps the given stream in a {@link SpanishMinimalStemFilter}. */
  @Override
  public TokenStream create(TokenStream input) {
    return new SpanishMinimalStemFilter(input);
  }
}

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
/**
 * Minimal plural stemmer for Spanish.
 * <p>
 * This stemmer implements the "plurals" stemmer for the
 * Spanish language: it only removes plural endings.
 * </p>
 */
public class SpanishMinimalStemmer {

  /**
   * Stems, in place, the term stored in the first {@code len} chars of {@code s}.
   * <p>
   * Terms shorter than four chars, or not ending in {@code 's'}, are returned
   * untouched; in particular their accents are NOT folded. For every other term,
   * accented vowels and {@code ñ} are folded to the plain letter and the plural
   * ending is removed:
   * </p>
   * <ul>
   *   <li>{@code -ces} becomes {@code -z} ("actrices" to "actriz");</li>
   *   <li>any other {@code -es} is dropped ("países" to "pais");</li>
   *   <li>otherwise only the final {@code s} is dropped ("chicas" to "chica").</li>
   * </ul>
   *
   * @param s buffer holding the term; it is modified in place
   * @param len number of valid chars in {@code s}
   * @return the length of the stemmed term, never greater than {@code len}
   */
  public int stem(char[] s, int len) {
    if (len < 4 || s[len - 1] != 's') {
      return len;
    }

    for (int i = 0; i < len; i++) {
      s[i] = foldAccent(s[i]);
    }

    // From here on the term is known to end in 's' (folding never alters an 's').
    if (s[len - 2] != 'e') {
      return len - 1;
    }
    if (s[len - 3] == 'c') {
      // "-ces" plurals come from singulars ending in "-z"
      s[len - 3] = 'z';
    }
    return len - 2;
  }

  /** Maps an accented vowel or {@code ñ} to its plain letter; returns any other char unchanged. */
  private static char foldAccent(char c) {
    switch (c) {
      case 'à':
      case 'á':
      case 'â':
      case 'ä':
        return 'a';
      case 'ò':
      case 'ó':
      case 'ô':
      case 'ö':
        return 'o';
      case 'è':
      case 'é':
      case 'ê':
      case 'ë':
        return 'e';
      case 'ù':
      case 'ú':
      case 'û':
      case 'ü':
        return 'u';
      case 'ì':
      case 'í':
      case 'î':
      case 'ï':
        return 'i';
      case 'ñ':
        return 'n';
      default:
        return c;
    }
  }
}

View File

@ -45,6 +45,7 @@ org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
org.apache.lucene.analysis.en.KStemFilterFactory
org.apache.lucene.analysis.en.PorterStemFilterFactory
org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
/**
 * Simple tests for {@link SpanishMinimalStemFilter}
 */
public class TestSpanishMinimalStemFilter extends BaseTokenStreamTestCase {
  /** Whitespace-tokenizing analyzer that applies only the Spanish minimal stem filter. */
  private Analyzer analyzer;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(source, new SpanishMinimalStemFilter(source));
      }
    };
  }

  @Override
  public void tearDown() throws Exception {
    analyzer.close();
    super.tearDown();
  }

  /** Test some examples */
  public void testExamples() throws IOException {
    checkOneTerm(analyzer, "actrices", "actriz");
    checkOneTerm(analyzer, "niños", "nino");
    checkOneTerm(analyzer, "países", "pais");
    checkOneTerm(analyzer, "caragodor", "caragodor");
    checkOneTerm(analyzer, "móviles", "movil");
    checkOneTerm(analyzer, "chicas", "chica");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 50*RANDOM_MULTIPLIER);
  }

  /** The filter must pass an empty term through unchanged. */
  public void testEmptyTerm() throws IOException {
    // try-with-resources closes the analyzer even when the assertion fails
    try (Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        // exercise the Spanish filter, which is the filter under test here
        return new TokenStreamComponents(tokenizer, new SpanishMinimalStemFilter(tokenizer));
      }
    }) {
      checkOneTerm(a, "", "");
    }
  }
}

View File

@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.es;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
 * Simple tests to ensure the Spanish minimal stem factory is working.
 */
public class TestSpanishMinimalStemFilterFactory extends BaseTokenStreamFactoryTestCase {

  /** A filter obtained through the SPI name strips the plural ending. */
  public void testStemming() throws Exception {
    Reader input = new StringReader("camisetas");
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(input);
    TokenStream stemmed = tokenFilterFactory("spanishMinimalStem").create(tokenizer);
    assertTokenStreamContents(stemmed, new String[] { "camiseta" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException expected = expectThrows(
        IllegalArgumentException.class,
        () -> tokenFilterFactory("spanishMinimalStem", "bogusArg", "bogusValue"));
    String message = expected.getMessage();
    assertTrue(message.contains("Unknown parameters"));
  }
}