LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer

This commit is contained in:
Adrien Gallou 2019-07-30 21:38:05 +09:00 committed by Tomoko Uchida
parent cb94eeb491
commit d9d16eec95
3 changed files with 22 additions and 2 deletions

View File

@ -40,6 +40,9 @@ Improvements
docs on equal scores. Also, remove the ability of TopDocs.merge to set shard docs on equal scores. Also, remove the ability of TopDocs.merge to set shard
indices (Atri Sharma, Adrien Grand, Simon Willnauer) indices (Atri Sharma, Adrien Grand, Simon Willnauer)
* LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer.
(Adrien Gallou via Tomoko Uchida)
Bug fixes Bug fixes
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while

View File

@ -74,7 +74,7 @@ public class FrenchMinimalStemmer {
if (s[len-1] == 'r') len--; if (s[len-1] == 'r') len--;
if (s[len-1] == 'e') len--; if (s[len-1] == 'e') len--;
if (s[len-1] == 'é') len--; if (s[len-1] == 'é') len--;
if (s[len-1] == s[len-2]) len--; if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
return len; return len;
} }
} }

View File

@ -68,6 +68,23 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
checkOneTerm(analyzer, "baron", "baron"); checkOneTerm(analyzer, "baron", "baron");
} }
/**
 * Checks how the analyzer treats tokens whose last two characters are equal.
 * Each row of the table is an {input, expected term} pair: digit-only and
 * mixed tokens are expected to come out unchanged, while alphabetic tokens
 * are expected to keep their existing stemmed form.
 */
public void testIntergerWithLastCharactersEqual() throws IOException {
  final String[][] inputAndExpected = {
    // Numbers: a doubled trailing digit is expected to survive.
    {"1234555", "1234555"},
    // Numbers longer than 6 characters with a repetition in the middle.
    {"12333345", "12333345"},
    // Short numbers were already left alone.
    {"1234", "1234"},
    // Words: the doubled trailing letter is still expected to be collapsed.
    {"abcdeff", "abcdef"},
    // Words longer than 6 characters with repetitions in the middle only.
    {"abcccddeef", "abcccddeef"},
    {"créées", "cré"},
    // Letters and digits both repeated (22h00 style time, i.e. 10:00pm).
    {"22hh00", "22hh00"},
  };

  // Rows are checked in declaration order; the first mismatch aborts the test.
  for (final String[] row : inputAndExpected) {
    checkOneTerm(analyzer, row[0], row[1]);
  }
}
public void testKeyword() throws IOException { public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false); final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
Analyzer a = new Analyzer() { Analyzer a = new Analyzer() {