LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer

This commit is contained in:
Adrien Gallou 2019-07-30 21:38:05 +09:00 committed by Tomoko Uchida
parent cb94eeb491
commit d9d16eec95
3 changed files with 22 additions and 2 deletions

View File

@ -40,6 +40,9 @@ Improvements
docs on equal scores. Also, remove the ability of TopDocs.merge to set shard
indices (Atri Sharma, Adrien Grand, Simon Willnauer)
* LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer.
(Adrien Gallou via Tomoko Uchida)
Bug fixes
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while

View File

@ -74,7 +74,7 @@ public class FrenchMinimalStemmer {
if (s[len-1] == 'r') len--;
if (s[len-1] == 'e') len--;
if (s[len-1] == 'é') len--;
if (s[len-1] == s[len-2]) len--;
if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
return len;
}
}

View File

@ -67,7 +67,24 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
checkOneTerm(analyzer, "barons", "baron");
checkOneTerm(analyzer, "baron", "baron");
}
/**
 * Verifies that terms made of digits come out of the analyzer unchanged even when they
 * contain repeated characters, while the expected stems for plain words stay as listed below.
 *
 * @throws IOException if the analysis of a term fails
 */
public void testIntergerWithLastCharactersEqual() throws IOException {
  // Each row is {input term, expected output term}; rows are checked in order.
  final String[][] expectations = {
    // Numbers: a trailing run of identical digits must be kept.
    {"1234555", "1234555"},
    // Numbers longer than 6 characters with a repeated digit in the middle.
    {"12333345", "12333345"},
    // Short numbers: expected to pass through unchanged as well.
    {"1234", "1234"},
    // Words: a trailing doubled letter is still collapsed.
    {"abcdeff", "abcdef"},
    // Words longer than 6 characters with repeated letters in the middle.
    {"abcccddeef", "abcccddeef"},
    {"créées", "cré"},
    // Mixed letter and digit repetition (a time written like 10:00pm).
    {"22hh00", "22hh00"},
  };
  for (String[] row : expectations) {
    checkOneTerm(analyzer, row[0], row[1]);
  }
}
public void testKeyword() throws IOException {
final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
Analyzer a = new Analyzer() {