mirror of https://github.com/apache/lucene.git
LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer
This commit is contained in:
parent
cb94eeb491
commit
d9d16eec95
|
@ -40,6 +40,9 @@ Improvements
|
||||||
docs on equal scores. Also, remove the ability of TopDocs.merge to set shard
|
docs on equal scores. Also, remove the ability of TopDocs.merge to set shard
|
||||||
indices (Atri Sharma, Adrien Grand, Simon Willnauer)
|
indices (Atri Sharma, Adrien Grand, Simon Willnauer)
|
||||||
|
|
||||||
|
* LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer.
|
||||||
|
(Adrien Gallou via Tomoko Uchida)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
|
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
|
||||||
|
|
|
@ -74,7 +74,7 @@ public class FrenchMinimalStemmer {
|
||||||
if (s[len-1] == 'r') len--;
|
if (s[len-1] == 'r') len--;
|
||||||
if (s[len-1] == 'e') len--;
|
if (s[len-1] == 'e') len--;
|
||||||
if (s[len-1] == 'é') len--;
|
if (s[len-1] == 'é') len--;
|
||||||
if (s[len-1] == s[len-2]) len--;
|
if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -68,6 +68,23 @@ public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
checkOneTerm(analyzer, "baron", "baron");
|
checkOneTerm(analyzer, "baron", "baron");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testIntergerWithLastCharactersEqual() throws IOException {
|
||||||
|
// Trailing repeated char elision :
|
||||||
|
checkOneTerm(analyzer, "1234555", "1234555");
|
||||||
|
// Repeated char within numbers with more than 6 characters :
|
||||||
|
checkOneTerm(analyzer, "12333345", "12333345");
|
||||||
|
// Short numbers were already unaffected:
|
||||||
|
checkOneTerm(analyzer, "1234", "1234");
|
||||||
|
// Ensure behaviour is preserved for words!
|
||||||
|
// Trailing repeated char elision :
|
||||||
|
checkOneTerm(analyzer, "abcdeff", "abcdef");
|
||||||
|
// Repeated char within words with more than 6 characters :
|
||||||
|
checkOneTerm(analyzer, "abcccddeef", "abcccddeef");
|
||||||
|
checkOneTerm(analyzer, "créées", "cré");
|
||||||
|
// Combined letter and digit repetition
|
||||||
|
checkOneTerm(analyzer, "22hh00", "22hh00"); // 10:00pm
|
||||||
|
}
|
||||||
|
|
||||||
public void testKeyword() throws IOException {
|
public void testKeyword() throws IOException {
|
||||||
final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
|
final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
|
||||||
Analyzer a = new Analyzer() {
|
Analyzer a = new Analyzer() {
|
||||||
|
|
Loading…
Reference in New Issue