LUCENE-4063: FrenchLightStemmer no longer deletes repeated digits.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339333 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2012-05-16 20:03:49 +00:00
parent 89a796d7bc
commit 31ead5af58
3 changed files with 21 additions and 2 deletions

View File

@ -513,6 +513,9 @@ API Changes
immutable instances of NormalizeCharMap. (Dawid Weiss, Mike
McCandless)
* LUCENE-4063: FrenchLightStemmer no longer deletes repeated digits.
(Tanguy Moal via Steve Rowe)
New features
* LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions

View File

@ -246,7 +246,7 @@ public class FrenchLightStemmer {
char ch = s[0];
for (int i = 1; i < len; i++) {
if (s[i] == ch)
if (s[i] == ch && Character.isLetter(ch))
len = delete(s, i--, len);
else
ch = s[i];
@ -260,7 +260,7 @@ public class FrenchLightStemmer {
if (s[len-1] == 'r') len--;
if (s[len-1] == 'e') len--;
if (s[len-1] == 'e') len--;
if (s[len-1] == s[len-2]) len--;
if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
}
return len;
}

View File

@ -153,6 +153,22 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
checkOneTerm(analyzer, "disposition", "dispos");
checkOneTerm(analyzer, "dispose", "dispos");
// SOLR-3463: overly aggressive compression of repeated characters in numbers
// Trailing repeated char elision :
checkOneTerm(analyzer, "1234555", "1234555");
// Repeated char within numbers with more than 4 characters :
checkOneTerm(analyzer, "12333345", "12333345");
// Short numbers were already unaffected:
checkOneTerm(analyzer, "1234", "1234");
// Ensure behaviour is preserved for words!
// Trailing repeated char elision :
checkOneTerm(analyzer, "abcdeff", "abcdef");
// Repeated char within words with more than 4 characters :
checkOneTerm(analyzer, "abcccddeef", "abcdef");
checkOneTerm(analyzer, "créées", "cre");
// Combined letter and digit repetition
checkOneTerm(analyzer, "22hh00", "22h00"); // 10:00pm
}
/** Test against a vocabulary from the reference impl */