mirror of https://github.com/apache/lucene.git
LUCENE-4063: FrenchLightStemmer no longer deletes repeated digits.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339333 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
89a796d7bc
commit
31ead5af58
|
@ -513,6 +513,9 @@ API Changes
|
|||
immutable instances of NormalizeCharMap. (Dawid Weiss, Mike
|
||||
McCandless)
|
||||
|
||||
* LUCENE-4063: FrenchLightStemmer no longer deletes repeated digits.
|
||||
(Tanguy Moal via Steve Rowe)
|
||||
|
||||
New features
|
||||
|
||||
* LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions
|
||||
|
|
|
@ -246,7 +246,7 @@ public class FrenchLightStemmer {
|
|||
|
||||
char ch = s[0];
|
||||
for (int i = 1; i < len; i++) {
|
||||
if (s[i] == ch)
|
||||
if (s[i] == ch && Character.isLetter(ch))
|
||||
len = delete(s, i--, len);
|
||||
else
|
||||
ch = s[i];
|
||||
|
@ -260,7 +260,7 @@ public class FrenchLightStemmer {
|
|||
if (s[len-1] == 'r') len--;
|
||||
if (s[len-1] == 'e') len--;
|
||||
if (s[len-1] == 'e') len--;
|
||||
if (s[len-1] == s[len-2]) len--;
|
||||
if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
|
|
@ -153,6 +153,22 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
checkOneTerm(analyzer, "disposition", "dispos");
|
||||
checkOneTerm(analyzer, "dispose", "dispos");
|
||||
|
||||
// SOLR-3463: overly aggressive collapsing of repeated characters in numbers
|
||||
// Trailing repeated char elision :
|
||||
checkOneTerm(analyzer, "1234555", "1234555");
|
||||
// Repeated char within numbers with more than 4 characters :
|
||||
checkOneTerm(analyzer, "12333345", "12333345");
|
||||
// Short numbers were already unaffected:
|
||||
checkOneTerm(analyzer, "1234", "1234");
|
||||
// Ensure behaviour is preserved for words!
|
||||
// Trailing repeated char elision :
|
||||
checkOneTerm(analyzer, "abcdeff", "abcdef");
|
||||
// Repeated char within words with more than 4 characters :
|
||||
checkOneTerm(analyzer, "abcccddeef", "abcdef");
|
||||
checkOneTerm(analyzer, "créées", "cre");
|
||||
// Combined letter and digit repetition
|
||||
checkOneTerm(analyzer, "22hh00", "22h00"); // 10:00pm
|
||||
}
|
||||
|
||||
/** Test against a vocabulary from the reference impl */
|
||||
|
|
Loading…
Reference in New Issue