From 31ead5af5869cb4b42658e01bcc6fa28166e3cf7 Mon Sep 17 00:00:00 2001 From: Steven Rowe Date: Wed, 16 May 2012 20:03:49 +0000 Subject: [PATCH] LUCENE-4063: FrenchLightStemmer no longer deletes repeated digits. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339333 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 +++ .../lucene/analysis/fr/FrenchLightStemmer.java | 4 ++-- .../analysis/fr/TestFrenchLightStemFilter.java | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6025da6c306..dce825156f8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -513,6 +513,9 @@ API Changes immutable instances of NormalizeCharMap. (Dawid Weiss, Mike McCandless) +* LUCENE-4063: FrenchLightStemmer no longer deletes repeated digits. + (Tanguy Moal via Steve Rowe) + New features * LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java index 43e2e0625d1..5be44bc032e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java @@ -246,7 +246,7 @@ public class FrenchLightStemmer { char ch = s[0]; for (int i = 1; i < len; i++) { - if (s[i] == ch) + if (s[i] == ch && Character.isLetter(ch)) len = delete(s, i--, len); else ch = s[i]; @@ -260,7 +260,7 @@ public class FrenchLightStemmer { if (s[len-1] == 'r') len--; if (s[len-1] == 'e') len--; if (s[len-1] == 'e') len--; - if (s[len-1] == s[len-2]) len--; + if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--; } return len; } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java index 6012e8282a9..d0f7af15c24 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java @@ -153,6 +153,22 @@ public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase { checkOneTerm(analyzer, "disposition", "dispos"); checkOneTerm(analyzer, "dispose", "dispos"); + + // SOLR-3463 : abusive compression of repeated characters in numbers + // Trailing repeated char elision : + checkOneTerm(analyzer, "1234555", "1234555"); + // Repeated char within numbers with more than 4 characters : + checkOneTerm(analyzer, "12333345", "12333345"); + // Short numbers weren't affected already: + checkOneTerm(analyzer, "1234", "1234"); + // Ensure behaviour is preserved for words! + // Trailing repeated char elision : + checkOneTerm(analyzer, "abcdeff", "abcdef"); + // Repeated char within words with more than 4 characters : + checkOneTerm(analyzer, "abcccddeef", "abcdef"); + checkOneTerm(analyzer, "créées", "cre"); + // Combined letter and digit repetition + checkOneTerm(analyzer, "22hh00", "22h00"); // 10:00pm } /** Test against a vocabulary from the reference impl */