LUCENE-1963: Lowercase before stopfilter in ArabicAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823534 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-10-09 12:55:47 +00:00
parent 990fa50610
commit 956c8cda82
3 changed files with 19 additions and 5 deletions

View File

@ -10,6 +10,11 @@ Changes in backwards compatibility policy
Changes in runtime behavior
* LUCENE-1963: ArabicAnalyzer now lowercases before checking the stopword
list. This has no effect on Arabic text, but if you are using a custom
stopword list that contains some non-Arabic words, you'll need to fully
reindex. (DM Smith via Robert Muir)
API Changes
* LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms

View File

@ -142,13 +142,13 @@ public final class ArabicAnalyzer extends Analyzer {
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
* {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
* and {@link ArabicStemFilter}.
*/
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ArabicLetterTokenizer( reader );
result = new StopFilter( result, stoptable );
result = new LowerCaseFilter(result);
result = new StopFilter( result, stoptable );
result = new ArabicNormalizationFilter( result );
result = new ArabicStemFilter( result );
@ -165,7 +165,7 @@ public final class ArabicAnalyzer extends Analyzer {
* in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
* {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
* and {@link ArabicStemFilter}.
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
@ -174,8 +174,8 @@ public final class ArabicAnalyzer extends Analyzer {
if (streams == null) {
streams = new SavedStreams();
streams.source = new ArabicLetterTokenizer(reader);
streams.result = new StopFilter(streams.source, stoptable);
streams.result = new LowerCaseFilter(streams.result);
streams.result = new LowerCaseFilter(streams.source);
streams.result = new StopFilter(streams.result, stoptable);
streams.result = new ArabicNormalizationFilter(streams.result);
streams.result = new ArabicStemFilter(streams.result);
setPreviousTokenStream(streams);

View File

@ -72,4 +72,13 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
assertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] {
"english", "text" });
}
/**
* Test that custom stopwords work, and are not case-sensitive.
*/
public void testCustomStopwords() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer(new String[] { "the", "and", "a" });
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" });
}
}