mirror of https://github.com/apache/lucene.git
LUCENE-1963: Lowercase before stopfilter in ArabicAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823534 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
990fa50610
commit
956c8cda82
@@ -10,6 +10,11 @@ Changes in backwards compatibility policy
 
 Changes in runtime behavior
 
+* LUCENE-1963: ArabicAnalyzer now lowercases before checking the stopword
+  list. This has no effect on Arabic text, but if you are using a custom
+  stopword list that contains some non-Arabic words, you'll need to fully
+  reindex. (DM Smith via Robert Muir)
+
 API Changes
 
 * LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms
@@ -142,13 +142,13 @@ public final class ArabicAnalyzer extends Analyzer {
    * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
    *
    * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *            {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public final TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new ArabicLetterTokenizer( reader );
-    result = new StopFilter( result, stoptable );
     result = new LowerCaseFilter(result);
+    result = new StopFilter( result, stoptable );
     result = new ArabicNormalizationFilter( result );
     result = new ArabicStemFilter( result );
@@ -165,7 +165,7 @@ public final class ArabicAnalyzer extends Analyzer {
    * in the provided {@link Reader}.
    *
    * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *            {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public TokenStream reusableTokenStream(String fieldName, Reader reader)
@@ -174,8 +174,8 @@ public final class ArabicAnalyzer extends Analyzer {
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new StopFilter(streams.source, stoptable);
-      streams.result = new LowerCaseFilter(streams.result);
+      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new StopFilter(streams.result, stoptable);
       streams.result = new ArabicNormalizationFilter(streams.result);
       streams.result = new ArabicStemFilter(streams.result);
       setPreviousTokenStream(streams);
|
@@ -72,4 +72,13 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] {
         "english", "text" });
   }
+
+  /**
+   * Test that custom stopwords work, and are not case-sensitive.
+   */
+  public void testCustomStopwords() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer(new String[] { "the", "and", "a" });
+    assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
+        "brown", "fox" });
+  }
 }
Loading…
Reference in New Issue