LUCENE-1963: Lowercase before stopfilter in ArabicAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823534 13f79535-47bb-0310-9956-ffa450edef68

commit 956c8cda82
parent 990fa50610
CHANGES.txt
@@ -10,6 +10,11 @@ Changes in backwards compatibility policy
 
 Changes in runtime behavior
 
+* LUCENE-1963: ArabicAnalyzer now lowercases before checking the stopword
+  list. This has no effect on Arabic text, but if you are using a custom
+  stopword list that contains some non-Arabic words, you'll need to fully
+  reindex. (DM Smith via Robert Muir)
+
 API Changes
 
 * LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms
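Why the reorder is user-visible: the old chain ran StopFilter on raw tokens, so a capitalized token like "The" slipped past the lowercase stop entry "the" and was indexed after lowercasing. A minimal sketch of the before/after effect, using only the constructor exercised by the new test in this commit:

    // Hedged sketch: a custom English stopword list with the Arabic analyzer.
    ArabicAnalyzer a = new ArabicAnalyzer(new String[] { "the", "and", "a" });

    // Analyzing "The quick brown fox.":
    //   old order (StopFilter -> LowerCaseFilter): "the", "quick", "brown", "fox"
    //     ("The" missed the case-sensitive stop check, then was lowercased and kept)
    //   new order (LowerCaseFilter -> StopFilter): "quick", "brown", "fox"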
ArabicAnalyzer.java
@@ -142,13 +142,13 @@ public final class ArabicAnalyzer extends Analyzer {
    * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
    *
    * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *            {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public final TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new ArabicLetterTokenizer( reader );
-    result = new StopFilter( result, stoptable );
     result = new LowerCaseFilter(result);
+    result = new StopFilter( result, stoptable );
     result = new ArabicNormalizationFilter( result );
     result = new ArabicStemFilter( result );
@@ -165,7 +165,7 @@ public final class ArabicAnalyzer extends Analyzer {
    * in the provided {@link Reader}.
    *
    * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *            {@link StopFilter}, {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}
+   *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *            and {@link ArabicStemFilter}.
    */
   public TokenStream reusableTokenStream(String fieldName, Reader reader)
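The next hunk rewires reusableTokenStream, which caches the assembled chain so repeated calls reset it rather than rebuild it. As context, a rough sketch of the SavedStreams holder it uses (the field names come from this diff; the exact declaration is an assumption):

    // Hypothetical sketch of the per-analyzer stream holder used below.
    private static class SavedStreams {
      Tokenizer source;     // the ArabicLetterTokenizer feeding the chain
      TokenStream result;   // the filtered end of the chain
    }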
@@ -174,8 +174,8 @@ public final class ArabicAnalyzer extends Analyzer {
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new StopFilter(streams.source, stoptable);
-      streams.result = new LowerCaseFilter(streams.result);
+      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new StopFilter(streams.result, stoptable);
       streams.result = new ArabicNormalizationFilter(streams.result);
       streams.result = new ArabicStemFilter(streams.result);
       setPreviousTokenStream(streams);
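Read end to end, the corrected chain in both tokenStream and reusableTokenStream now looks like this (a consolidated sketch using only classes that appear in this diff):

    TokenStream ts = new ArabicLetterTokenizer(reader);
    ts = new LowerCaseFilter(ts);              // case-fold first; a no-op for Arabic text
    ts = new StopFilter(ts, stoptable);        // stop check now sees lowercased tokens
    ts = new ArabicNormalizationFilter(ts);    // Arabic orthographic normalization
    ts = new ArabicStemFilter(ts);             // Arabic light stemming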
TestArabicAnalyzer.java
@@ -72,4 +72,13 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesTo(new ArabicAnalyzer(), "English text.", new String[] {
         "english", "text" });
   }
+
+  /**
+   * Test that custom stopwords work, and are not case-sensitive.
+   */
+  public void testCustomStopwords() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer(new String[] { "the", "and", "a" });
+    assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
+        "brown", "fox" });
+  }
 }