From 820620f3a7224c6416d001984570897682112676 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 5 Aug 2009 18:22:22 +0000 Subject: [PATCH] LUCENE-1758: Update ArabicAnalyzer to light10 stemming, stopwords improvements, lowercase non-arabic text git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@801348 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/CHANGES.txt | 6 + .../lucene/analysis/ar/ArabicAnalyzer.java | 11 +- .../lucene/analysis/ar/ArabicStemmer.java | 1 + .../apache/lucene/analysis/ar/stopwords.txt | 249 +++--------------- .../analysis/ar/TestArabicAnalyzer.java | 50 +++- .../analysis/ar/TestArabicStemFilter.java | 4 + 6 files changed, 96 insertions(+), 225 deletions(-) diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 2d67273ad0d..741558f1d34 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -8,6 +8,12 @@ Changes in runtime behavior number conversion. You'll need to fully re-index any previously created indexes. This isn't a break in back-compatibility because local Lucene has not yet been released. (Mike McCandless) + + 2. LUCENE-1758: ArabicAnalyzer now uses the light10 algorithm, has a refined + default stopword list, and lowercases non-Arabic text. + You'll need to fully re-index any previously created indexes. This isn't a + break in back-compatibility because ArabicAnalyzer has not yet been + released. (Robert Muir) API Changes diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java index e0606f8ed15..8929cabfa05 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java @@ -27,6 +27,7 @@ import java.util.Hashtable; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WordlistLoader; @@ -36,10 +37,9 @@ import org.apache.lucene.analysis.WordlistLoader; *

* This analyzer implements light-stemming as specified by: * - * Improving Stemming for Arabic Information Retrieval: - * Light Stemming and Co-occurrence Analysis + * Light Stemming for Arabic Information Retrieval * - * http://ciir.cs.umass.edu/pubfiles/ir-249.pdf + * http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf *

* The analysis package contains three primary components: *