diff --git a/CHANGES.txt b/CHANGES.txt
index 9b51789dcad..9494cfa6fa8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -34,6 +34,8 @@ New features
static methods. (Shalin Shekhar Mangar via Mike McCandless)
+ 3. LUCENE-1406: Added Arabic analyzer. (Robert Muir via Grant Ingersoll)
+
Optimizations
Documentation
diff --git a/NOTICE.txt b/NOTICE.txt
index 92fd3447bc7..c826ff590be 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -9,3 +9,8 @@ The snowball stemmers in
were developed by Martin Porter and Richard Boulton.
The full snowball package is available from
http://snowball.tartarus.org/
+
+The Arabic analyzer (contrib/analyzers) comes with a default
+stopword list that is BSD-licensed, created by Jacques Savoy. The file
+resides in contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt.
+See http://members.unine.ch/jacques.savoy/clef/index.html.
\ No newline at end of file
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
new file mode 100644
index 00000000000..e0606f8ed15
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -0,0 +1,124 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WordlistLoader;
+
+/**
+ * Analyzer for Arabic.
+ *
+ * This analyzer implements light-stemming as specified by:
+ *
+ * Improving Stemming for Arabic Information Retrieval:
+ * Light Stemming and Co-occurrence Analysis
+ *
+ * http://ciir.cs.umass.edu/pubfiles/ir-249.pdf
+ *
+ * The analysis package contains three primary components:
+ *
+ * - {@link ArabicNormalizationFilter}: Arabic orthographic normalization.
+ *
+ * - {@link ArabicStemFilter}: Arabic light stemming.
+ *
+ * - Arabic stop words file: a set of default Arabic stop words.
+ *
+ *
+ */
+public final class ArabicAnalyzer extends Analyzer {
+
+ /**
+ * File containing default Arabic stopwords.
+ *
+ * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+   * The stopword list is BSD-licensed.
+ */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ /**
+ * Contains the stopwords used with the StopFilter.
+ */
+ private Set stoptable = new HashSet();
+ /**
+   * The comment character in the stopwords file. All lines prefixed with this will be ignored.
+ */
+ public static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public ArabicAnalyzer() {
+ try {
+ InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
+ InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+ stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
+ reader.close();
+ stream.close();
+ } catch (IOException e) {
+ // TODO: throw IOException
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ */
+ public ArabicAnalyzer( String[] stopwords ) {
+ stoptable = StopFilter.makeStopSet( stopwords );
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ */
+ public ArabicAnalyzer( Hashtable stopwords ) {
+ stoptable = new HashSet(stopwords.keySet());
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
+ */
+ public ArabicAnalyzer( File stopwords ) throws IOException {
+ stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
+ }
+
+
+ /**
+ * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ *
+   * @return  A TokenStream built from an ArabicLetterTokenizer filtered with
+   *            StopFilter, ArabicNormalizationFilter and ArabicStemFilter.
+ */
+ public final TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream result = new ArabicLetterTokenizer( reader );
+ result = new StopFilter( result, stoptable );
+ result = new ArabicNormalizationFilter( result );
+ result = new ArabicStemFilter( result );
+
+ return result;
+ }
+}
+
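A minimal usage sketch of the analyzer above (not part of this patch): it feeds a sample word through the full chain using the reusable-token API exercised by the tests further down. The class name, the field name "contents" and the sample word are illustrative only.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ar.ArabicAnalyzer;

    public class ArabicAnalyzerDemo {
      public static void main(String[] args) throws IOException {
        ArabicAnalyzer analyzer = new ArabicAnalyzer();
        // "contents" is an arbitrary field name for this sketch.
        TokenStream stream = analyzer.tokenStream("contents", new StringReader("والحسن"));
        final Token reusableToken = new Token();
        for (Token nextToken = stream.next(reusableToken); nextToken != null;
             nextToken = stream.next(reusableToken)) {
          // Each term has been stop-filtered, normalized and light-stemmed;
          // per TestArabicStemFilter, "والحسن" is expected to come out as "حسن".
          System.out.println(nextToken.term());
        }
        stream.close();
      }
    }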
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
new file mode 100644
index 00000000000..d414ef1b5cb
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
@@ -0,0 +1,43 @@
+package org.apache.lucene.analysis.ar;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.LetterTokenizer;
+
+/**
+ * Tokenizer that breaks text into runs of letters and diacritics.
+ *
+ * The problem with the standard {@link LetterTokenizer} is that it splits on
+ * diacritics, which are not in the Letter category. Similar handling is
+ * necessary for Indic scripts, Hebrew, Thaana, etc.
+ */
+public class ArabicLetterTokenizer extends LetterTokenizer {
+
+ public ArabicLetterTokenizer(Reader in) {
+ super(in);
+ }
+
+ /**
+ * Allows for Letter category or NonspacingMark category
+ * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
+ */
+ protected boolean isTokenChar(char c) {
+ return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK;
+ }
+
+}
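A short sketch (not part of the patch) of why the override matters: the harakat are Unicode non-spacing marks, so Character.isLetter() rejects them and a plain LetterTokenizer would split a vocalized word at the diacritic, while this tokenizer keeps the word whole for the normalization filter to clean up later. The class name is illustrative.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;

    public class ArabicLetterTokenizerDemo {
      public static void main(String[] args) throws IOException {
        // "مَبنا" carries a fatha (U+064E): not a letter, but NON_SPACING_MARK,
        // so isTokenChar() accepts it and the word is emitted as a single token.
        ArabicLetterTokenizer tokenizer =
            new ArabicLetterTokenizer(new StringReader("مَبنا"));
        Token token = tokenizer.next(new Token());
        System.out.println(token.term()); // the whole vocalized word, one token
        tokenizer.close();
      }
    }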
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
new file mode 100644
index 00000000000..90b504fc446
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
+ *
+ */
+
+public class ArabicNormalizationFilter extends TokenFilter {
+
+ protected ArabicNormalizer normalizer = null;
+
+ protected ArabicNormalizationFilter(TokenStream input) {
+ super(input);
+ normalizer = new ArabicNormalizer();
+ }
+
+
+
+ public Token next(Token reusableToken) throws IOException {
+ if ((reusableToken = input.next(reusableToken)) == null) {
+ return null;
+ } else {
+ int oldlen = reusableToken.termLength();
+ int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
+ if (oldlen != newlen)
+ reusableToken.setTermLength(newlen);
+ return reusableToken;
+ }
+ }
+}
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
new file mode 100644
index 00000000000..6693c03b90b
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
@@ -0,0 +1,102 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Normalizer for Arabic.
+ *
+ * Normalization is done in-place for efficiency, operating on a termbuffer.
+ *
+ * Normalization is defined as:
+ *
+ * - Normalization of hamza with alef seat to a bare alef.
+ *
+ * - Normalization of teh marbuta to heh.
+ *
+ * - Normalization of dotless yeh (alef maksura) to yeh.
+ *
+ * - Removal of Arabic diacritics (the harakat).
+ *
+ * - Removal of tatweel (stretching character).
+ *
+ *
+ */
+public class ArabicNormalizer {
+ public static final char ALEF = '\u0627';
+ public static final char ALEF_MADDA = '\u0622';
+ public static final char ALEF_HAMZA_ABOVE = '\u0623';
+ public static final char ALEF_HAMZA_BELOW = '\u0625';
+
+ public static final char YEH = '\u064A';
+ public static final char DOTLESS_YEH = '\u0649';
+
+ public static final char TEH_MARBUTA = '\u0629';
+ public static final char HEH = '\u0647';
+
+ public static final char TATWEEL = '\u0640';
+
+ public static final char FATHATAN = '\u064B';
+ public static final char DAMMATAN = '\u064C';
+ public static final char KASRATAN = '\u064D';
+ public static final char FATHA = '\u064E';
+ public static final char DAMMA = '\u064F';
+ public static final char KASRA = '\u0650';
+ public static final char SHADDA = '\u0651';
+ public static final char SUKUN = '\u0652';
+
+ /**
+ * Normalize an input buffer of Arabic text
+ *
+ * @param s input buffer
+ * @param len length of input buffer
+ * @return length of input buffer after normalization
+ */
+ public int normalize(char s[], int len) {
+
+ for (int i = 0; i < len; i++) {
+ if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
+ s[i] = ALEF;
+
+ if (s[i] == DOTLESS_YEH)
+ s[i] = YEH;
+
+ if (s[i] == TEH_MARBUTA)
+ s[i] = HEH;
+
+ if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
+ s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
+ len = delete(s, i, len);
+ i--;
+ }
+ }
+
+ return len;
+ }
+
+ /**
+ * Delete a character in-place
+ *
+ * @param s Input Buffer
+ * @param pos Position of character to delete
+ * @param len length of input buffer
+ * @return length of input buffer after deletion
+ */
+ protected int delete(char s[], int pos, int len) {
+ if (pos < len)
+ System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+ return len - 1;
+ }
+
+}
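A small sketch (not part of the patch) of the in-place contract: the caller hands over a term buffer together with its logical length and keeps only the returned length. The input literal is the one used by testFathatan in TestArabicNormalizationFilter; the class name is illustrative.

    import org.apache.lucene.analysis.ar.ArabicNormalizer;

    public class ArabicNormalizerDemo {
      public static void main(String[] args) {
        ArabicNormalizer normalizer = new ArabicNormalizer();
        char[] buffer = "ولداً".toCharArray();          // fathatan on the trailing alef
        int len = normalizer.normalize(buffer, buffer.length);
        // Characters past the returned length are stale and must be ignored.
        System.out.println(new String(buffer, 0, len)); // "ولدا", as in testFathatan
      }
    }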
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
new file mode 100644
index 00000000000..dd615d7b691
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
@@ -0,0 +1,61 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.
+ *
+ */
+
+public class ArabicStemFilter extends TokenFilter {
+
+ protected ArabicStemmer stemmer = null;
+
+ protected ArabicStemFilter(TokenStream input) {
+ super(input);
+ stemmer = new ArabicStemmer();
+ }
+
+
+
+ /**
+ * @return Returns the next token in the stream, or null at EOS
+ */
+ public Token next(Token reusableToken) throws IOException {
+ if ((reusableToken = input.next(reusableToken)) == null) {
+ return null;
+ } else {
+ int oldlen = reusableToken.termLength();
+ int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen);
+ if (oldlen != newlen)
+ reusableToken.setTermLength(newlen);
+ return reusableToken;
+ }
+ }
+}
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
new file mode 100644
index 00000000000..3e84a75e2b1
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java
@@ -0,0 +1,177 @@
+package org.apache.lucene.analysis.ar;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Stemmer for Arabic.
+ *
+ * Stemming is done in-place for efficiency, operating on a termbuffer.
+ *
+ * Stemming is defined as:
+ *
+ * - Removal of attached definite article, conjunction, and prepositions.
+ *
+ * - Stemming of common suffixes.
+ *
+ *
+ */
+public class ArabicStemmer {
+ public static final char ALEF = '\u0627';
+ public static final char BEH = '\u0628';
+ public static final char TEH_MARBUTA = '\u0629';
+ public static final char TEH = '\u062A';
+ public static final char FEH = '\u0641';
+ public static final char KAF = '\u0643';
+ public static final char LAM = '\u0644';
+ public static final char NOON = '\u0646';
+ public static final char HEH = '\u0647';
+ public static final char WAW = '\u0648';
+ public static final char YEH = '\u064A';
+
+ public static final char prefixes[][] = {
+ ("" + ALEF + LAM).toCharArray(),
+ ("" + WAW + ALEF + LAM).toCharArray(),
+ ("" + BEH + ALEF + LAM).toCharArray(),
+ ("" + KAF + ALEF + LAM).toCharArray(),
+ ("" + FEH + ALEF + LAM).toCharArray(),
+ ("" + WAW).toCharArray(),
+ };
+
+ public static final char suffixes[][] = {
+ ("" + HEH + ALEF).toCharArray(),
+ ("" + ALEF + NOON).toCharArray(),
+ ("" + ALEF + TEH).toCharArray(),
+ ("" + WAW + NOON).toCharArray(),
+ ("" + YEH + NOON).toCharArray(),
+ ("" + YEH + HEH).toCharArray(),
+ ("" + YEH + TEH_MARBUTA).toCharArray(),
+ ("" + HEH).toCharArray(),
+ ("" + TEH_MARBUTA).toCharArray(),
+ ("" + YEH).toCharArray(),
+  };
+
+ /**
+ * Stem an input buffer of Arabic text.
+ *
+ * @param s input buffer
+ * @param len length of input buffer
+   * @return length of input buffer after stemming
+ */
+ public int stem(char s[], int len) {
+ len = stemPrefix(s, len);
+ len = stemSuffix(s, len);
+
+ return len;
+ }
+
+ /**
+ * Stem a prefix off an Arabic word.
+ * @param s input buffer
+ * @param len length of input buffer
+ * @return new length of input buffer after stemming.
+ */
+ public int stemPrefix(char s[], int len) {
+ for (int i = 0; i < prefixes.length; i++)
+ if (startsWith(s, len, prefixes[i]))
+ return deleteN(s, 0, len, prefixes[i].length);
+ return len;
+ }
+
+ /**
+ * Stem suffix(es) off an Arabic word.
+ * @param s input buffer
+ * @param len length of input buffer
+ * @return new length of input buffer after stemming
+ */
+ public int stemSuffix(char s[], int len) {
+ for (int i = 0; i < suffixes.length; i++)
+ if (endsWith(s, len, suffixes[i]))
+ len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
+ return len;
+ }
+
+ /**
+ * Returns true if the prefix matches and can be stemmed
+ * @param s input buffer
+ * @param len length of input buffer
+ * @param prefix prefix to check
+ * @return true if the prefix matches and can be stemmed
+ */
+ boolean startsWith(char s[], int len, char prefix[]) {
+    if (prefix.length == 1 && len < 4) { // the waw prefix requires at least 3 characters to remain
+      return false;
+    } else if (len < prefix.length + 2) { // other prefixes require at least 2 characters to remain
+ return false;
+ } else {
+ for (int i = 0; i < prefix.length; i++)
+ if (s[i] != prefix[i])
+ return false;
+
+ return true;
+ }
+ }
+
+ /**
+ * Returns true if the suffix matches and can be stemmed
+ * @param s input buffer
+ * @param len length of input buffer
+ * @param suffix suffix to check
+ * @return true if the suffix matches and can be stemmed
+ */
+ boolean endsWith(char s[], int len, char suffix[]) {
+ if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
+ return false;
+ } else {
+ for (int i = 0; i < suffix.length; i++)
+ if (s[len - suffix.length + i] != suffix[i])
+ return false;
+
+ return true;
+ }
+ }
+
+
+ /**
+ * Delete n characters in-place
+ *
+ * @param s Input Buffer
+ * @param pos Position of character to delete
+ * @param len Length of input buffer
+ * @param nChars number of characters to delete
+ * @return length of input buffer after deletion
+ */
+ protected int deleteN(char s[], int pos, int len, int nChars) {
+ for (int i = 0; i < nChars; i++)
+ len = delete(s, pos, len);
+ return len;
+ }
+
+ /**
+ * Delete a character in-place
+ *
+ * @param s Input Buffer
+ * @param pos Position of character to delete
+ * @param len length of input buffer
+ * @return length of input buffer after deletion
+ */
+ protected int delete(char s[], int pos, int len) {
+ if (pos < len)
+ System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+ return len - 1;
+ }
+
+}
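The stemmer follows the same in-place convention as the normalizer. A brief sketch (not part of the patch), using the word from testWalPrefix in TestArabicStemFilter; the class name is illustrative.

    import org.apache.lucene.analysis.ar.ArabicStemmer;

    public class ArabicStemmerDemo {
      public static void main(String[] args) {
        ArabicStemmer stemmer = new ArabicStemmer();
        char[] buffer = "والحسن".toCharArray();         // waw conjunction + al- article
        int len = stemmer.stem(buffer, buffer.length);
        System.out.println(new String(buffer, 0, len)); // "حسن", as in testWalPrefix
      }
    }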
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html
new file mode 100644
index 00000000000..0a8f4964283
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+Analyzer for Arabic.
+</body>
+</html>
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
new file mode 100644
index 00000000000..4bb557b3138
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
@@ -0,0 +1,350 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+ب
+ا
+أ
+،
+عشر
+عبد
+عدد
+عدة
+عشرة
+عدم
+عام
+عاما
+عرفات
+عن
+عند
+عمان
+عندما
+على
+علي
+عليه
+عليها
+عملية
+زيارة
+سبتمبر
+ساراييفو
+سنة
+سوريا
+سنوات
+تشرين
+تم
+تموز
+ضد
+بعد
+بعض
+اعادة
+اعلن
+اعلنت
+حزب
+حزيران
+بسبب
+اسرائيل
+حسين
+حتى
+اتفاق
+صرب
+اذا
+احد
+اثر
+غزة
+برس
+باسم
+اجتماع
+غدا
+شخصا
+صباح
+اطار
+اربعة
+بغداد
+اخرى
+باريس
+رابين
+شرق
+بان
+ابو
+اجل
+غير
+حركة
+رئيس
+جديدة
+اطلاق
+بشكل
+بطولة
+صحيفة
+حاليا
+بن
+به
+ثم
+اف
+ان
+او
+اي
+بها
+جهة
+صفر
+حيث
+اكد
+الا
+اما
+العسكرية
+العراق
+العاصمة
+العربية
+العراقي
+العراقية
+العام
+العالم
+العلاقات
+العمل
+امس
+السعودية
+الساعة
+السبت
+السابق
+روسيا
+السلطة
+السلطات
+السلام
+التعاون
+التحرير
+التى
+التي
+اكتوبر
+دورة
+اكثر
+ايار
+ايضا
+الجزائر
+حماس
+الاسرائيلي
+الاسرائيلية
+الاسبوع
+الاسلحة
+الاسلامية
+ذكرت
+الاتحاد
+الاتفاق
+ثلاثة
+الحرب
+الاحد
+الذاتي
+الشرطة
+الاربعاء
+الغربية
+الخارجية
+الاردن
+الشرق
+ايران
+الحدود
+الرئيس
+الاخيرة
+الثاني
+الثانية
+الاثنين
+شمال
+بيان
+دمشق
+الذى
+الذي
+الان
+امام
+ايام
+خلال
+الشيخ
+الجيش
+الدور
+الضفة
+الجمعة
+بيريز
+الاوسط
+الروسي
+البوسنة
+الروسية
+بيروت
+الانتخابات
+البلاد
+الدفاع
+الثلثاء
+الانباء
+الثلاثاء
+الاوروبي
+حوالى
+الذين
+الدول
+الحكم
+الامم
+الامن
+الاول
+الدولة
+الخليج
+الخميس
+الاميركي
+الاميركية
+الدولي
+الاولى
+الدولية
+الحكومة
+بين
+ذلك
+دول
+دون
+حول
+حين
+الف
+الى
+انه
+اول
+ضمن
+جنوب
+دولة
+انها
+جميع
+الوزراء
+المتحدث
+المتحدة
+دولار
+النار
+الوضع
+القدس
+المحتلة
+المصدر
+المباراة
+المصري
+الماضي
+المصرية
+المرحلة
+القدم
+اللجنة
+المجلس
+الفرنسي
+الفرنسية
+القاهرة
+المدينة
+المانيا
+الوطنية
+المجموعة
+الله
+الفلسطيني
+الفلسطينية
+الفلسطينيين
+الوقت
+المقرر
+القوات
+النهائي
+المقبل
+المنطقة
+الولايات
+المفاوضات
+الملك
+اليمن
+اليوم
+ايلول
+الكويت
+ـ
+ف
+و
+و6
+قد
+لا
+ما
+مع
+وزارة
+وزير
+مساء
+قتل
+كرة
+مصر
+هذا
+فاز
+كأس
+ياسر
+قرار
+مصدر
+واحد
+قطاع
+مصادر
+مباراة
+مبارك
+واضاف
+واضافت
+فرانس
+واشنطن
+فان
+قبل
+قال
+كان
+لدى
+نحو
+هذه
+وان
+محمد
+واكد
+يذكر
+مجلس
+فرنسا
+كريستوفر
+كانت
+واوضح
+لبنان
+مايو
+مدينة
+مجموعة
+كانون
+فى
+في
+كل
+لم
+لن
+له
+من
+هو
+هي
+قوة
+كما
+لها
+منذ
+وقد
+ولا
+نفسه
+موسكو
+مقتل
+لقاء
+لكرة
+نقطة
+قوات
+مقابل
+لندن
+هناك
+وقال
+وكان
+منطقة
+منظمة
+نهاية
+وكالة
+وقالت
+وكانت
+للامم
+فيه
+كلم
+لكن
+وفي
+وقف
+ولم
+ومن
+وهو
+وهي
+يوم
+فيها
+منها
+مليار
+لوكالة
+يكون
+يمكن
+كلينتون
+مليون
+يوليو
+يونيو
+نيويورك
diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
new file mode 100644
index 00000000000..bbebe979a0c
--- /dev/null
+++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
@@ -0,0 +1,106 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * Test the Arabic Normalization Filter
+ *
+ */
+public class TestArabicNormalizationFilter extends TestCase {
+
+ public void testAlifMadda() throws IOException {
+ check("آجن", "اجن");
+ }
+
+ public void testAlifHamzaAbove() throws IOException {
+ check("أحمد", "احمد");
+ }
+
+ public void testAlifHamzaBelow() throws IOException {
+ check("إعاذ", "اعاذ");
+ }
+
+ public void testAlifMaksura() throws IOException {
+ check("بنى", "بني");
+ }
+
+ public void testTehMarbuta() throws IOException {
+ check("فاطمة", "فاطمه");
+ }
+
+ public void testTatweel() throws IOException {
+ check("روبرـــــت", "روبرت");
+ }
+
+ public void testFatha() throws IOException {
+ check("مَبنا", "مبنا");
+ }
+
+ public void testKasra() throws IOException {
+ check("علِي", "علي");
+ }
+
+ public void testDamma() throws IOException {
+ check("بُوات", "بوات");
+ }
+
+ public void testFathatan() throws IOException {
+ check("ولداً", "ولدا");
+ }
+
+ public void testKasratan() throws IOException {
+ check("ولدٍ", "ولد");
+ }
+
+ public void testDammatan() throws IOException {
+ check("ولدٌ", "ولد");
+ }
+
+ public void testSukun() throws IOException {
+ check("نلْسون", "نلسون");
+ }
+
+ public void testShaddah() throws IOException {
+ check("هتميّ", "هتمي");
+ }
+
+ private void check(final String input, final String expected) throws IOException {
+ ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+ ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
+ final Token reusableToken = new Token();
+ Token nextToken = filter.next(reusableToken);
+ if (nextToken == null)
+ fail();
+ assertEquals(expected, nextToken.term());
+ filter.close();
+ }
+
+}
diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
new file mode 100644
index 00000000000..01dc5449ade
--- /dev/null
+++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.ar;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * Test the Arabic Stem Filter
+ *
+ */
+public class TestArabicStemFilter extends TestCase {
+
+ public void testAlPrefix() throws IOException {
+ check("الحسن", "حسن");
+ }
+
+ public void testWalPrefix() throws IOException {
+ check("والحسن", "حسن");
+ }
+
+ public void testBalPrefix() throws IOException {
+ check("بالحسن", "حسن");
+ }
+
+ public void testKalPrefix() throws IOException {
+ check("كالحسن", "حسن");
+ }
+
+ public void testFalPrefix() throws IOException {
+ check("فالحسن", "حسن");
+ }
+
+ public void testWaPrefix() throws IOException {
+ check("وحسن", "حسن");
+ }
+
+ public void testAhSuffix() throws IOException {
+ check("زوجها", "زوج");
+ }
+
+ public void testAnSuffix() throws IOException {
+ check("ساهدان", "ساهد");
+ }
+
+ public void testAtSuffix() throws IOException {
+ check("ساهدات", "ساهد");
+ }
+
+ public void testWnSuffix() throws IOException {
+ check("ساهدون", "ساهد");
+ }
+
+ public void testYnSuffix() throws IOException {
+ check("ساهدين", "ساهد");
+ }
+
+ public void testYhSuffix() throws IOException {
+ check("ساهديه", "ساهد");
+ }
+
+ public void testYpSuffix() throws IOException {
+ check("ساهدية", "ساهد");
+ }
+
+ public void testHSuffix() throws IOException {
+ check("ساهده", "ساهد");
+ }
+
+ public void testPSuffix() throws IOException {
+ check("ساهدة", "ساهد");
+ }
+
+ public void testYSuffix() throws IOException {
+ check("ساهدي", "ساهد");
+ }
+
+ public void testComboPrefSuf() throws IOException {
+ check("وساهدون", "ساهد");
+ }
+
+ public void testComboSuf() throws IOException {
+ check("ساهدهات", "ساهد");
+ }
+
+ public void testShouldntStem() throws IOException {
+ check("الو", "الو");
+ }
+
+ public void testNonArabic() throws IOException {
+ check("English", "English");
+ }
+
+ private void check(final String input, final String expected) throws IOException {
+ ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+ ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
+ final Token reusableToken = new Token();
+ Token nextToken = filter.next(reusableToken);
+ if (nextToken == null)
+ fail();
+ assertEquals(expected, nextToken.term());
+ filter.close();
+ }
+
+}
diff --git a/src/java/org/apache/lucene/analysis/WordlistLoader.java b/src/java/org/apache/lucene/analysis/WordlistLoader.java
index 6a5f9d6941a..ca7705081c2 100644
--- a/src/java/org/apache/lucene/analysis/WordlistLoader.java
+++ b/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -56,6 +56,31 @@ public class WordlistLoader {
return result;
}
+ /**
+ * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
+ * leading and trailing whitespace). Every line of the file should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param wordfile File containing the wordlist
+ * @param comment The comment string to ignore
+ * @return A HashSet with the file's words
+ */
+ public static HashSet getWordSet(File wordfile, String comment) throws IOException {
+ HashSet result = new HashSet();
+ FileReader reader = null;
+ try {
+ reader = new FileReader(wordfile);
+ result = getWordSet(reader, comment);
+ }
+ finally {
+ if (reader != null)
+ reader.close();
+ }
+ return result;
+ }
+
+
/**
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
@@ -86,6 +111,41 @@ public class WordlistLoader {
return result;
}
+ /**
+ * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
+ * leading and trailing whitespace). Every line of the Reader should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+ *
+ * @param reader Reader containing the wordlist
+ * @param comment The string representing a comment.
+ * @return A HashSet with the reader's words
+ */
+ public static HashSet getWordSet(Reader reader, String comment) throws IOException {
+ HashSet result = new HashSet();
+ BufferedReader br = null;
+ try {
+ if (reader instanceof BufferedReader) {
+ br = (BufferedReader) reader;
+ } else {
+ br = new BufferedReader(reader);
+ }
+ String word = null;
+ while ((word = br.readLine()) != null) {
+ if (word.startsWith(comment) == false){
+ result.add(word.trim());
+ }
+ }
+ }
+ finally {
+ if (br != null)
+ br.close();
+ }
+ return result;
+ }
+
+
+
/**
* Reads a stem dictionary. Each line contains:
* word\tstem
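A minimal sketch (not part of this patch) of the new comment-aware overload, mirroring the input used by testComments in TestWordlistLoader; the class name is illustrative.

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashSet;

    import org.apache.lucene.analysis.WordlistLoader;

    public class StopwordLoadingDemo {
      public static void main(String[] args) throws IOException {
        String words = "ONE\n two \nthree\n#comment";
        HashSet stopSet = WordlistLoader.getWordSet(new StringReader(words), "#");
        // stopSet contains "ONE", "two" and "three"; the "#comment" line is skipped.
        System.out.println(stopSet.size()); // 3
      }
    }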
diff --git a/src/test/org/apache/lucene/index/TestWordlistLoader.java b/src/test/org/apache/lucene/index/TestWordlistLoader.java
index a59f38b9430..3ebfa9c5b7e 100644
--- a/src/test/org/apache/lucene/index/TestWordlistLoader.java
+++ b/src/test/org/apache/lucene/index/TestWordlistLoader.java
@@ -35,7 +35,16 @@ public class TestWordlistLoader extends LuceneTestCase {
HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
checkSet(wordSet2);
}
-
+
+ public void testComments() throws Exception {
+ String s = "ONE\n two \nthree\n#comment";
+ HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
+ checkSet(wordSet1);
+ assertFalse(wordSet1.contains("#comment"));
+ assertFalse(wordSet1.contains("comment"));
+ }
+
+
private void checkSet(HashSet wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified