LUCENE-1406. Added Arabic stemming and normalization. Also added new method to WordListLoader to allow for comments in word lists.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@706342 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-10-20 17:19:29 +00:00
parent 0b90b5e23d
commit 8dfe073760
14 changed files with 1227 additions and 1 deletions

View File

@ -34,6 +34,8 @@ New features
static methods. (Shalin Shekhar Mangar via Mike McCandless)
3. LUCENE-1406: Added Arabic analyzer. (Robert Muir via Grant Ingersoll)
Optimizations
Documentation

View File

@ -9,3 +9,8 @@ The snowball stemmers in
were developed by Martin Porter and Richard Boulton.
The full snowball package is available from
http://snowball.tartarus.org/
The Arabic stemmer (contrib/analyzers) comes with a BSD-licensed
default stopword list created by Jacques Savoy. The file
resides in contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html.

View File

@ -0,0 +1,124 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
/**
* Analyzer for Arabic.
* <p>
* This analyzer implements light-stemming as specified by:
* <i>
* Improving Stemming for Arabic Information Retrieval:
* Light Stemming and Co-occurrence Analysis
* </i>
* http://ciir.cs.umass.edu/pubfiles/ir-249.pdf
* <p>
* The analysis package contains three primary components:
* <ul>
* <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
* <li>{@link ArabicStemFilter}: Arabic light stemming
* <li>Arabic stop words file: a set of default Arabic stop words.
* </ul>
*
*/
public final class ArabicAnalyzer extends Analyzer {

  /**
   * File containing default Arabic stopwords.
   *
   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
   * The stopword list is BSD-Licensed.
   */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Contains the stopwords used with the StopFilter.
   */
  private Set stoptable = new HashSet();

  /**
   * The comment character in the stopwords file. All lines prefixed with this will be ignored
   */
  public static final String STOPWORDS_COMMENT = "#";

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public ArabicAnalyzer() {
    try {
      InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
      if (stream == null) {
        // fail with a clear message instead of an opaque NullPointerException
        throw new RuntimeException("Unable to load default stopword file: " + DEFAULT_STOPWORD_FILE);
      }
      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
      try {
        stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
      } finally {
        // always release the classpath resource, even if parsing fails
        reader.close();
        stream.close();
      }
    } catch (IOException e) {
      // TODO: throw IOException
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds an analyzer with the given stop words.
   */
  public ArabicAnalyzer( String[] stopwords ) {
    stoptable = StopFilter.makeStopSet( stopwords );
  }

  /**
   * Builds an analyzer with the given stop words. The Hashtable's keys are used
   * as the stopword terms.
   */
  public ArabicAnalyzer( Hashtable stopwords ) {
    stoptable = new HashSet(stopwords.keySet());
  }

  /**
   * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
   */
  public ArabicAnalyzer( File stopwords ) throws IOException {
    stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
  }

  /**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   *
   * @return A TokenStream built from an ArabicLetterTokenizer filtered with
   *            StopFilter, ArabicNormalizationFilter and ArabicStemFilter.
   */
  public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new ArabicLetterTokenizer( reader );
    // NOTE(review): stop filtering runs before normalization, so the stopword
    // list is matched against unnormalized surface forms — confirm the list
    // contains those forms.
    result = new StopFilter( result, stoptable );
    result = new ArabicNormalizationFilter( result );
    result = new ArabicStemFilter( result );
    return result;
  }
}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.LetterTokenizer;
/**
* Tokenizer that extends the standard Letter tokenizer to also accept
* nonspacing marks, which the Letter tokenizer drops. Such handling is
* necessary for scripts with combining diacritics, e.g. Arabic, the Indic
* scripts, Hebrew, Thaana, etc.
*
*
*/
public class ArabicLetterTokenizer extends LetterTokenizer {

  /** Creates a tokenizer that reads from the given input. */
  public ArabicLetterTokenizer(Reader in) {
    super(in);
  }

  /**
   * Accepts every character LetterTokenizer accepts, and additionally any
   * character in the NonspacingMark category, so combining diacritics stay
   * attached to their base letters.
   * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
   */
  protected boolean isTokenChar(char c) {
    if (super.isTokenChar(c)) {
      return true;
    }
    return Character.getType(c) == Character.NON_SPACING_MARK;
  }
}

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
*
*/
public class ArabicNormalizationFilter extends TokenFilter {

  /** Normalizer that rewrites each token's term buffer in place. */
  protected ArabicNormalizer normalizer = null;

  /**
   * Constructs a filter that applies Arabic orthographic normalization to
   * tokens from the given stream. Declared public (was protected) so the
   * filter can be composed into analyzers outside this package.
   */
  public ArabicNormalizationFilter(TokenStream input) {
    super(input);
    normalizer = new ArabicNormalizer();
  }

  /**
   * @return the next normalized token in the stream, or null at EOS
   */
  public Token next(Token reusableToken) throws IOException {
    if ((reusableToken = input.next(reusableToken)) == null) {
      return null;
    } else {
      // normalize in place on the term buffer; shrink the term length only
      // when the normalizer actually removed characters
      int oldlen = reusableToken.termLength();
      int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
      if (oldlen != newlen)
        reusableToken.setTermLength(newlen);
      return reusableToken;
    }
  }
}

View File

@ -0,0 +1,102 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Normalizer for Arabic.
* <p>
* Normalization is done in-place for efficiency, operating on a termbuffer.
* <p>
* Normalization is defined as:
* <ul>
* <li> Normalization of hamza with alef seat to a bare alef.
* <li> Normalization of teh marbuta to heh
* <li> Normalization of dotless yeh (alef maksura) to yeh.
* <li> Removal of Arabic diacritics (the harakat)
* <li> Removal of tatweel (stretching character).
* </ul>
*
*/
public class ArabicNormalizer {
public static final char ALEF = '\u0627';
public static final char ALEF_MADDA = '\u0622';
public static final char ALEF_HAMZA_ABOVE = '\u0623';
public static final char ALEF_HAMZA_BELOW = '\u0625';
public static final char YEH = '\u064A';
public static final char DOTLESS_YEH = '\u0649';
public static final char TEH_MARBUTA = '\u0629';
public static final char HEH = '\u0647';
public static final char TATWEEL = '\u0640';
public static final char FATHATAN = '\u064B';
public static final char DAMMATAN = '\u064C';
public static final char KASRATAN = '\u064D';
public static final char FATHA = '\u064E';
public static final char DAMMA = '\u064F';
public static final char KASRA = '\u0650';
public static final char SHADDA = '\u0651';
public static final char SUKUN = '\u0652';
/**
* Normalize an input buffer of Arabic text
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
public int normalize(char s[], int len) {
for (int i = 0; i < len; i++) {
if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
s[i] = ALEF;
if (s[i] == DOTLESS_YEH)
s[i] = YEH;
if (s[i] == TEH_MARBUTA)
s[i] = HEH;
if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
len = delete(s, i, len);
i--;
}
}
return len;
}
/**
* Delete a character in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len length of input buffer
* @return length of input buffer after deletion
*/
protected int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
}

View File

@ -0,0 +1,61 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.
*
*/
public class ArabicStemFilter extends TokenFilter {

  /** Stemmer that rewrites each token's term buffer in place. */
  protected ArabicStemmer stemmer = null;

  /**
   * Constructs a filter that applies Arabic light stemming to tokens from
   * the given stream. Declared public (was protected) so the filter can be
   * composed into analyzers outside this package.
   */
  public ArabicStemFilter(TokenStream input) {
    super(input);
    stemmer = new ArabicStemmer();
  }

  /**
   * @return Returns the next token in the stream, or null at EOS
   */
  public Token next(Token reusableToken) throws IOException {
    if ((reusableToken = input.next(reusableToken)) == null) {
      return null;
    } else {
      // stem in place on the term buffer; shrink the term length only when
      // the stemmer actually removed characters
      int oldlen = reusableToken.termLength();
      int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen);
      if (oldlen != newlen)
        reusableToken.setTermLength(newlen);
      return reusableToken;
    }
  }
}

View File

@ -0,0 +1,177 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Stemmer for Arabic.
* <p>
* Stemming is done in-place for efficiency, operating on a termbuffer.
* <p>
* Stemming is defined as:
* <ul>
* <li> Removal of attached definite article, conjunction, and prepositions.
* <li> Stemming of common suffixes.
* </ul>
*
*/
public class ArabicStemmer {
public static final char ALEF = '\u0627';
public static final char BEH = '\u0628';
public static final char TEH_MARBUTA = '\u0629';
public static final char TEH = '\u062A';
public static final char FEH = '\u0641';
public static final char KAF = '\u0643';
public static final char LAM = '\u0644';
public static final char NOON = '\u0646';
public static final char HEH = '\u0647';
public static final char WAW = '\u0648';
public static final char YEH = '\u064A';
public static final char prefixes[][] = {
("" + ALEF + LAM).toCharArray(),
("" + WAW + ALEF + LAM).toCharArray(),
("" + BEH + ALEF + LAM).toCharArray(),
("" + KAF + ALEF + LAM).toCharArray(),
("" + FEH + ALEF + LAM).toCharArray(),
("" + WAW).toCharArray(),
};
public static final char suffixes[][] = {
("" + HEH + ALEF).toCharArray(),
("" + ALEF + NOON).toCharArray(),
("" + ALEF + TEH).toCharArray(),
("" + WAW + NOON).toCharArray(),
("" + YEH + NOON).toCharArray(),
("" + YEH + HEH).toCharArray(),
("" + YEH + TEH_MARBUTA).toCharArray(),
("" + HEH).toCharArray(),
("" + TEH_MARBUTA).toCharArray(),
("" + YEH).toCharArray(),
};
/**
* Stem an input buffer of Arabic text.
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
public int stem(char s[], int len) {
len = stemPrefix(s, len);
len = stemSuffix(s, len);
return len;
}
/**
* Stem a prefix off an Arabic word.
* @param s input buffer
* @param len length of input buffer
* @return new length of input buffer after stemming.
*/
public int stemPrefix(char s[], int len) {
for (int i = 0; i < prefixes.length; i++)
if (startsWith(s, len, prefixes[i]))
return deleteN(s, 0, len, prefixes[i].length);
return len;
}
/**
* Stem suffix(es) off an Arabic word.
* @param s input buffer
* @param len length of input buffer
* @return new length of input buffer after stemming
*/
public int stemSuffix(char s[], int len) {
for (int i = 0; i < suffixes.length; i++)
if (endsWith(s, len, suffixes[i]))
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
return len;
}
/**
* Returns true if the prefix matches and can be stemmed
* @param s input buffer
* @param len length of input buffer
* @param prefix prefix to check
* @return true if the prefix matches and can be stemmed
*/
boolean startsWith(char s[], int len, char prefix[]) {
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
return false;
} else if (len < prefix.length + 2) { // other prefixes require only 2.
return false;
} else {
for (int i = 0; i < prefix.length; i++)
if (s[i] != prefix[i])
return false;
return true;
}
}
/**
* Returns true if the suffix matches and can be stemmed
* @param s input buffer
* @param len length of input buffer
* @param suffix suffix to check
* @return true if the suffix matches and can be stemmed
*/
boolean endsWith(char s[], int len, char suffix[]) {
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
return false;
} else {
for (int i = 0; i < suffix.length; i++)
if (s[len - suffix.length + i] != suffix[i])
return false;
return true;
}
}
/**
* Delete n characters in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len Length of input buffer
* @param nChars number of characters to delete
* @return length of input buffer after deletion
*/
protected int deleteN(char s[], int pos, int len, int nChars) {
for (int i = 0; i < nChars; i++)
len = delete(s, pos, len);
return len;
}
/**
* Delete a character in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len length of input buffer
* @return length of input buffer after deletion
*/
protected int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
}

View File

@ -0,0 +1,5 @@
<html><head></head>
<body>
Analyzer for Arabic.
</body>
</html>

View File

@ -0,0 +1,350 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
ب
ا
أ
،
عشر
عبد
عدد
عدة
عشرة
عدم
عام
عاما
عرفات
عن
عند
عمان
عندما
على
علي
عليه
عليها
عملية
زيارة
سبتمبر
ساراييفو
سنة
سوريا
سنوات
تشرين
تم
تموز
ضد
بعد
بعض
اعادة
اعلن
اعلنت
حزب
حزيران
بسبب
اسرائيل
حسين
حتى
اتفاق
صرب
اذا
احد
اثر
غزة
برس
باسم
اجتماع
غدا
شخصا
صباح
اطار
اربعة
بغداد
اخرى
باريس
رابين
شرق
بان
ابو
اجل
غير
حركة
رئيس
جديدة
اطلاق
بشكل
بطولة
صحيفة
حاليا
بن
به
ثم
اف
ان
او
اي
بها
جهة
صفر
حيث
اكد
الا
اما
العسكرية
العراق
العاصمة
العربية
العراقي
العراقية
العام
العالم
العلاقات
العمل
امس
السعودية
الساعة
السبت
السابق
روسيا
السلطة
السلطات
السلام
التعاون
التحرير
التى
التي
اكتوبر
دورة
اكثر
ايار
ايضا
الجزائر
حماس
الاسرائيلي
الاسرائيلية
الاسبوع
الاسلحة
الاسلامية
ذكرت
الاتحاد
الاتفاق
ثلاثة
الحرب
الاحد
الذاتي
الشرطة
الاربعاء
الغربية
الخارجية
الاردن
الشرق
ايران
الحدود
الرئيس
الاخيرة
الثاني
الثانية
الاثنين
شمال
بيان
دمشق
الذى
الذي
الان
امام
ايام
خلال
الشيخ
الجيش
الدور
الضفة
الجمعة
بيريز
الاوسط
الروسي
البوسنة
الروسية
بيروت
الانتخابات
البلاد
الدفاع
الثلثاء
الانباء
الثلاثاء
الاوروبي
حوالى
الذين
الدول
الحكم
الامم
الامن
الاول
الدولة
الخليج
الخميس
الاميركي
الاميركية
الدولي
الاولى
الدولية
الحكومة
بين
ذلك
دول
دون
حول
حين
الف
الى
انه
اول
ضمن
جنوب
دولة
انها
جميع
الوزراء
المتحدث
المتحدة
دولار
النار
الوضع
القدس
المحتلة
المصدر
المباراة
المصري
الماضي
المصرية
المرحلة
القدم
اللجنة
المجلس
الفرنسي
الفرنسية
القاهرة
المدينة
المانيا
الوطنية
المجموعة
الله
الفلسطيني
الفلسطينية
الفلسطينيين
الوقت
المقرر
القوات
النهائي
المقبل
المنطقة
الولايات
المفاوضات
الملك
اليمن
اليوم
ايلول
الكويت
ـ
ف
و
و6
قد
لا
ما
مع
وزارة
وزير
مساء
قتل
كرة
مصر
هذا
فاز
كأس
ياسر
قرار
مصدر
واحد
قطاع
مصادر
مباراة
مبارك
واضاف
واضافت
فرانس
واشنطن
فان
قبل
قال
كان
لدى
نحو
هذه
وان
محمد
واكد
يذكر
مجلس
فرنسا
كريستوفر
كانت
واوضح
لبنان
مايو
مدينة
مجموعة
كانون
فى
في
كل
لم
لن
له
من
هو
هي
قوة
كما
لها
منذ
وقد
ولا
نفسه
موسكو
مقتل
لقاء
لكرة
نقطة
قوات
مقابل
لندن
هناك
وقال
وكان
منطقة
منظمة
نهاية
وكالة
وقالت
وكانت
للامم
فيه
كلم
لكن
وفي
وقف
ولم
ومن
وهو
وهي
يوم
فيها
منها
مليار
لوكالة
يكون
يمكن
كلينتون
مليون
يوليو
يونيو
نيويورك

View File

@ -0,0 +1,106 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* Test the Arabic Normalization Filter
*
*/
public class TestArabicNormalizationFilter extends TestCase {

  // Each test runs a single Arabic word through ArabicNormalizationFilter and
  // checks the expected orthographically-normalized form.

  // alef madda is normalized to a bare alef
  public void testAlifMadda() throws IOException {
    check("آجن", "اجن");
  }

  // alef with hamza above is normalized to a bare alef
  public void testAlifHamzaAbove() throws IOException {
    check("أحمد", "احمد");
  }

  // alef with hamza below is normalized to a bare alef
  public void testAlifHamzaBelow() throws IOException {
    check("إعاذ", "اعاذ");
  }

  // dotless yeh (alef maksura) is normalized to yeh
  public void testAlifMaksura() throws IOException {
    check("بنى", "بني");
  }

  // teh marbuta is normalized to heh
  public void testTehMarbuta() throws IOException {
    check("فاطمة", "فاطمه");
  }

  // tatweel (stretching character) is removed
  public void testTatweel() throws IOException {
    check("روبرـــــت", "روبرت");
  }

  // the harakat (diacritics) below are removed
  public void testFatha() throws IOException {
    check("مَبنا", "مبنا");
  }

  public void testKasra() throws IOException {
    check("علِي", "علي");
  }

  public void testDamma() throws IOException {
    check("بُوات", "بوات");
  }

  public void testFathatan() throws IOException {
    check("ولداً", "ولدا");
  }

  public void testKasratan() throws IOException {
    check("ولدٍ", "ولد");
  }

  public void testDammatan() throws IOException {
    check("ولدٌ", "ولد");
  }

  public void testSukun() throws IOException {
    check("نلْسون", "نلسون");
  }

  public void testShaddah() throws IOException {
    check("هتميّ", "هتمي");
  }

  /**
   * Tokenize the input with ArabicLetterTokenizer, pass it through
   * ArabicNormalizationFilter, and assert that the first emitted token
   * matches the expected normalized text.
   */
  private void check(final String input, final String expected) throws IOException {
    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
    ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
    final Token reusableToken = new Token();
    Token nextToken = filter.next(reusableToken);
    if (nextToken == null)
      fail(); // the input must produce at least one token
    assertEquals(expected, nextToken.term());
    filter.close();
  }
}

View File

@ -0,0 +1,129 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* Test the Arabic Stem Filter
*
*/
public class TestArabicStemFilter extends TestCase {

  // Each test runs a single word through ArabicStemFilter and checks the
  // expected light-stemmed form.

  // the al- (definite article) prefix is stripped
  public void testAlPrefix() throws IOException {
    check("الحسن", "حسن");
  }

  // the wal- prefix is stripped
  public void testWalPrefix() throws IOException {
    check("والحسن", "حسن");
  }

  // the bal- prefix is stripped
  public void testBalPrefix() throws IOException {
    check("بالحسن", "حسن");
  }

  // the kal- prefix is stripped
  public void testKalPrefix() throws IOException {
    check("كالحسن", "حسن");
  }

  // the fal- prefix is stripped
  public void testFalPrefix() throws IOException {
    check("فالحسن", "حسن");
  }

  // the single-character wa- (conjunction) prefix is stripped
  public void testWaPrefix() throws IOException {
    check("وحسن", "حسن");
  }

  // the suffixes below are stripped
  public void testAhSuffix() throws IOException {
    check("زوجها", "زوج");
  }

  public void testAnSuffix() throws IOException {
    check("ساهدان", "ساهد");
  }

  public void testAtSuffix() throws IOException {
    check("ساهدات", "ساهد");
  }

  public void testWnSuffix() throws IOException {
    check("ساهدون", "ساهد");
  }

  public void testYnSuffix() throws IOException {
    check("ساهدين", "ساهد");
  }

  public void testYhSuffix() throws IOException {
    check("ساهديه", "ساهد");
  }

  public void testYpSuffix() throws IOException {
    check("ساهدية", "ساهد");
  }

  public void testHSuffix() throws IOException {
    check("ساهده", "ساهد");
  }

  public void testPSuffix() throws IOException {
    check("ساهدة", "ساهد");
  }

  public void testYSuffix() throws IOException {
    check("ساهدي", "ساهد");
  }

  // a prefix and a suffix can both be stripped from the same word
  public void testComboPrefSuf() throws IOException {
    check("وساهدون", "ساهد");
  }

  // multiple matching suffixes are all stripped
  public void testComboSuf() throws IOException {
    check("ساهدهات", "ساهد");
  }

  // words too short to leave a valid stem are left untouched
  public void testShouldntStem() throws IOException {
    check("الو", "الو");
  }

  // non-Arabic text passes through unchanged
  public void testNonArabic() throws IOException {
    check("English", "English");
  }

  /**
   * Tokenize the input with ArabicLetterTokenizer, pass it through
   * ArabicStemFilter, and assert that the first emitted token matches the
   * expected stemmed text.
   */
  private void check(final String input, final String expected) throws IOException {
    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
    ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
    final Token reusableToken = new Token();
    Token nextToken = filter.next(reusableToken);
    if (nextToken == null)
      fail(); // the input must produce at least one token
    assertEquals(expected, nextToken.term());
    filter.close();
  }
}

View File

@ -56,6 +56,31 @@ public class WordlistLoader {
return result;
}
/**
 * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
 * leading and trailing whitespace). Every line of the file should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param wordfile File containing the wordlist
 * @param comment The comment string to ignore
 * @return A HashSet with the file's words
 */
public static HashSet getWordSet(File wordfile, String comment) throws IOException {
  // NOTE(review): FileReader reads in the platform default charset; callers
  // needing a specific encoding should use the Reader-based overload.
  HashSet result;
  FileReader reader = null;
  try {
    reader = new FileReader(wordfile);
    result = getWordSet(reader, comment);
  }
  finally {
    // close the file even if reading fails
    if (reader != null)
      reader.close();
  }
  return result;
}
/**
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
@ -86,6 +111,41 @@ public class WordlistLoader {
return result;
}
/**
 * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
 * leading and trailing whitespace). Every line of the Reader should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 * <p>
 * Note that the comment check is applied to the raw line, before whitespace
 * is trimmed, so a line with whitespace before the comment string is NOT
 * treated as a comment. The supplied Reader is closed before returning.
 *
 * @param reader Reader containing the wordlist
 * @param comment The string representing a comment.
 * @return A HashSet with the reader's words
 */
public static HashSet getWordSet(Reader reader, String comment) throws IOException {
  HashSet result = new HashSet();
  BufferedReader br = null;
  try {
    // avoid double-buffering when the caller already supplied a BufferedReader
    if (reader instanceof BufferedReader) {
      br = (BufferedReader) reader;
    } else {
      br = new BufferedReader(reader);
    }
    String word = null;
    while ((word = br.readLine()) != null) {
      if (!word.startsWith(comment)) {
        result.add(word.trim());
      }
    }
  }
  finally {
    if (br != null)
      br.close();
  }
  return result;
}
/**
* Reads a stem dictionary. Each line contains:
* <pre>word<b>\t</b>stem</pre>

View File

@ -36,6 +36,15 @@ public class TestWordlistLoader extends LuceneTestCase {
checkSet(wordSet2);
}
public void testComments() throws Exception {
  // Word list with the three standard entries plus one comment line.
  String wordList = "ONE\n two \nthree\n#comment";
  HashSet words = WordlistLoader.getWordSet(new StringReader(wordList), "#");
  checkSet(words);
  // The comment line must not appear, with or without its marker.
  assertFalse(words.contains("#comment"));
  assertFalse(words.contains("comment"));
}
private void checkSet(HashSet wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified