diff --git a/CHANGES.txt b/CHANGES.txt index 9b51789dcad..9494cfa6fa8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -34,6 +34,8 @@ New features static methods. (Shalin Shekhar Mangar via Mike McCandless) + 3. LUCENE-1406: Added Arabic analyzer. (Robert Muir via Grant Ingersoll) + Optimizations Documentation diff --git a/NOTICE.txt b/NOTICE.txt index 92fd3447bc7..c826ff590be 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -9,3 +9,8 @@ The snowball stemmers in were developed by Martin Porter and Richard Boulton. The full snowball package is available from http://snowball.tartarus.org/ + +The Arabic stemmer (contrib/analyzer) comes with a default +stopword list that is BSD-licensed created by Jacques Savoy. The file +resides in contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. \ No newline at end of file diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java new file mode 100644 index 00000000000..e0606f8ed15 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java @@ -0,0 +1,124 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WordlistLoader; + +/** + * Analyzer for Arabic. + *

+ * This analyzer implements light-stemming as specified by: + * + * Improving Stemming for Arabic Information Retrieval: + * Light Stemming and Co-occurrence Analysis + * + * http://ciir.cs.umass.edu/pubfiles/ir-249.pdf + *

+ * The analysis package contains three primary components: + *

+ * + */ +public final class ArabicAnalyzer extends Analyzer { + + /** + * File containing default Arabic stopwords. + * + * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html + * The stopword list is BSD-Licensed. + */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Set stoptable = new HashSet(); + /** + * The comment character in the stopwords file. All lines prefixed with this will be ignored + */ + public static final String STOPWORDS_COMMENT = "#"; + + /** + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. + */ + public ArabicAnalyzer() { + try { + InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE); + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT); + reader.close(); + stream.close(); + } catch (IOException e) { + // TODO: throw IOException + throw new RuntimeException(e); + } + } + + /** + * Builds an analyzer with the given stop words. + */ + public ArabicAnalyzer( String[] stopwords ) { + stoptable = StopFilter.makeStopSet( stopwords ); + } + + /** + * Builds an analyzer with the given stop words. + */ + public ArabicAnalyzer( Hashtable stopwords ) { + stoptable = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT} + */ + public ArabicAnalyzer( File stopwords ) throws IOException { + stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT); + } + + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, StopFilter, ArabicNormalizationFilter and ArabicStemFilter. 
+ */ + public final TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new ArabicLetterTokenizer( reader ); + result = new StopFilter( result, stoptable ); + result = new ArabicNormalizationFilter( result ); + result = new ArabicStemFilter( result ); + + return result; + } +} + diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java new file mode 100644 index 00000000000..d414ef1b5cb --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java @@ -0,0 +1,43 @@ +package org.apache.lucene.analysis.ar; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.LetterTokenizer; + +/** + * The problem with the standard Letter tokenizer is that it fails on diacritics. + * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc. 
+ * + * + */ +public class ArabicLetterTokenizer extends LetterTokenizer { + + public ArabicLetterTokenizer(Reader in) { + super(in); + } + + /** + * Allows for Letter category or NonspacingMark category + * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char) + */ + protected boolean isTokenChar(char c) { + return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK; + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java new file mode 100644 index 00000000000..90b504fc446 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java @@ -0,0 +1,53 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography. 
+ * + */ + +public class ArabicNormalizationFilter extends TokenFilter { + + protected ArabicNormalizer normalizer = null; + + protected ArabicNormalizationFilter(TokenStream input) { + super(input); + normalizer = new ArabicNormalizer(); + } + + + + public Token next(Token reusableToken) throws IOException { + if ((reusableToken = input.next(reusableToken)) == null) { + return null; + } else { + int oldlen = reusableToken.termLength(); + int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen); + if (oldlen != newlen) + reusableToken.setTermLength(newlen); + return reusableToken; + } + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java new file mode 100644 index 00000000000..6693c03b90b --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java @@ -0,0 +1,102 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Normalizer for Arabic. + *

+ * Normalization is done in-place for efficiency, operating on a termbuffer. + *

+ * Normalization is defined as: + *

/**
 * Normalizer for Arabic.
 *
 * Normalization is done in-place for efficiency, operating on a termbuffer.
 *
 * Normalization is defined as:
 * <ul>
 *  <li> ALEF_MADDA, ALEF_HAMZA_ABOVE and ALEF_HAMZA_BELOW are replaced with ALEF
 *  <li> DOTLESS_YEH is replaced with YEH
 *  <li> TEH_MARBUTA is replaced with HEH
 *  <li> TATWEEL is removed
 *  <li> FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SHADDA and SUKUN are removed
 * </ul>
 */
public class ArabicNormalizer {
  public static final char ALEF = '\u0627';
  public static final char ALEF_MADDA = '\u0622';
  public static final char ALEF_HAMZA_ABOVE = '\u0623';
  public static final char ALEF_HAMZA_BELOW = '\u0625';

  public static final char YEH = '\u064A';
  public static final char DOTLESS_YEH = '\u0649';

  public static final char TEH_MARBUTA = '\u0629';
  public static final char HEH = '\u0647';

  public static final char TATWEEL = '\u0640';

  public static final char FATHATAN = '\u064B';
  public static final char DAMMATAN = '\u064C';
  public static final char KASRATAN = '\u064D';
  public static final char FATHA = '\u064E';
  public static final char DAMMA = '\u064F';
  public static final char KASRA = '\u0650';
  public static final char SHADDA = '\u0651';
  public static final char SUKUN = '\u0652';

  /**
   * Normalize an input buffer of Arabic text.
   *
   * Runs as a single pass: every kept (possibly replaced) character is written at the
   * next free position, so removing a character never re-shifts the rest of the buffer.
   * Only the first <code>returned length</code> characters are meaningful afterwards.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {
    if (len <= 0) {
      return len; // nothing to scan; hand the length back untouched
    }

    int out = 0; // next write position; never runs ahead of the read position
    for (int in = 0; in < len; in++) {
      final char c = s[in];
      switch (c) {
        case ALEF_MADDA:
        case ALEF_HAMZA_ABOVE:
        case ALEF_HAMZA_BELOW:
          s[out++] = ALEF;
          break;
        case DOTLESS_YEH:
          s[out++] = YEH;
          break;
        case TEH_MARBUTA:
          s[out++] = HEH;
          break;
        case TATWEEL:
        case KASRATAN:
        case DAMMATAN:
        case FATHATAN:
        case FATHA:
        case DAMMA:
        case KASRA:
        case SHADDA:
        case SUKUN:
          // removed: the character is simply not copied forward
          break;
        default:
          s[out++] = c;
          break;
      }
    }

    return out;
  }

  /**
   * Delete a character in-place.
   *
   * No longer used by {@link #normalize(char[], int)}; retained for subclasses and
   * other callers.
   *
   * @param s Input Buffer
   * @param pos Position of character to delete
   * @param len length of input buffer
   * @return length of input buffer after deletion
   */
  protected int delete(char s[], int pos, int len) {
    if (pos < len)
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);

    return len - 1;
  }

}

// ---- patch text that shares this source line with the class above; kept verbatim ----
// diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java new file mode 100644 index 00000000000..dd615d7b691 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.. + * + */ + +public class ArabicStemFilter extends TokenFilter { + + protected ArabicStemmer stemmer = null; + + protected ArabicStemFilter(TokenStream input) { + super(input); + stemmer = new ArabicStemmer(); + } + + + + /** + * @return Returns the next token in the stream, or null at EOS + */ + public Token next(Token reusableToken) throws IOException { + /** + * The actual token in the input stream. 
+ */ + + + if ((reusableToken = input.next(reusableToken)) == null) { + return null; + } else { + int oldlen = reusableToken.termLength(); + int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen); + if (oldlen != newlen) + reusableToken.setTermLength(newlen); + return reusableToken; + } + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java new file mode 100644 index 00000000000..3e84a75e2b1 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java @@ -0,0 +1,177 @@ +package org.apache.lucene.analysis.ar; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Stemmer for Arabic. + *

+ * Stemming is done in-place for efficiency, operating on a termbuffer. + *

+ * Stemming is defined as: + *

/**
 * Stemmer for Arabic.
 *
 * Stemming is done in-place for efficiency, operating on a termbuffer.
 *
 * Stemming is defined as:
 * <ul>
 *  <li> Removal of at most one prefix: the first entry of {@link #prefixes} that matches.
 *  <li> Removal of every entry of {@link #suffixes} that matches, tried once each in table order.
 * </ul>
 */
public class ArabicStemmer {
  public static final char ALEF = '\u0627';
  public static final char BEH = '\u0628';
  public static final char TEH_MARBUTA = '\u0629';
  public static final char TEH = '\u062A';
  public static final char FEH = '\u0641';
  public static final char KAF = '\u0643';
  public static final char LAM = '\u0644';
  public static final char NOON = '\u0646';
  public static final char HEH = '\u0647';
  public static final char WAW = '\u0648';
  public static final char YEH = '\u064A';

  /** Prefixes tried by {@link #stemPrefix(char[], int)}; order matters, first match wins. */
  public static final char prefixes[][] = {
      {ALEF, LAM},
      {WAW, ALEF, LAM},
      {BEH, ALEF, LAM},
      {KAF, ALEF, LAM},
      {FEH, ALEF, LAM},
      {WAW},
  };

  /** Suffixes tried by {@link #stemSuffix(char[], int)}; order matters, several may be removed. */
  public static final char suffixes[][] = {
      {HEH, ALEF},
      {ALEF, NOON},
      {ALEF, TEH},
      {WAW, NOON},
      {YEH, NOON},
      {YEH, HEH},
      {YEH, TEH_MARBUTA},
      {HEH},
      {TEH_MARBUTA},
      {YEH},
  };

  /**
   * Stem an input buffer of Arabic text.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after stemming
   */
  public int stem(char s[], int len) {
    return stemSuffix(s, stemPrefix(s, len));
  }

  /**
   * Stem a prefix off an Arabic word.
   * @param s input buffer
   * @param len length of input buffer
   * @return new length of input buffer after stemming.
   */
  public int stemPrefix(char s[], int len) {
    for (int i = 0; i < prefixes.length; i++) {
      if (startsWith(s, len, prefixes[i])) {
        // Only one prefix is ever removed.
        return deleteN(s, 0, len, prefixes[i].length);
      }
    }
    return len;
  }

  /**
   * Stem suffix(es) off an Arabic word.
   * @param s input buffer
   * @param len length of input buffer
   * @return new length of input buffer after stemming
   */
  public int stemSuffix(char s[], int len) {
    for (int i = 0; i < suffixes.length; i++) {
      if (endsWith(s, len, suffixes[i])) {
        // Keep going: a later table entry may match the shortened word as well.
        len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
      }
    }
    return len;
  }

  /**
   * Returns true if the prefix matches and can be stemmed
   * @param s input buffer
   * @param len length of input buffer
   * @param prefix prefix to check
   * @return true if the prefix matches and can be stemmed
   */
  boolean startsWith(char s[], int len, char prefix[]) {
    if (prefix.length == 1 && len < 4) {
      // a one-character prefix (wa-) must leave at least 3 characters
      return false;
    }
    if (len < prefix.length + 2) {
      // every other prefix must leave at least 2 characters
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (s[i] != prefix[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if the suffix matches and can be stemmed
   * @param s input buffer
   * @param len length of input buffer
   * @param suffix suffix to check
   * @return true if the suffix matches and can be stemmed
   */
  boolean endsWith(char s[], int len, char suffix[]) {
    if (len < suffix.length + 2) {
      // all suffixes require at least 2 characters after stemming
      return false;
    }
    final int start = len - suffix.length;
    for (int i = 0; i < suffix.length; i++) {
      if (s[start + i] != suffix[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Delete n characters in-place.
   *
   * The characters behind the deleted range are moved forward with a single copy.
   *
   * @param s Input Buffer
   * @param pos Position of character to delete
   * @param len Length of input buffer
   * @param nChars number of characters to delete
   * @return length of input buffer after deletion
   */
  protected int deleteN(char s[], int pos, int len, int nChars) {
    if (nChars <= 0) {
      return len; // nothing to delete
    }
    final int tail = len - pos - nChars; // characters that follow the deleted range
    if (tail > 0) {
      System.arraycopy(s, pos + nChars, s, pos, tail);
    }
    return len - nChars;
  }

  /**
   * Delete a character in-place
   *
   * @param s Input Buffer
   * @param pos Position of character to delete
   * @param len length of input buffer
   * @return length of input buffer after deletion
   */
  protected int delete(char s[], int pos, int len) {
    if (pos < len)
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);

    return len - 1;
  }

}

// ---- patch text that shares this source line with the class above; kept verbatim ----
// diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html new file mode 100644 index 00000000000..0a8f4964283 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html @@ -0,0 +1,5 @@ + + +Analyzer for Arabic. + + diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt new file mode 100644 index 00000000000..4bb557b3138 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt @@ -0,0 +1,350 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html +ب +ا +أ +، +عشر +عبد +عدد +عدة +عشرة +عدم +عام +عاما +عرفات +عن +عند +عمان +عندما +على +علي +عليه +عليها +عملية +زيارة +سبتمبر +ساراييفو +سنة +سوريا +سنوات +تشرين +تم +تموز +ضد +بعد +بعض +اعادة +اعلن +اعلنت +حزب +حزيران +بسبب +اسرائيل +حسين +حتى +اتفاق +صرب +اذا +احد +اثر +غزة +برس +باسم +اجتماع +غدا +شخصا +صباح +اطار +اربعة +بغداد +اخرى +باريس +رابين +شرق +بان +ابو +اجل +غير +حركة +رئيس +جديدة +اطلاق +بشكل +بطولة +صحيفة +حاليا +بن +به +ثم +اف +ان +او +اي +بها +جهة +صفر +حيث +اكد +الا +اما +العسكرية +العراق +العاصمة +العربية +العراقي +العراقية +العام +العالم +العلاقات +العمل +امس +السعودية +الساعة +السبت +السابق +روسيا +السلطة +السلطات +السلام +التعاون +التحرير +التى +التي +اكتوبر +دورة +اكثر +ايار +ايضا +الجزائر +حماس +الاسرائيلي +الاسرائيلية +الاسبوع +الاسلحة +الاسلامية +ذكرت +الاتحاد +الاتفاق +ثلاثة +الحرب +الاحد +الذاتي +الشرطة +الاربعاء +الغربية +الخارجية +الاردن +الشرق +ايران +الحدود +الرئيس +الاخيرة +الثاني +الثانية +الاثنين +شمال +بيان +دمشق +الذى +الذي +الان +امام +ايام +خلال +الشيخ +الجيش +الدور +الضفة +الجمعة +بيريز +الاوسط +الروسي +البوسنة +الروسية +بيروت +الانتخابات +البلاد +الدفاع +الثلثاء +الانباء +الثلاثاء +الاوروبي +حوالى +الذين +الدول +الحكم +الامم +الامن +الاول +الدولة +الخليج +الخميس +الاميركي +الاميركية +الدولي +الاولى +الدولية +الحكومة +بين +ذلك +دول +دون +حول +حين +الف +الى +انه +اول +ضمن +جنوب +دولة +انها +جميع +الوزراء +المتحدث +المتحدة +دولار +النار +الوضع +القدس +المحتلة +المصدر +المباراة +المصري +الماضي +المصرية +المرحلة +القدم +اللجنة +المجلس +الفرنسي +الفرنسية +القاهرة +المدينة +المانيا +الوطنية +المجموعة +الله +الفلسطيني +الفلسطينية +الفلسطينيين +الوقت +المقرر +القوات +النهائي +المقبل +المنطقة +الولايات +المفاوضات +الملك +اليمن +اليوم +ايلول +الكويت +ـ +ف +و +و6 +قد +لا +ما +مع +وزارة +وزير +مساء +قتل +كرة +مصر +هذا +فاز +كأس +ياسر +قرار +مصدر +واحد +قطاع +مصادر +مباراة +مبارك +واضاف +واضافت +فرانس +واشنطن +فان +قبل +قال +كان +لدى +نحو +هذه +وان +محمد +واكد +يذكر +مجلس 
+فرنسا +كريستوفر +كانت +واوضح +لبنان +مايو +مدينة +مجموعة +كانون +فى +في +كل +لم +لن +له +من +هو +هي +قوة +كما +لها +منذ +وقد +ولا +نفسه +موسكو +مقتل +لقاء +لكرة +نقطة +قوات +مقابل +لندن +هناك +وقال +وكان +منطقة +منظمة +نهاية +وكالة +وقالت +وكانت +للامم +فيه +كلم +لكن +وفي +وقف +ولم +ومن +وهو +وهي +يوم +فيها +منها +مليار +لوكالة +يكون +يمكن +كلينتون +مليون +يوليو +يونيو +نيويورك diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java new file mode 100644 index 00000000000..bbebe979a0c --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java @@ -0,0 +1,106 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Test the Arabic Normalization Filter + * + */ +public class TestArabicNormalizationFilter extends TestCase { + + public void testAlifMadda() throws IOException { + check("آجن", "اجن"); + } + + public void testAlifHamzaAbove() throws IOException { + check("أحمد", "احمد"); + } + + public void testAlifHamzaBelow() throws IOException { + check("إعاذ", "اعاذ"); + } + + public void testAlifMaksura() throws IOException { + check("بنى", "بني"); + } + + public void testTehMarbuta() throws IOException { + check("فاطمة", "فاطمه"); + } + + public void testTatweel() throws IOException { + check("روبرـــــت", "روبرت"); + } + + public void testFatha() throws IOException { + check("مَبنا", "مبنا"); + } + + public void testKasra() throws IOException { + check("علِي", "علي"); + } + + public void testDamma() throws IOException { + check("بُوات", "بوات"); + } + + public void testFathatan() throws IOException { + check("ولداً", "ولدا"); + } + + public void testKasratan() throws IOException { + check("ولدٍ", "ولد"); + } + + public void testDammatan() throws IOException { + check("ولدٌ", "ولد"); + } + + public void testSukun() throws IOException { + check("نلْسون", "نلسون"); + } + + public void testShaddah() throws IOException { + check("هتميّ", "هتمي"); + } + + private void check(final String input, final String expected) throws IOException { + ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); + ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream); + final Token reusableToken = new Token(); + Token nextToken = 
filter.next(reusableToken); + if (nextToken == null) + fail(); + assertEquals(expected, nextToken.term()); + filter.close(); + } + +} diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java new file mode 100644 index 00000000000..01dc5449ade --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java @@ -0,0 +1,129 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Test the Arabic Normalization Filter + * + */ +public class TestArabicStemFilter extends TestCase { + + public void testAlPrefix() throws IOException { + check("الحسن", "حسن"); + } + + public void testWalPrefix() throws IOException { + check("والحسن", "حسن"); + } + + public void testBalPrefix() throws IOException { + check("بالحسن", "حسن"); + } + + public void testKalPrefix() throws IOException { + check("كالحسن", "حسن"); + } + + public void testFalPrefix() throws IOException { + check("فالحسن", "حسن"); + } + + public void testWaPrefix() throws IOException { + check("وحسن", "حسن"); + } + + public void testAhSuffix() throws IOException { + check("زوجها", "زوج"); + } + + public void testAnSuffix() throws IOException { + check("ساهدان", "ساهد"); + } + + public void testAtSuffix() throws IOException { + check("ساهدات", "ساهد"); + } + + public void testWnSuffix() throws IOException { + check("ساهدون", "ساهد"); + } + + public void testYnSuffix() throws IOException { + check("ساهدين", "ساهد"); + } + + public void testYhSuffix() throws IOException { + check("ساهديه", "ساهد"); + } + + public void testYpSuffix() throws IOException { + check("ساهدية", "ساهد"); + } + + public void testHSuffix() throws IOException { + check("ساهده", "ساهد"); + } + + public void testPSuffix() throws IOException { + check("ساهدة", "ساهد"); + } + + public void testYSuffix() throws IOException { + check("ساهدي", "ساهد"); + } + + public void testComboPrefSuf() throws IOException { + check("وساهدون", "ساهد"); + } + + public void testComboSuf() throws IOException { + check("ساهدهات", "ساهد"); + } + + public void testShouldntStem() throws IOException { + check("الو", 
"الو"); + } + + public void testNonArabic() throws IOException { + check("English", "English"); + } + + private void check(final String input, final String expected) throws IOException { + ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); + ArabicStemFilter filter = new ArabicStemFilter(tokenStream); + final Token reusableToken = new Token(); + Token nextToken = filter.next(reusableToken); + if (nextToken == null) + fail(); + assertEquals(expected, nextToken.term()); + filter.close(); + } + +} diff --git a/src/java/org/apache/lucene/analysis/WordlistLoader.java b/src/java/org/apache/lucene/analysis/WordlistLoader.java index 6a5f9d6941a..ca7705081c2 100644 --- a/src/java/org/apache/lucene/analysis/WordlistLoader.java +++ b/src/java/org/apache/lucene/analysis/WordlistLoader.java @@ -56,6 +56,31 @@ public class WordlistLoader { return result; } + /** + * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting + * leading and trailing whitespace). Every line of the file should contain only + * one word. The words need to be in lowercase if you make use of an + * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * + * @param wordfile File containing the wordlist + * @param comment The comment string to ignore + * @return A HashSet with the file's words + */ + public static HashSet getWordSet(File wordfile, String comment) throws IOException { + HashSet result = new HashSet(); + FileReader reader = null; + try { + reader = new FileReader(wordfile); + result = getWordSet(reader, comment); + } + finally { + if (reader != null) + reader.close(); + } + return result; + } + + /** * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting * leading and trailing whitespace). 
Every line of the Reader should contain only @@ -86,6 +111,41 @@ public class WordlistLoader { return result; } + /** + * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only + * one word. The words need to be in lowercase if you make use of an + * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * + * @param reader Reader containing the wordlist + * @param comment The string representing a comment. + * @return A HashSet with the reader's words + */ + public static HashSet getWordSet(Reader reader, String comment) throws IOException { + HashSet result = new HashSet(); + BufferedReader br = null; + try { + if (reader instanceof BufferedReader) { + br = (BufferedReader) reader; + } else { + br = new BufferedReader(reader); + } + String word = null; + while ((word = br.readLine()) != null) { + if (word.startsWith(comment) == false){ + result.add(word.trim()); + } + } + } + finally { + if (br != null) + br.close(); + } + return result; + } + + + /** * Reads a stem dictionary. Each line contains: *
word\tstem
diff --git a/src/test/org/apache/lucene/index/TestWordlistLoader.java b/src/test/org/apache/lucene/index/TestWordlistLoader.java index a59f38b9430..3ebfa9c5b7e 100644 --- a/src/test/org/apache/lucene/index/TestWordlistLoader.java +++ b/src/test/org/apache/lucene/index/TestWordlistLoader.java @@ -35,7 +35,16 @@ public class TestWordlistLoader extends LuceneTestCase { HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s))); checkSet(wordSet2); } - + + public void testComments() throws Exception { + String s = "ONE\n two \nthree\n#comment"; + HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#"); + checkSet(wordSet1); + assertFalse(wordSet1.contains("#comment")); + assertFalse(wordSet1.contains("comment")); + } + + private void checkSet(HashSet wordset) { assertEquals(3, wordset.size()); assertTrue(wordset.contains("ONE")); // case is not modified