LUCENE-1406. Added Arabic stemming and normalization. Also added new method to WordListLoader to allow for comments in word lists.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@706342 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-10-20 17:19:29 +00:00
parent 0b90b5e23d
commit 8dfe073760
14 changed files with 1227 additions and 1 deletions

View File

@ -34,6 +34,8 @@ New features
static methods. (Shalin Shekhar Mangar via Mike McCandless)
3. LUCENE-1406: Added Arabic analyzer. (Robert Muir via Grant Ingersoll)
Optimizations
Documentation

View File

@ -9,3 +9,8 @@ The snowball stemmers in
were developed by Martin Porter and Richard Boulton.
The full snowball package is available from
http://snowball.tartarus.org/
The Arabic stemmer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed and was created by Jacques Savoy. The file
resides in contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html.

View File

@ -0,0 +1,124 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
/**
 * Analyzer for Arabic.
 * <p>
 * This analyzer implements light-stemming as specified by:
 * <i>
 * Improving Stemming for Arabic Information Retrieval:
 *      Light Stemming and Co-occurrence Analysis
 * </i>
 * http://ciir.cs.umass.edu/pubfiles/ir-249.pdf
 * <p>
 * The analysis package contains three primary components:
 * <ul>
 *  <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
 *  <li>{@link ArabicStemFilter}: Arabic light stemming
 *  <li>Arabic stop words file: a set of default Arabic stop words.
 * </ul>
 * <p>
 * Stop words are removed before normalization and stemming, so entries of the
 * stop set are compared against the token text as produced by the tokenizer.
 */
public final class ArabicAnalyzer extends Analyzer {

  /**
   * File containing default Arabic stopwords.
   *
   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
   * The stopword list is BSD-Licensed.
   */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Contains the stopwords used with the StopFilter. Assigned exactly once by
   * whichever constructor is used.
   */
  private final Set stoptable;

  /**
   * The comment character in the stopwords file. All lines prefixed with this will be ignored
   */
  public static final String STOPWORDS_COMMENT = "#";

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   *
   * @throws RuntimeException if the default stopword resource is missing or cannot be read
   */
  public ArabicAnalyzer() {
    stoptable = loadDefaultStopSet();
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param stopwords the words to remove from the token stream
   */
  public ArabicAnalyzer( String[] stopwords ) {
    stoptable = StopFilter.makeStopSet( stopwords );
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param stopwords table whose keys are the words to remove; the keys are copied
   */
  public ArabicAnalyzer( Hashtable stopwords ) {
    stoptable = new HashSet(stopwords.keySet());
  }

  /**
   * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
   *
   * @param stopwords file containing one stop word per line
   * @throws IOException if the file cannot be read
   */
  public ArabicAnalyzer( File stopwords ) throws IOException {
    stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
  }

  /**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   *
   * @return A TokenStream built from an ArabicLetterTokenizer filtered with
   *         StopFilter, ArabicNormalizationFilter and ArabicStemFilter.
   */
  public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new ArabicLetterTokenizer( reader );
    result = new StopFilter( result, stoptable );
    result = new ArabicNormalizationFilter( result );
    result = new ArabicStemFilter( result );
    return result;
  }

  /**
   * Loads the default stop words from {@link #DEFAULT_STOPWORD_FILE}, resolved as a
   * class resource of this class and decoded as UTF-8.
   */
  private static Set loadDefaultStopSet() {
    InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
    if (stream == null) {
      // getResourceAsStream signals a missing resource with null, not an exception;
      // fail with a message that names the resource instead of a bare NPE.
      throw new RuntimeException("Default Arabic stopword resource not found for "
          + ArabicAnalyzer.class.getName() + ": " + DEFAULT_STOPWORD_FILE);
    }
    try {
      try {
        Reader reader = new InputStreamReader(stream, "UTF-8");
        return WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
      } finally {
        // Release the resource even when decoding or reading fails.
        stream.close();
      }
    } catch (IOException e) {
      // TODO: throw IOException
      throw new RuntimeException(e);
    }
  }
}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.LetterTokenizer;
/**
 * Letter tokenizer that also keeps nonspacing marks inside tokens.
 * <p>
 * The standard Letter tokenizer fails on diacritics; this subclass widens the
 * token-character test to cope with them. Similar handling is necessary for
 * Indic scripts, Hebrew, Thaana, etc.
 */
public class ArabicLetterTokenizer extends LetterTokenizer {

  /**
   * Builds a tokenizer over the given reader.
   *
   * @param in the reader handed to the {@link LetterTokenizer} constructor
   */
  public ArabicLetterTokenizer(Reader in) {
    super(in);
  }

  /**
   * Accepts every character the parent tokenizer accepts, plus any character
   * whose Unicode general category is NonspacingMark.
   *
   * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
   */
  protected boolean isTokenChar(char c) {
    if (super.isTokenChar(c)) {
      return true;
    }
    return Character.NON_SPACING_MARK == Character.getType(c);
  }
}

View File

@ -0,0 +1,53 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
 * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
 * <p>
 * Each token's term buffer is rewritten in place; the term length is reduced
 * when normalization removes characters.
 */
public class ArabicNormalizationFilter extends TokenFilter {

  /** Normalizer applied to every token; created in the constructor. */
  protected ArabicNormalizer normalizer = null;

  /**
   * Wraps the given stream. The constructor is public so that analysis chains
   * defined outside this package can use the filter as well.
   *
   * @param input the token stream whose terms are normalized
   */
  public ArabicNormalizationFilter(TokenStream input) {
    super(input);
    normalizer = new ArabicNormalizer();
  }

  /**
   * @return the next token of the wrapped stream with its term normalized,
   *         or null when the wrapped stream is exhausted
   */
  public Token next(Token reusableToken) throws IOException {
    final Token token = input.next(reusableToken);
    if (token == null) {
      return null;
    }
    final int oldlen = token.termLength();
    final int newlen = normalizer.normalize(token.termBuffer(), oldlen);
    if (oldlen != newlen) {
      // Only touch the token when characters were actually removed.
      token.setTermLength(newlen);
    }
    return token;
  }
}

View File

@ -0,0 +1,102 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Normalizer for Arabic.
 * <p>
 * Works directly on a term buffer, so no intermediate strings are created.
 * <p>
 * The following rewrites are applied:
 * <ul>
 *  <li> alef with madda, hamza above or hamza below becomes a bare alef.
 *  <li> teh marbuta becomes heh.
 *  <li> dotless yeh (alef maksura) becomes yeh.
 *  <li> Arabic diacritics (the harakat) are removed.
 *  <li> tatweel (the stretching character) is removed.
 * </ul>
 */
public class ArabicNormalizer {
  public static final char ALEF = '\u0627';
  public static final char ALEF_MADDA = '\u0622';
  public static final char ALEF_HAMZA_ABOVE = '\u0623';
  public static final char ALEF_HAMZA_BELOW = '\u0625';

  public static final char YEH = '\u064A';
  public static final char DOTLESS_YEH = '\u0649';

  public static final char TEH_MARBUTA = '\u0629';
  public static final char HEH = '\u0647';

  public static final char TATWEEL = '\u0640';

  public static final char FATHATAN = '\u064B';
  public static final char DAMMATAN = '\u064C';
  public static final char KASRATAN = '\u064D';
  public static final char FATHA = '\u064E';
  public static final char DAMMA = '\u064F';
  public static final char KASRA = '\u0650';
  public static final char SHADDA = '\u0651';
  public static final char SUKUN = '\u0652';

  /**
   * Normalize an input buffer of Arabic text.
   *
   * @param s input buffer, modified in place
   * @param len number of valid characters in the buffer
   * @return number of valid characters after normalization
   */
  public int normalize(char s[], int len) {
    int i = 0;
    while (i < len) {
      switch (s[i]) {
        case ALEF_MADDA:
        case ALEF_HAMZA_ABOVE:
        case ALEF_HAMZA_BELOW:
          s[i] = ALEF;
          i++;
          break;
        case DOTLESS_YEH:
          s[i] = YEH;
          i++;
          break;
        case TEH_MARBUTA:
          s[i] = HEH;
          i++;
          break;
        case TATWEEL:
        case KASRATAN:
        case DAMMATAN:
        case FATHATAN:
        case FATHA:
        case DAMMA:
        case KASRA:
        case SHADDA:
        case SUKUN:
          // Stay on the same index: the following character has moved into it.
          len = delete(s, i, len);
          break;
        default:
          i++;
          break;
      }
    }
    return len;
  }

  /**
   * Remove one character by shifting the rest of the buffer left.
   *
   * @param s input buffer, modified in place
   * @param pos index of the character to remove
   * @param len number of valid characters in the buffer
   * @return number of valid characters after the removal
   */
  protected int delete(char s[], int pos, int len) {
    final int tail = len - pos - 1;
    if (pos < len) {
      System.arraycopy(s, pos + 1, s, pos, tail);
    }
    return len - 1;
  }
}

View File

@ -0,0 +1,61 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
 * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.
 * <p>
 * Each token's term buffer is stemmed in place; the term length is reduced
 * when stemming removes characters.
 */
public class ArabicStemFilter extends TokenFilter {

  /** Stemmer applied to every token; created in the constructor. */
  protected ArabicStemmer stemmer = null;

  /**
   * Wraps the given stream. The constructor is public so that analysis chains
   * defined outside this package can use the filter as well.
   *
   * @param input the token stream whose terms are stemmed
   */
  public ArabicStemFilter(TokenStream input) {
    super(input);
    stemmer = new ArabicStemmer();
  }

  /**
   * @return Returns the next token in the stream with its term stemmed, or null at EOS
   */
  public Token next(Token reusableToken) throws IOException {
    final Token token = input.next(reusableToken);
    if (token == null) {
      return null;
    }
    final int oldlen = token.termLength();
    final int newlen = stemmer.stem(token.termBuffer(), oldlen);
    if (oldlen != newlen) {
      // Only touch the token when characters were actually removed.
      token.setTermLength(newlen);
    }
    return token;
  }
}

View File

@ -0,0 +1,177 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Stemmer for Arabic.
 * <p>
 * Works directly on a term buffer, so no intermediate strings are created.
 * <p>
 * Stemming consists of two steps:
 * <ul>
 *  <li> Removal of an attached definite article, conjunction, or preposition.
 *  <li> Removal of common suffixes.
 * </ul>
 */
public class ArabicStemmer {
  public static final char ALEF = '\u0627';
  public static final char BEH = '\u0628';
  public static final char TEH_MARBUTA = '\u0629';
  public static final char TEH = '\u062A';
  public static final char FEH = '\u0641';
  public static final char KAF = '\u0643';
  public static final char LAM = '\u0644';
  public static final char NOON = '\u0646';
  public static final char HEH = '\u0647';
  public static final char WAW = '\u0648';
  public static final char YEH = '\u064A';

  /** Prefixes tried in order; only the first one that matches is removed. */
  public static final char prefixes[][] = {
      { ALEF, LAM },
      { WAW, ALEF, LAM },
      { BEH, ALEF, LAM },
      { KAF, ALEF, LAM },
      { FEH, ALEF, LAM },
      { WAW },
  };

  /** Suffixes tried in order; every one that matches the current ending is removed. */
  public static final char suffixes[][] = {
      { HEH, ALEF },
      { ALEF, NOON },
      { ALEF, TEH },
      { WAW, NOON },
      { YEH, NOON },
      { YEH, HEH },
      { YEH, TEH_MARBUTA },
      { HEH },
      { TEH_MARBUTA },
      { YEH },
  };

  /**
   * Stem an input buffer of Arabic text: prefix first, then suffixes.
   *
   * @param s input buffer, modified in place
   * @param len number of valid characters in the buffer
   * @return number of valid characters after stemming
   */
  public int stem(char s[], int len) {
    return stemSuffix(s, stemPrefix(s, len));
  }

  /**
   * Strip at most one prefix from an Arabic word.
   *
   * @param s input buffer, modified in place
   * @param len number of valid characters in the buffer
   * @return number of valid characters after prefix removal
   */
  public int stemPrefix(char s[], int len) {
    int idx = 0;
    while (idx < prefixes.length) {
      final char[] candidate = prefixes[idx];
      if (startsWith(s, len, candidate)) {
        return deleteN(s, 0, len, candidate.length);
      }
      idx++;
    }
    return len;
  }

  /**
   * Strip suffix(es) from an Arabic word. The suffix table is walked once and
   * each entry that matches the current ending is removed.
   *
   * @param s input buffer, modified in place
   * @param len number of valid characters in the buffer
   * @return number of valid characters after suffix removal
   */
  public int stemSuffix(char s[], int len) {
    int remaining = len;
    for (int idx = 0; idx < suffixes.length; idx++) {
      final char[] candidate = suffixes[idx];
      if (!endsWith(s, remaining, candidate)) {
        continue;
      }
      remaining = deleteN(s, remaining - candidate.length, remaining, candidate.length);
    }
    return remaining;
  }

  /**
   * Tells whether the word begins with the prefix and is long enough to lose it.
   *
   * @param s input buffer
   * @param len number of valid characters in the buffer
   * @param prefix prefix to look for
   * @return true if the prefix matches and may be stripped
   */
  boolean startsWith(char s[], int len, char prefix[]) {
    // A one-letter prefix (wa-) needs a word of at least 4 characters;
    // every other prefix must leave at least 2 characters behind.
    final int minLength = (prefix.length == 1) ? 4 : prefix.length + 2;
    if (len < minLength) {
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (s[i] != prefix[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether the word ends with the suffix and is long enough to lose it.
   *
   * @param s input buffer
   * @param len number of valid characters in the buffer
   * @param suffix suffix to look for
   * @return true if the suffix matches and may be stripped
   */
  boolean endsWith(char s[], int len, char suffix[]) {
    // Index where the suffix would begin; at least 2 characters must precede it.
    final int start = len - suffix.length;
    if (start < 2) {
      return false;
    }
    for (int i = 0; i < suffix.length; i++) {
      if (s[start + i] != suffix[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Remove several characters starting at one position.
   *
   * @param s input buffer, modified in place
   * @param pos index of the first character to remove
   * @param len number of valid characters in the buffer
   * @param nChars how many characters to remove
   * @return number of valid characters after the removal
   */
  protected int deleteN(char s[], int pos, int len, int nChars) {
    int remaining = len;
    int left = nChars;
    while (left-- > 0) {
      remaining = delete(s, pos, remaining);
    }
    return remaining;
  }

  /**
   * Remove one character by shifting the rest of the buffer left.
   *
   * @param s input buffer, modified in place
   * @param pos index of the character to remove
   * @param len number of valid characters in the buffer
   * @return number of valid characters after the removal
   */
  protected int delete(char s[], int pos, int len) {
    final int tail = len - pos - 1;
    if (pos < len) {
      System.arraycopy(s, pos + 1, s, pos, tail);
    }
    return len - 1;
  }
}

View File

@ -0,0 +1,5 @@
<html><head></head>
<body>
Analyzer for Arabic.
</body>
</html>

View File

@ -0,0 +1,350 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
ب
ا
أ
،
عشر
عبد
عدد
عدة
عشرة
عدم
عام
عاما
عرفات
عن
عند
عمان
عندما
على
علي
عليه
عليها
عملية
زيارة
سبتمبر
ساراييفو
سنة
سوريا
سنوات
تشرين
تم
تموز
ضد
بعد
بعض
اعادة
اعلن
اعلنت
حزب
حزيران
بسبب
اسرائيل
حسين
حتى
اتفاق
صرب
اذا
احد
اثر
غزة
برس
باسم
اجتماع
غدا
شخصا
صباح
اطار
اربعة
بغداد
اخرى
باريس
رابين
شرق
بان
ابو
اجل
غير
حركة
رئيس
جديدة
اطلاق
بشكل
بطولة
صحيفة
حاليا
بن
به
ثم
اف
ان
او
اي
بها
جهة
صفر
حيث
اكد
الا
اما
العسكرية
العراق
العاصمة
العربية
العراقي
العراقية
العام
العالم
العلاقات
العمل
امس
السعودية
الساعة
السبت
السابق
روسيا
السلطة
السلطات
السلام
التعاون
التحرير
التى
التي
اكتوبر
دورة
اكثر
ايار
ايضا
الجزائر
حماس
الاسرائيلي
الاسرائيلية
الاسبوع
الاسلحة
الاسلامية
ذكرت
الاتحاد
الاتفاق
ثلاثة
الحرب
الاحد
الذاتي
الشرطة
الاربعاء
الغربية
الخارجية
الاردن
الشرق
ايران
الحدود
الرئيس
الاخيرة
الثاني
الثانية
الاثنين
شمال
بيان
دمشق
الذى
الذي
الان
امام
ايام
خلال
الشيخ
الجيش
الدور
الضفة
الجمعة
بيريز
الاوسط
الروسي
البوسنة
الروسية
بيروت
الانتخابات
البلاد
الدفاع
الثلثاء
الانباء
الثلاثاء
الاوروبي
حوالى
الذين
الدول
الحكم
الامم
الامن
الاول
الدولة
الخليج
الخميس
الاميركي
الاميركية
الدولي
الاولى
الدولية
الحكومة
بين
ذلك
دول
دون
حول
حين
الف
الى
انه
اول
ضمن
جنوب
دولة
انها
جميع
الوزراء
المتحدث
المتحدة
دولار
النار
الوضع
القدس
المحتلة
المصدر
المباراة
المصري
الماضي
المصرية
المرحلة
القدم
اللجنة
المجلس
الفرنسي
الفرنسية
القاهرة
المدينة
المانيا
الوطنية
المجموعة
الله
الفلسطيني
الفلسطينية
الفلسطينيين
الوقت
المقرر
القوات
النهائي
المقبل
المنطقة
الولايات
المفاوضات
الملك
اليمن
اليوم
ايلول
الكويت
ـ
ف
و
و6
قد
لا
ما
مع
وزارة
وزير
مساء
قتل
كرة
مصر
هذا
فاز
كأس
ياسر
قرار
مصدر
واحد
قطاع
مصادر
مباراة
مبارك
واضاف
واضافت
فرانس
واشنطن
فان
قبل
قال
كان
لدى
نحو
هذه
وان
محمد
واكد
يذكر
مجلس
فرنسا
كريستوفر
كانت
واوضح
لبنان
مايو
مدينة
مجموعة
كانون
فى
في
كل
لم
لن
له
من
هو
هي
قوة
كما
لها
منذ
وقد
ولا
نفسه
موسكو
مقتل
لقاء
لكرة
نقطة
قوات
مقابل
لندن
هناك
وقال
وكان
منطقة
منظمة
نهاية
وكالة
وقالت
وكانت
للامم
فيه
كلم
لكن
وفي
وقف
ولم
ومن
وهو
وهي
يوم
فيها
منها
مليار
لوكالة
يكون
يمكن
كلينتون
مليون
يوليو
يونيو
نيويورك

View File

@ -0,0 +1,106 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Tests for {@link ArabicNormalizationFilter}: each case feeds one word through
 * an {@link ArabicLetterTokenizer} and the filter and compares the first token.
 */
public class TestArabicNormalizationFilter extends TestCase {

  // --- alef variants are folded to a bare alef ---

  public void testAlifMadda() throws IOException {
    check("آجن", "اجن");
  }

  public void testAlifHamzaAbove() throws IOException {
    check("أحمد", "احمد");
  }

  public void testAlifHamzaBelow() throws IOException {
    check("إعاذ", "اعاذ");
  }

  // --- letter substitutions ---

  public void testAlifMaksura() throws IOException {
    check("بنى", "بني");
  }

  public void testTehMarbuta() throws IOException {
    check("فاطمة", "فاطمه");
  }

  // --- characters that are removed ---

  public void testTatweel() throws IOException {
    check("روبرـــــت", "روبرت");
  }

  public void testFatha() throws IOException {
    check("مَبنا", "مبنا");
  }

  public void testKasra() throws IOException {
    check("علِي", "علي");
  }

  public void testDamma() throws IOException {
    check("بُوات", "بوات");
  }

  public void testFathatan() throws IOException {
    check("ولداً", "ولدا");
  }

  public void testKasratan() throws IOException {
    check("ولدٍ", "ولد");
  }

  public void testDammatan() throws IOException {
    check("ولدٌ", "ولد");
  }

  public void testSukun() throws IOException {
    check("نلْسون", "نلسون");
  }

  public void testShaddah() throws IOException {
    check("هتميّ", "هتمي");
  }

  /**
   * Runs the input through tokenizer and filter and asserts that the first
   * token produced has the expected term text.
   */
  private void check(final String input, final String expected) throws IOException {
    final ArabicNormalizationFilter filter =
        new ArabicNormalizationFilter(new ArabicLetterTokenizer(new StringReader(input)));
    final Token first = filter.next(new Token());
    assertNotNull(first);
    assertEquals(expected, first.term());
    filter.close();
  }
}

View File

@ -0,0 +1,129 @@
package org.apache.lucene.analysis.ar;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Tests for {@link ArabicStemFilter}: each case feeds one word through an
 * {@link ArabicLetterTokenizer} and the filter and compares the first token.
 */
public class TestArabicStemFilter extends TestCase {

  // --- prefixes ---

  public void testAlPrefix() throws IOException {
    check("الحسن", "حسن");
  }

  public void testWalPrefix() throws IOException {
    check("والحسن", "حسن");
  }

  public void testBalPrefix() throws IOException {
    check("بالحسن", "حسن");
  }

  public void testKalPrefix() throws IOException {
    check("كالحسن", "حسن");
  }

  public void testFalPrefix() throws IOException {
    check("فالحسن", "حسن");
  }

  public void testWaPrefix() throws IOException {
    check("وحسن", "حسن");
  }

  // --- suffixes ---

  public void testAhSuffix() throws IOException {
    check("زوجها", "زوج");
  }

  public void testAnSuffix() throws IOException {
    check("ساهدان", "ساهد");
  }

  public void testAtSuffix() throws IOException {
    check("ساهدات", "ساهد");
  }

  public void testWnSuffix() throws IOException {
    check("ساهدون", "ساهد");
  }

  public void testYnSuffix() throws IOException {
    check("ساهدين", "ساهد");
  }

  public void testYhSuffix() throws IOException {
    check("ساهديه", "ساهد");
  }

  public void testYpSuffix() throws IOException {
    check("ساهدية", "ساهد");
  }

  public void testHSuffix() throws IOException {
    check("ساهده", "ساهد");
  }

  public void testPSuffix() throws IOException {
    check("ساهدة", "ساهد");
  }

  public void testYSuffix() throws IOException {
    check("ساهدي", "ساهد");
  }

  // --- combinations and words that must stay untouched ---

  public void testComboPrefSuf() throws IOException {
    check("وساهدون", "ساهد");
  }

  public void testComboSuf() throws IOException {
    check("ساهدهات", "ساهد");
  }

  public void testShouldntStem() throws IOException {
    check("الو", "الو");
  }

  public void testNonArabic() throws IOException {
    check("English", "English");
  }

  /**
   * Runs the input through tokenizer and filter and asserts that the first
   * token produced has the expected term text.
   */
  private void check(final String input, final String expected) throws IOException {
    final ArabicStemFilter filter =
        new ArabicStemFilter(new ArabicLetterTokenizer(new StringReader(input)));
    final Token first = filter.next(new Token());
    assertNotNull(first);
    assertEquals(expected, first.term());
    filter.close();
  }
}

View File

@ -56,6 +56,31 @@ public class WordlistLoader {
return result;
}
/**
 * Reads a word list from a text file, skipping comment lines. The file is
 * opened with a {@link FileReader} (platform default character encoding) and
 * the actual parsing is delegated to {@link #getWordSet(Reader, String)}.
 * Every line of the file should contain only one word. The words need to be
 * in lowercase if you make use of an Analyzer which uses LowerCaseFilter
 * (like StandardAnalyzer).
 *
 * @param wordfile File containing the wordlist
 * @param comment The comment string to ignore
 * @return A HashSet with the file's words
 * @throws IOException if the file cannot be opened or read
 */
public static HashSet getWordSet(File wordfile, String comment) throws IOException {
  final FileReader in = new FileReader(wordfile);
  try {
    return getWordSet(in, comment);
  } finally {
    // Release the file handle whether or not reading succeeded.
    in.close();
  }
}
/**
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
@ -86,6 +111,41 @@ public class WordlistLoader {
return result;
}
/**
 * Reads a word list from a Reader, skipping comment lines. A line is treated
 * as a comment when it starts with the given comment string; every other line
 * is added to the result with leading and trailing whitespace removed. Every
 * line of the Reader should contain only one word. The words need to be in
 * lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
 * StandardAnalyzer).
 * <p>
 * The reader is closed before this method returns.
 *
 * @param reader Reader containing the wordlist
 * @param comment The string representing a comment.
 * @return A HashSet with the reader's words
 * @throws IOException if reading from or closing the reader fails
 */
public static HashSet getWordSet(Reader reader, String comment) throws IOException {
  final HashSet words = new HashSet();
  BufferedReader lines = null;
  try {
    // Reuse the caller's buffering when there is one.
    lines = (reader instanceof BufferedReader)
        ? (BufferedReader) reader
        : new BufferedReader(reader);
    for (String line = lines.readLine(); line != null; line = lines.readLine()) {
      if (line.startsWith(comment)) {
        continue;
      }
      words.add(line.trim());
    }
  } finally {
    if (lines != null) {
      lines.close();
    }
  }
  return words;
}
/**
* Reads a stem dictionary. Each line contains:
* <pre>word<b>\t</b>stem</pre>

View File

@ -35,7 +35,16 @@ public class TestWordlistLoader extends LuceneTestCase {
HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
checkSet(wordSet2);
}
/**
 * Loads a word list whose last line starts with the comment marker and checks
 * that only the three real words end up in the set.
 */
public void testComments() throws Exception {
  final String wordList = "ONE\n two \nthree\n#comment";
  final HashSet words = WordlistLoader.getWordSet(new StringReader(wordList), "#");
  checkSet(words);
  // Neither the raw comment line nor its text may appear as a word.
  assertFalse(words.contains("#comment"));
  assertFalse(words.contains("comment"));
}
private void checkSet(HashSet wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified