mirror of https://github.com/apache/lucene.git
LUCENE-1406. Added Arabic stemming and normalization. Also added new method to WordListLoader to allow for comments in word lists.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@706342 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0b90b5e23d
commit
8dfe073760
|
@ -34,6 +34,8 @@ New features
|
|||
static methods. (Shalin Shekhar Mangar via Mike McCandless)
|
||||
|
||||
|
||||
3. LUCENE-1406: Added Arabic analyzer. (Robert Muir via Grant Ingersoll)
|
||||
|
||||
Optimizations
|
||||
|
||||
Documentation
|
||||
|
|
|
@ -9,3 +9,8 @@ The snowball stemmers in
|
|||
were developed by Martin Porter and Richard Boulton.
|
||||
The full snowball package is available from
|
||||
http://snowball.tartarus.org/
|
||||
|
||||
The Arabic stemmer (contrib/analyzers) comes with a default
|
||||
stopword list, created by Jacques Savoy, that is BSD-licensed. The file
|
||||
resides in contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
|
@ -0,0 +1,124 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
|
||||
/**
|
||||
* Analyzer for Arabic.
|
||||
* <p>
|
||||
* This analyzer implements light-stemming as specified by:
|
||||
* <i>
|
||||
* Improving Stemming for Arabic Information Retrieval:
|
||||
* Light Stemming and Co-occurrence Analysis
|
||||
* </i>
|
||||
* http://ciir.cs.umass.edu/pubfiles/ir-249.pdf
|
||||
* <p>
|
||||
* The analysis package contains three primary components:
|
||||
* <ul>
|
||||
* <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
|
||||
* <li>{@link ArabicStemFilter}: Arabic light stemming
|
||||
* <li>Arabic stop words file: a set of default Arabic stop words.
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public final class ArabicAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* File containing default Arabic stopwords.
|
||||
*
|
||||
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
|
||||
* The stopword list is BSD-Licensed.
|
||||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Set stoptable = new HashSet();
|
||||
/**
|
||||
* The comment character in the stopwords file. All lines prefixed with this will be ignored
|
||||
*/
|
||||
public static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public ArabicAnalyzer() {
|
||||
try {
|
||||
InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
|
||||
reader.close();
|
||||
stream.close();
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public ArabicAnalyzer( String[] stopwords ) {
|
||||
stoptable = StopFilter.makeStopSet( stopwords );
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public ArabicAnalyzer( Hashtable stopwords ) {
|
||||
stoptable = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
|
||||
*/
|
||||
public ArabicAnalyzer( File stopwords ) throws IOException {
|
||||
stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* StandardFilter, StopFilter, ArabicNormalizationFilter and ArabicStemFilter.
|
||||
*/
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new ArabicLetterTokenizer( reader );
|
||||
result = new StopFilter( result, stoptable );
|
||||
result = new ArabicNormalizationFilter( result );
|
||||
result = new ArabicStemFilter( result );
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.LetterTokenizer;
|
||||
|
||||
/**
|
||||
* The problem with the standard Letter tokenizer is that it fails on diacritics.
|
||||
* Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
|
||||
*
|
||||
*
|
||||
*/
|
||||
public class ArabicLetterTokenizer extends LetterTokenizer {
|
||||
|
||||
public ArabicLetterTokenizer(Reader in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows for Letter category or NonspacingMark category
|
||||
* @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
|
||||
*/
|
||||
protected boolean isTokenChar(char c) {
|
||||
return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
|
||||
*
|
||||
*/
|
||||
|
||||
public class ArabicNormalizationFilter extends TokenFilter {
|
||||
|
||||
protected ArabicNormalizer normalizer = null;
|
||||
|
||||
protected ArabicNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
normalizer = new ArabicNormalizer();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Token next(Token reusableToken) throws IOException {
|
||||
if ((reusableToken = input.next(reusableToken)) == null) {
|
||||
return null;
|
||||
} else {
|
||||
int oldlen = reusableToken.termLength();
|
||||
int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
|
||||
if (oldlen != newlen)
|
||||
reusableToken.setTermLength(newlen);
|
||||
return reusableToken;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Normalizer for Arabic.
|
||||
* <p>
|
||||
* Normalization is done in-place for efficiency, operating on a termbuffer.
|
||||
* <p>
|
||||
* Normalization is defined as:
|
||||
* <ul>
|
||||
* <li> Normalization of hamza with alef seat to a bare alef.
|
||||
* <li> Normalization of teh marbuta to heh
|
||||
* <li> Normalization of dotless yeh (alef maksura) to yeh.
|
||||
* <li> Removal of Arabic diacritics (the harakat)
|
||||
* <li> Removal of tatweel (stretching character).
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class ArabicNormalizer {
|
||||
public static final char ALEF = '\u0627';
|
||||
public static final char ALEF_MADDA = '\u0622';
|
||||
public static final char ALEF_HAMZA_ABOVE = '\u0623';
|
||||
public static final char ALEF_HAMZA_BELOW = '\u0625';
|
||||
|
||||
public static final char YEH = '\u064A';
|
||||
public static final char DOTLESS_YEH = '\u0649';
|
||||
|
||||
public static final char TEH_MARBUTA = '\u0629';
|
||||
public static final char HEH = '\u0647';
|
||||
|
||||
public static final char TATWEEL = '\u0640';
|
||||
|
||||
public static final char FATHATAN = '\u064B';
|
||||
public static final char DAMMATAN = '\u064C';
|
||||
public static final char KASRATAN = '\u064D';
|
||||
public static final char FATHA = '\u064E';
|
||||
public static final char DAMMA = '\u064F';
|
||||
public static final char KASRA = '\u0650';
|
||||
public static final char SHADDA = '\u0651';
|
||||
public static final char SUKUN = '\u0652';
|
||||
|
||||
/**
|
||||
* Normalize an input buffer of Arabic text
|
||||
*
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after normalization
|
||||
*/
|
||||
public int normalize(char s[], int len) {
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
|
||||
s[i] = ALEF;
|
||||
|
||||
if (s[i] == DOTLESS_YEH)
|
||||
s[i] = YEH;
|
||||
|
||||
if (s[i] == TEH_MARBUTA)
|
||||
s[i] = HEH;
|
||||
|
||||
if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
|
||||
s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a character in-place
|
||||
*
|
||||
* @param s Input Buffer
|
||||
* @param pos Position of character to delete
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after deletion
|
||||
*/
|
||||
protected int delete(char s[], int pos, int len) {
|
||||
if (pos < len)
|
||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||
|
||||
return len - 1;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
 * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.
|
||||
*
|
||||
*/
|
||||
|
||||
public class ArabicStemFilter extends TokenFilter {
|
||||
|
||||
protected ArabicStemmer stemmer = null;
|
||||
|
||||
protected ArabicStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
stemmer = new ArabicStemmer();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public Token next(Token reusableToken) throws IOException {
|
||||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
|
||||
|
||||
if ((reusableToken = input.next(reusableToken)) == null) {
|
||||
return null;
|
||||
} else {
|
||||
int oldlen = reusableToken.termLength();
|
||||
int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen);
|
||||
if (oldlen != newlen)
|
||||
reusableToken.setTermLength(newlen);
|
||||
return reusableToken;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,177 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Stemmer for Arabic.
|
||||
* <p>
|
||||
* Stemming is done in-place for efficiency, operating on a termbuffer.
|
||||
* <p>
|
||||
* Stemming is defined as:
|
||||
* <ul>
|
||||
* <li> Removal of attached definite article, conjunction, and prepositions.
|
||||
* <li> Stemming of common suffixes.
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
public class ArabicStemmer {
|
||||
public static final char ALEF = '\u0627';
|
||||
public static final char BEH = '\u0628';
|
||||
public static final char TEH_MARBUTA = '\u0629';
|
||||
public static final char TEH = '\u062A';
|
||||
public static final char FEH = '\u0641';
|
||||
public static final char KAF = '\u0643';
|
||||
public static final char LAM = '\u0644';
|
||||
public static final char NOON = '\u0646';
|
||||
public static final char HEH = '\u0647';
|
||||
public static final char WAW = '\u0648';
|
||||
public static final char YEH = '\u064A';
|
||||
|
||||
public static final char prefixes[][] = {
|
||||
("" + ALEF + LAM).toCharArray(),
|
||||
("" + WAW + ALEF + LAM).toCharArray(),
|
||||
("" + BEH + ALEF + LAM).toCharArray(),
|
||||
("" + KAF + ALEF + LAM).toCharArray(),
|
||||
("" + FEH + ALEF + LAM).toCharArray(),
|
||||
("" + WAW).toCharArray(),
|
||||
};
|
||||
|
||||
public static final char suffixes[][] = {
|
||||
("" + HEH + ALEF).toCharArray(),
|
||||
("" + ALEF + NOON).toCharArray(),
|
||||
("" + ALEF + TEH).toCharArray(),
|
||||
("" + WAW + NOON).toCharArray(),
|
||||
("" + YEH + NOON).toCharArray(),
|
||||
("" + YEH + HEH).toCharArray(),
|
||||
("" + YEH + TEH_MARBUTA).toCharArray(),
|
||||
("" + HEH).toCharArray(),
|
||||
("" + TEH_MARBUTA).toCharArray(),
|
||||
("" + YEH).toCharArray(),
|
||||
};
|
||||
|
||||
/**
|
||||
* Stem an input buffer of Arabic text.
|
||||
*
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after normalization
|
||||
*/
|
||||
public int stem(char s[], int len) {
|
||||
len = stemPrefix(s, len);
|
||||
len = stemSuffix(s, len);
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stem a prefix off an Arabic word.
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @return new length of input buffer after stemming.
|
||||
*/
|
||||
public int stemPrefix(char s[], int len) {
|
||||
for (int i = 0; i < prefixes.length; i++)
|
||||
if (startsWith(s, len, prefixes[i]))
|
||||
return deleteN(s, 0, len, prefixes[i].length);
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stem suffix(es) off an Arabic word.
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @return new length of input buffer after stemming
|
||||
*/
|
||||
public int stemSuffix(char s[], int len) {
|
||||
for (int i = 0; i < suffixes.length; i++)
|
||||
if (endsWith(s, len, suffixes[i]))
|
||||
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the prefix matches and can be stemmed
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @param prefix prefix to check
|
||||
* @return true if the prefix matches and can be stemmed
|
||||
*/
|
||||
boolean startsWith(char s[], int len, char prefix[]) {
|
||||
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
|
||||
return false;
|
||||
} else if (len < prefix.length + 2) { // other prefixes require only 2.
|
||||
return false;
|
||||
} else {
|
||||
for (int i = 0; i < prefix.length; i++)
|
||||
if (s[i] != prefix[i])
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the suffix matches and can be stemmed
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @param suffix suffix to check
|
||||
* @return true if the suffix matches and can be stemmed
|
||||
*/
|
||||
boolean endsWith(char s[], int len, char suffix[]) {
|
||||
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
|
||||
return false;
|
||||
} else {
|
||||
for (int i = 0; i < suffix.length; i++)
|
||||
if (s[len - suffix.length + i] != suffix[i])
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Delete n characters in-place
|
||||
*
|
||||
* @param s Input Buffer
|
||||
* @param pos Position of character to delete
|
||||
* @param len Length of input buffer
|
||||
* @param nChars number of characters to delete
|
||||
* @return length of input buffer after deletion
|
||||
*/
|
||||
protected int deleteN(char s[], int pos, int len, int nChars) {
|
||||
for (int i = 0; i < nChars; i++)
|
||||
len = delete(s, pos, len);
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a character in-place
|
||||
*
|
||||
* @param s Input Buffer
|
||||
* @param pos Position of character to delete
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after deletion
|
||||
*/
|
||||
protected int delete(char s[], int pos, int len) {
|
||||
if (pos < len)
|
||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||
|
||||
return len - 1;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
<html><head></head>
|
||||
<body>
|
||||
Analyzer for Arabic.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,350 @@
|
|||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
ب
|
||||
ا
|
||||
أ
|
||||
،
|
||||
عشر
|
||||
عبد
|
||||
عدد
|
||||
عدة
|
||||
عشرة
|
||||
عدم
|
||||
عام
|
||||
عاما
|
||||
عرفات
|
||||
عن
|
||||
عند
|
||||
عمان
|
||||
عندما
|
||||
على
|
||||
علي
|
||||
عليه
|
||||
عليها
|
||||
عملية
|
||||
زيارة
|
||||
سبتمبر
|
||||
ساراييفو
|
||||
سنة
|
||||
سوريا
|
||||
سنوات
|
||||
تشرين
|
||||
تم
|
||||
تموز
|
||||
ضد
|
||||
بعد
|
||||
بعض
|
||||
اعادة
|
||||
اعلن
|
||||
اعلنت
|
||||
حزب
|
||||
حزيران
|
||||
بسبب
|
||||
اسرائيل
|
||||
حسين
|
||||
حتى
|
||||
اتفاق
|
||||
صرب
|
||||
اذا
|
||||
احد
|
||||
اثر
|
||||
غزة
|
||||
برس
|
||||
باسم
|
||||
اجتماع
|
||||
غدا
|
||||
شخصا
|
||||
صباح
|
||||
اطار
|
||||
اربعة
|
||||
بغداد
|
||||
اخرى
|
||||
باريس
|
||||
رابين
|
||||
شرق
|
||||
بان
|
||||
ابو
|
||||
اجل
|
||||
غير
|
||||
حركة
|
||||
رئيس
|
||||
جديدة
|
||||
اطلاق
|
||||
بشكل
|
||||
بطولة
|
||||
صحيفة
|
||||
حاليا
|
||||
بن
|
||||
به
|
||||
ثم
|
||||
اف
|
||||
ان
|
||||
او
|
||||
اي
|
||||
بها
|
||||
جهة
|
||||
صفر
|
||||
حيث
|
||||
اكد
|
||||
الا
|
||||
اما
|
||||
العسكرية
|
||||
العراق
|
||||
العاصمة
|
||||
العربية
|
||||
العراقي
|
||||
العراقية
|
||||
العام
|
||||
العالم
|
||||
العلاقات
|
||||
العمل
|
||||
امس
|
||||
السعودية
|
||||
الساعة
|
||||
السبت
|
||||
السابق
|
||||
روسيا
|
||||
السلطة
|
||||
السلطات
|
||||
السلام
|
||||
التعاون
|
||||
التحرير
|
||||
التى
|
||||
التي
|
||||
اكتوبر
|
||||
دورة
|
||||
اكثر
|
||||
ايار
|
||||
ايضا
|
||||
الجزائر
|
||||
حماس
|
||||
الاسرائيلي
|
||||
الاسرائيلية
|
||||
الاسبوع
|
||||
الاسلحة
|
||||
الاسلامية
|
||||
ذكرت
|
||||
الاتحاد
|
||||
الاتفاق
|
||||
ثلاثة
|
||||
الحرب
|
||||
الاحد
|
||||
الذاتي
|
||||
الشرطة
|
||||
الاربعاء
|
||||
الغربية
|
||||
الخارجية
|
||||
الاردن
|
||||
الشرق
|
||||
ايران
|
||||
الحدود
|
||||
الرئيس
|
||||
الاخيرة
|
||||
الثاني
|
||||
الثانية
|
||||
الاثنين
|
||||
شمال
|
||||
بيان
|
||||
دمشق
|
||||
الذى
|
||||
الذي
|
||||
الان
|
||||
امام
|
||||
ايام
|
||||
خلال
|
||||
الشيخ
|
||||
الجيش
|
||||
الدور
|
||||
الضفة
|
||||
الجمعة
|
||||
بيريز
|
||||
الاوسط
|
||||
الروسي
|
||||
البوسنة
|
||||
الروسية
|
||||
بيروت
|
||||
الانتخابات
|
||||
البلاد
|
||||
الدفاع
|
||||
الثلثاء
|
||||
الانباء
|
||||
الثلاثاء
|
||||
الاوروبي
|
||||
حوالى
|
||||
الذين
|
||||
الدول
|
||||
الحكم
|
||||
الامم
|
||||
الامن
|
||||
الاول
|
||||
الدولة
|
||||
الخليج
|
||||
الخميس
|
||||
الاميركي
|
||||
الاميركية
|
||||
الدولي
|
||||
الاولى
|
||||
الدولية
|
||||
الحكومة
|
||||
بين
|
||||
ذلك
|
||||
دول
|
||||
دون
|
||||
حول
|
||||
حين
|
||||
الف
|
||||
الى
|
||||
انه
|
||||
اول
|
||||
ضمن
|
||||
جنوب
|
||||
دولة
|
||||
انها
|
||||
جميع
|
||||
الوزراء
|
||||
المتحدث
|
||||
المتحدة
|
||||
دولار
|
||||
النار
|
||||
الوضع
|
||||
القدس
|
||||
المحتلة
|
||||
المصدر
|
||||
المباراة
|
||||
المصري
|
||||
الماضي
|
||||
المصرية
|
||||
المرحلة
|
||||
القدم
|
||||
اللجنة
|
||||
المجلس
|
||||
الفرنسي
|
||||
الفرنسية
|
||||
القاهرة
|
||||
المدينة
|
||||
المانيا
|
||||
الوطنية
|
||||
المجموعة
|
||||
الله
|
||||
الفلسطيني
|
||||
الفلسطينية
|
||||
الفلسطينيين
|
||||
الوقت
|
||||
المقرر
|
||||
القوات
|
||||
النهائي
|
||||
المقبل
|
||||
المنطقة
|
||||
الولايات
|
||||
المفاوضات
|
||||
الملك
|
||||
اليمن
|
||||
اليوم
|
||||
ايلول
|
||||
الكويت
|
||||
ـ
|
||||
ف
|
||||
و
|
||||
و6
|
||||
قد
|
||||
لا
|
||||
ما
|
||||
مع
|
||||
وزارة
|
||||
وزير
|
||||
مساء
|
||||
قتل
|
||||
كرة
|
||||
مصر
|
||||
هذا
|
||||
فاز
|
||||
كأس
|
||||
ياسر
|
||||
قرار
|
||||
مصدر
|
||||
واحد
|
||||
قطاع
|
||||
مصادر
|
||||
مباراة
|
||||
مبارك
|
||||
واضاف
|
||||
واضافت
|
||||
فرانس
|
||||
واشنطن
|
||||
فان
|
||||
قبل
|
||||
قال
|
||||
كان
|
||||
لدى
|
||||
نحو
|
||||
هذه
|
||||
وان
|
||||
محمد
|
||||
واكد
|
||||
يذكر
|
||||
مجلس
|
||||
فرنسا
|
||||
كريستوفر
|
||||
كانت
|
||||
واوضح
|
||||
لبنان
|
||||
مايو
|
||||
مدينة
|
||||
مجموعة
|
||||
كانون
|
||||
فى
|
||||
في
|
||||
كل
|
||||
لم
|
||||
لن
|
||||
له
|
||||
من
|
||||
هو
|
||||
هي
|
||||
قوة
|
||||
كما
|
||||
لها
|
||||
منذ
|
||||
وقد
|
||||
ولا
|
||||
نفسه
|
||||
موسكو
|
||||
مقتل
|
||||
لقاء
|
||||
لكرة
|
||||
نقطة
|
||||
قوات
|
||||
مقابل
|
||||
لندن
|
||||
هناك
|
||||
وقال
|
||||
وكان
|
||||
منطقة
|
||||
منظمة
|
||||
نهاية
|
||||
وكالة
|
||||
وقالت
|
||||
وكانت
|
||||
للامم
|
||||
فيه
|
||||
كلم
|
||||
لكن
|
||||
وفي
|
||||
وقف
|
||||
ولم
|
||||
ومن
|
||||
وهو
|
||||
وهي
|
||||
يوم
|
||||
فيها
|
||||
منها
|
||||
مليار
|
||||
لوكالة
|
||||
يكون
|
||||
يمكن
|
||||
كلينتون
|
||||
مليون
|
||||
يوليو
|
||||
يونيو
|
||||
نيويورك
|
|
@ -0,0 +1,106 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
/**
|
||||
* Test the Arabic Normalization Filter
|
||||
*
|
||||
*/
|
||||
public class TestArabicNormalizationFilter extends TestCase {
|
||||
|
||||
public void testAlifMadda() throws IOException {
|
||||
check("آجن", "اجن");
|
||||
}
|
||||
|
||||
public void testAlifHamzaAbove() throws IOException {
|
||||
check("أحمد", "احمد");
|
||||
}
|
||||
|
||||
public void testAlifHamzaBelow() throws IOException {
|
||||
check("إعاذ", "اعاذ");
|
||||
}
|
||||
|
||||
public void testAlifMaksura() throws IOException {
|
||||
check("بنى", "بني");
|
||||
}
|
||||
|
||||
public void testTehMarbuta() throws IOException {
|
||||
check("فاطمة", "فاطمه");
|
||||
}
|
||||
|
||||
public void testTatweel() throws IOException {
|
||||
check("روبرـــــت", "روبرت");
|
||||
}
|
||||
|
||||
public void testFatha() throws IOException {
|
||||
check("مَبنا", "مبنا");
|
||||
}
|
||||
|
||||
public void testKasra() throws IOException {
|
||||
check("علِي", "علي");
|
||||
}
|
||||
|
||||
public void testDamma() throws IOException {
|
||||
check("بُوات", "بوات");
|
||||
}
|
||||
|
||||
public void testFathatan() throws IOException {
|
||||
check("ولداً", "ولدا");
|
||||
}
|
||||
|
||||
public void testKasratan() throws IOException {
|
||||
check("ولدٍ", "ولد");
|
||||
}
|
||||
|
||||
public void testDammatan() throws IOException {
|
||||
check("ولدٌ", "ولد");
|
||||
}
|
||||
|
||||
public void testSukun() throws IOException {
|
||||
check("نلْسون", "نلسون");
|
||||
}
|
||||
|
||||
public void testShaddah() throws IOException {
|
||||
check("هتميّ", "هتمي");
|
||||
}
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
|
||||
ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = filter.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
fail();
|
||||
assertEquals(expected, nextToken.term());
|
||||
filter.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,129 @@
|
|||
package org.apache.lucene.analysis.ar;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
/**
|
||||
 * Test the Arabic Stem Filter
|
||||
*
|
||||
*/
|
||||
public class TestArabicStemFilter extends TestCase {
|
||||
|
||||
public void testAlPrefix() throws IOException {
|
||||
check("الحسن", "حسن");
|
||||
}
|
||||
|
||||
public void testWalPrefix() throws IOException {
|
||||
check("والحسن", "حسن");
|
||||
}
|
||||
|
||||
public void testBalPrefix() throws IOException {
|
||||
check("بالحسن", "حسن");
|
||||
}
|
||||
|
||||
public void testKalPrefix() throws IOException {
|
||||
check("كالحسن", "حسن");
|
||||
}
|
||||
|
||||
public void testFalPrefix() throws IOException {
|
||||
check("فالحسن", "حسن");
|
||||
}
|
||||
|
||||
public void testWaPrefix() throws IOException {
|
||||
check("وحسن", "حسن");
|
||||
}
|
||||
|
||||
public void testAhSuffix() throws IOException {
|
||||
check("زوجها", "زوج");
|
||||
}
|
||||
|
||||
public void testAnSuffix() throws IOException {
|
||||
check("ساهدان", "ساهد");
|
||||
}
|
||||
|
||||
public void testAtSuffix() throws IOException {
|
||||
check("ساهدات", "ساهد");
|
||||
}
|
||||
|
||||
public void testWnSuffix() throws IOException {
|
||||
check("ساهدون", "ساهد");
|
||||
}
|
||||
|
||||
public void testYnSuffix() throws IOException {
|
||||
check("ساهدين", "ساهد");
|
||||
}
|
||||
|
||||
public void testYhSuffix() throws IOException {
|
||||
check("ساهديه", "ساهد");
|
||||
}
|
||||
|
||||
public void testYpSuffix() throws IOException {
|
||||
check("ساهدية", "ساهد");
|
||||
}
|
||||
|
||||
public void testHSuffix() throws IOException {
|
||||
check("ساهده", "ساهد");
|
||||
}
|
||||
|
||||
public void testPSuffix() throws IOException {
|
||||
check("ساهدة", "ساهد");
|
||||
}
|
||||
|
||||
public void testYSuffix() throws IOException {
|
||||
check("ساهدي", "ساهد");
|
||||
}
|
||||
|
||||
public void testComboPrefSuf() throws IOException {
|
||||
check("وساهدون", "ساهد");
|
||||
}
|
||||
|
||||
public void testComboSuf() throws IOException {
|
||||
check("ساهدهات", "ساهد");
|
||||
}
|
||||
|
||||
public void testShouldntStem() throws IOException {
|
||||
check("الو", "الو");
|
||||
}
|
||||
|
||||
public void testNonArabic() throws IOException {
|
||||
check("English", "English");
|
||||
}
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
|
||||
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = filter.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
fail();
|
||||
assertEquals(expected, nextToken.term());
|
||||
filter.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -56,6 +56,31 @@ public class WordlistLoader {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
|
||||
* leading and trailing whitespace). Every line of the file should contain only
|
||||
* one word. The words need to be in lowercase if you make use of an
|
||||
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
|
||||
*
|
||||
* @param wordfile File containing the wordlist
|
||||
* @param comment The comment string to ignore
|
||||
* @return A HashSet with the file's words
|
||||
*/
|
||||
public static HashSet getWordSet(File wordfile, String comment) throws IOException {
|
||||
HashSet result = new HashSet();
|
||||
FileReader reader = null;
|
||||
try {
|
||||
reader = new FileReader(wordfile);
|
||||
result = getWordSet(reader, comment);
|
||||
}
|
||||
finally {
|
||||
if (reader != null)
|
||||
reader.close();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
|
||||
* leading and trailing whitespace). Every line of the Reader should contain only
|
||||
|
@ -86,6 +111,41 @@ public class WordlistLoader {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
|
||||
* leading and trailing whitespace). Every line of the Reader should contain only
|
||||
* one word. The words need to be in lowercase if you make use of an
|
||||
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
|
||||
*
|
||||
* @param reader Reader containing the wordlist
|
||||
* @param comment The string representing a comment.
|
||||
* @return A HashSet with the reader's words
|
||||
*/
|
||||
public static HashSet getWordSet(Reader reader, String comment) throws IOException {
|
||||
HashSet result = new HashSet();
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
if (reader instanceof BufferedReader) {
|
||||
br = (BufferedReader) reader;
|
||||
} else {
|
||||
br = new BufferedReader(reader);
|
||||
}
|
||||
String word = null;
|
||||
while ((word = br.readLine()) != null) {
|
||||
if (word.startsWith(comment) == false){
|
||||
result.add(word.trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
finally {
|
||||
if (br != null)
|
||||
br.close();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reads a stem dictionary. Each line contains:
|
||||
* <pre>word<b>\t</b>stem</pre>
|
||||
|
|
|
@ -36,6 +36,15 @@ public class TestWordlistLoader extends LuceneTestCase {
|
|||
checkSet(wordSet2);
|
||||
}
|
||||
|
||||
public void testComments() throws Exception {
|
||||
String s = "ONE\n two \nthree\n#comment";
|
||||
HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
|
||||
checkSet(wordSet1);
|
||||
assertFalse(wordSet1.contains("#comment"));
|
||||
assertFalse(wordSet1.contains("comment"));
|
||||
}
|
||||
|
||||
|
||||
private void checkSet(HashSet wordset) {
|
||||
assertEquals(3, wordset.size());
|
||||
assertTrue(wordset.contains("ONE")); // case is not modified
|
||||
|
|
Loading…
Reference in New Issue