mirror of https://github.com/apache/lucene.git
LUCENE-1406. Added Arabic stemming and normalization. Also added new method to WordlistLoader to allow for comments in word lists.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@706342 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0b90b5e23d
commit
8dfe073760
|
@ -34,6 +34,8 @@ New features
|
||||||
static methods. (Shalin Shekhar Mangar via Mike McCandless)
|
static methods. (Shalin Shekhar Mangar via Mike McCandless)
|
||||||
|
|
||||||
|
|
||||||
|
3. LUCENE-1406: Added Arabic analyzer. (Robert Muir via Grant Ingersoll)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
|
@ -9,3 +9,8 @@ The snowball stemmers in
|
||||||
were developed by Martin Porter and Richard Boulton.
|
were developed by Martin Porter and Richard Boulton.
|
||||||
The full snowball package is available from
|
The full snowball package is available from
|
||||||
http://snowball.tartarus.org/
|
http://snowball.tartarus.org/
|
||||||
|
|
||||||
|
The Arabic stemmer (contrib/analyzers) comes with a default
|
||||||
|
stopword list that is BSD-licensed created by Jacques Savoy. The file
|
||||||
|
resides in contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
|
||||||
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
|
@ -0,0 +1,124 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.WordlistLoader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Arabic.
|
||||||
|
* <p>
|
||||||
|
* This analyzer implements light-stemming as specified by:
|
||||||
|
* <i>
|
||||||
|
* Improving Stemming for Arabic Information Retrieval:
|
||||||
|
* Light Stemming and Co-occurrence Analysis
|
||||||
|
* </i>
|
||||||
|
* http://ciir.cs.umass.edu/pubfiles/ir-249.pdf
|
||||||
|
* <p>
|
||||||
|
* The analysis package contains three primary components:
|
||||||
|
* <ul>
|
||||||
|
* <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
|
||||||
|
* <li>{@link ArabicStemFilter}: Arabic light stemming
|
||||||
|
* <li>Arabic stop words file: a set of default Arabic stop words.
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public final class ArabicAnalyzer extends Analyzer {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File containing default Arabic stopwords.
|
||||||
|
*
|
||||||
|
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
|
||||||
|
* The stopword list is BSD-Licensed.
|
||||||
|
*/
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Contains the stopwords used with the StopFilter.
|
||||||
|
*/
|
||||||
|
private Set stoptable = new HashSet();
|
||||||
|
/**
|
||||||
|
* The comment character in the stopwords file. All lines prefixed with this will be ignored
|
||||||
|
*/
|
||||||
|
public static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public ArabicAnalyzer() {
|
||||||
|
try {
|
||||||
|
InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
|
||||||
|
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||||
|
stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
|
||||||
|
reader.close();
|
||||||
|
stream.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
// TODO: throw IOException
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public ArabicAnalyzer( String[] stopwords ) {
|
||||||
|
stoptable = StopFilter.makeStopSet( stopwords );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public ArabicAnalyzer( Hashtable stopwords ) {
|
||||||
|
stoptable = new HashSet(stopwords.keySet());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
|
||||||
|
*/
|
||||||
|
public ArabicAnalyzer( File stopwords ) throws IOException {
|
||||||
|
stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||||
|
*
|
||||||
|
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||||
|
* StandardFilter, StopFilter, ArabicNormalizationFilter and ArabicStemFilter.
|
||||||
|
*/
|
||||||
|
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
TokenStream result = new ArabicLetterTokenizer( reader );
|
||||||
|
result = new StopFilter( result, stoptable );
|
||||||
|
result = new ArabicNormalizationFilter( result );
|
||||||
|
result = new ArabicStemFilter( result );
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.LetterTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The problem with the standard Letter tokenizer is that it fails on diacritics.
|
||||||
|
* Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ArabicLetterTokenizer extends LetterTokenizer {
|
||||||
|
|
||||||
|
public ArabicLetterTokenizer(Reader in) {
|
||||||
|
super(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allows for Letter category or NonspacingMark category
|
||||||
|
* @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
|
||||||
|
*/
|
||||||
|
protected boolean isTokenChar(char c) {
|
||||||
|
return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,53 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class ArabicNormalizationFilter extends TokenFilter {
|
||||||
|
|
||||||
|
protected ArabicNormalizer normalizer = null;
|
||||||
|
|
||||||
|
protected ArabicNormalizationFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
normalizer = new ArabicNormalizer();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public Token next(Token reusableToken) throws IOException {
|
||||||
|
if ((reusableToken = input.next(reusableToken)) == null) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
int oldlen = reusableToken.termLength();
|
||||||
|
int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
|
||||||
|
if (oldlen != newlen)
|
||||||
|
reusableToken.setTermLength(newlen);
|
||||||
|
return reusableToken;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,102 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizer for Arabic.
|
||||||
|
* <p>
|
||||||
|
* Normalization is done in-place for efficiency, operating on a termbuffer.
|
||||||
|
* <p>
|
||||||
|
* Normalization is defined as:
|
||||||
|
* <ul>
|
||||||
|
* <li> Normalization of hamza with alef seat to a bare alef.
|
||||||
|
* <li> Normalization of teh marbuta to heh
|
||||||
|
* <li> Normalization of dotless yeh (alef maksura) to yeh.
|
||||||
|
* <li> Removal of Arabic diacritics (the harakat)
|
||||||
|
* <li> Removal of tatweel (stretching character).
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ArabicNormalizer {
|
||||||
|
public static final char ALEF = '\u0627';
|
||||||
|
public static final char ALEF_MADDA = '\u0622';
|
||||||
|
public static final char ALEF_HAMZA_ABOVE = '\u0623';
|
||||||
|
public static final char ALEF_HAMZA_BELOW = '\u0625';
|
||||||
|
|
||||||
|
public static final char YEH = '\u064A';
|
||||||
|
public static final char DOTLESS_YEH = '\u0649';
|
||||||
|
|
||||||
|
public static final char TEH_MARBUTA = '\u0629';
|
||||||
|
public static final char HEH = '\u0647';
|
||||||
|
|
||||||
|
public static final char TATWEEL = '\u0640';
|
||||||
|
|
||||||
|
public static final char FATHATAN = '\u064B';
|
||||||
|
public static final char DAMMATAN = '\u064C';
|
||||||
|
public static final char KASRATAN = '\u064D';
|
||||||
|
public static final char FATHA = '\u064E';
|
||||||
|
public static final char DAMMA = '\u064F';
|
||||||
|
public static final char KASRA = '\u0650';
|
||||||
|
public static final char SHADDA = '\u0651';
|
||||||
|
public static final char SUKUN = '\u0652';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize an input buffer of Arabic text
|
||||||
|
*
|
||||||
|
* @param s input buffer
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @return length of input buffer after normalization
|
||||||
|
*/
|
||||||
|
public int normalize(char s[], int len) {
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
|
||||||
|
s[i] = ALEF;
|
||||||
|
|
||||||
|
if (s[i] == DOTLESS_YEH)
|
||||||
|
s[i] = YEH;
|
||||||
|
|
||||||
|
if (s[i] == TEH_MARBUTA)
|
||||||
|
s[i] = HEH;
|
||||||
|
|
||||||
|
if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
|
||||||
|
s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
|
||||||
|
len = delete(s, i, len);
|
||||||
|
i--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete a character in-place
|
||||||
|
*
|
||||||
|
* @param s Input Buffer
|
||||||
|
* @param pos Position of character to delete
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @return length of input buffer after deletion
|
||||||
|
*/
|
||||||
|
protected int delete(char s[], int pos, int len) {
|
||||||
|
if (pos < len)
|
||||||
|
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||||
|
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,61 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words..
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class ArabicStemFilter extends TokenFilter {
|
||||||
|
|
||||||
|
protected ArabicStemmer stemmer = null;
|
||||||
|
|
||||||
|
protected ArabicStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
stemmer = new ArabicStemmer();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Returns the next token in the stream, or null at EOS
|
||||||
|
*/
|
||||||
|
public Token next(Token reusableToken) throws IOException {
|
||||||
|
/**
|
||||||
|
* The actual token in the input stream.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
if ((reusableToken = input.next(reusableToken)) == null) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
int oldlen = reusableToken.termLength();
|
||||||
|
int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen);
|
||||||
|
if (oldlen != newlen)
|
||||||
|
reusableToken.setTermLength(newlen);
|
||||||
|
return reusableToken;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,177 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stemmer for Arabic.
|
||||||
|
* <p>
|
||||||
|
* Stemming is done in-place for efficiency, operating on a termbuffer.
|
||||||
|
* <p>
|
||||||
|
* Stemming is defined as:
|
||||||
|
* <ul>
|
||||||
|
* <li> Removal of attached definite article, conjunction, and prepositions.
|
||||||
|
* <li> Stemming of common suffixes.
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ArabicStemmer {
  public static final char ALEF = '\u0627';
  public static final char BEH = '\u0628';
  public static final char TEH_MARBUTA = '\u0629';
  public static final char TEH = '\u062A';
  public static final char FEH = '\u0641';
  public static final char KAF = '\u0643';
  public static final char LAM = '\u0644';
  public static final char NOON = '\u0646';
  public static final char HEH = '\u0647';
  public static final char WAW = '\u0648';
  public static final char YEH = '\u064A';

  // attached definite article, conjunction, and prepositions (checked in order)
  public static final char prefixes[][] = {
      ("" + ALEF + LAM).toCharArray(),
      ("" + WAW + ALEF + LAM).toCharArray(),
      ("" + BEH + ALEF + LAM).toCharArray(),
      ("" + KAF + ALEF + LAM).toCharArray(),
      ("" + FEH + ALEF + LAM).toCharArray(),
      ("" + WAW).toCharArray(),
  };

  // common suffixes (checked in order; more than one may be removed)
  public static final char suffixes[][] = {
      ("" + HEH + ALEF).toCharArray(),
      ("" + ALEF + NOON).toCharArray(),
      ("" + ALEF + TEH).toCharArray(),
      ("" + WAW + NOON).toCharArray(),
      ("" + YEH + NOON).toCharArray(),
      ("" + YEH + HEH).toCharArray(),
      ("" + YEH + TEH_MARBUTA).toCharArray(),
      ("" + HEH).toCharArray(),
      ("" + TEH_MARBUTA).toCharArray(),
      ("" + YEH).toCharArray(),
  };

  /**
   * Stem an input buffer of Arabic text.  The buffer is rewritten in place:
   * at most one prefix and any matching suffixes are stripped.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int stem(char s[], int len) {
    return stemSuffix(s, stemPrefix(s, len));
  }

  /**
   * Stem a prefix off an Arabic word.  Only the first matching prefix is
   * removed.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return new length of input buffer after stemming.
   */
  public int stemPrefix(char s[], int len) {
    for (int idx = 0; idx < prefixes.length; idx++) {
      char[] candidate = prefixes[idx];
      if (startsWith(s, len, candidate)) {
        return deleteN(s, 0, len, candidate.length);
      }
    }
    return len;
  }

  /**
   * Stem suffix(es) off an Arabic word.  Every suffix in the table that
   * matches (in order) is removed.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return new length of input buffer after stemming
   */
  public int stemSuffix(char s[], int len) {
    for (int idx = 0; idx < suffixes.length; idx++) {
      char[] candidate = suffixes[idx];
      if (endsWith(s, len, candidate)) {
        len = deleteN(s, len - candidate.length, len, candidate.length);
      }
    }
    return len;
  }

  /**
   * Returns true if the prefix matches and can be stemmed
   * @param s input buffer
   * @param len length of input buffer
   * @param prefix prefix to check
   * @return true if the prefix matches and can be stemmed
   */
  boolean startsWith(char s[], int len, char prefix[]) {
    // the single-char wa- prefix requires at least 3 remaining characters;
    // longer prefixes require only 2 remaining characters
    int minLength = (prefix.length == 1) ? 4 : prefix.length + 2;
    if (len < minLength) {
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (s[i] != prefix[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if the suffix matches and can be stemmed
   * @param s input buffer
   * @param len length of input buffer
   * @param suffix suffix to check
   * @return true if the suffix matches and can be stemmed
   */
  boolean endsWith(char s[], int len, char suffix[]) {
    // all suffixes require at least 2 characters after stemming
    if (len < suffix.length + 2) {
      return false;
    }
    int offset = len - suffix.length;
    for (int i = 0; i < suffix.length; i++) {
      if (s[offset + i] != suffix[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Delete n characters in-place
   *
   * @param s Input Buffer
   * @param pos Position of character to delete
   * @param len Length of input buffer
   * @param nChars number of characters to delete
   * @return length of input buffer after deletion
   */
  protected int deleteN(char s[], int pos, int len, int nChars) {
    int remaining = nChars;
    while (remaining-- > 0) {
      len = delete(s, pos, len);
    }
    return len;
  }

  /**
   * Delete a character in-place
   *
   * @param s Input Buffer
   * @param pos Position of character to delete
   * @param len length of input buffer
   * @return length of input buffer after deletion
   */
  protected int delete(char s[], int pos, int len) {
    int tail = len - pos - 1;
    if (tail > 0) {
      System.arraycopy(s, pos + 1, s, pos, tail);
    }
    return len - 1;
  }

}
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html><head></head>
|
||||||
|
<body>
|
||||||
|
Analyzer for Arabic.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,350 @@
|
||||||
|
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
ب
|
||||||
|
ا
|
||||||
|
أ
|
||||||
|
،
|
||||||
|
عشر
|
||||||
|
عبد
|
||||||
|
عدد
|
||||||
|
عدة
|
||||||
|
عشرة
|
||||||
|
عدم
|
||||||
|
عام
|
||||||
|
عاما
|
||||||
|
عرفات
|
||||||
|
عن
|
||||||
|
عند
|
||||||
|
عمان
|
||||||
|
عندما
|
||||||
|
على
|
||||||
|
علي
|
||||||
|
عليه
|
||||||
|
عليها
|
||||||
|
عملية
|
||||||
|
زيارة
|
||||||
|
سبتمبر
|
||||||
|
ساراييفو
|
||||||
|
سنة
|
||||||
|
سوريا
|
||||||
|
سنوات
|
||||||
|
تشرين
|
||||||
|
تم
|
||||||
|
تموز
|
||||||
|
ضد
|
||||||
|
بعد
|
||||||
|
بعض
|
||||||
|
اعادة
|
||||||
|
اعلن
|
||||||
|
اعلنت
|
||||||
|
حزب
|
||||||
|
حزيران
|
||||||
|
بسبب
|
||||||
|
اسرائيل
|
||||||
|
حسين
|
||||||
|
حتى
|
||||||
|
اتفاق
|
||||||
|
صرب
|
||||||
|
اذا
|
||||||
|
احد
|
||||||
|
اثر
|
||||||
|
غزة
|
||||||
|
برس
|
||||||
|
باسم
|
||||||
|
اجتماع
|
||||||
|
غدا
|
||||||
|
شخصا
|
||||||
|
صباح
|
||||||
|
اطار
|
||||||
|
اربعة
|
||||||
|
بغداد
|
||||||
|
اخرى
|
||||||
|
باريس
|
||||||
|
رابين
|
||||||
|
شرق
|
||||||
|
بان
|
||||||
|
ابو
|
||||||
|
اجل
|
||||||
|
غير
|
||||||
|
حركة
|
||||||
|
رئيس
|
||||||
|
جديدة
|
||||||
|
اطلاق
|
||||||
|
بشكل
|
||||||
|
بطولة
|
||||||
|
صحيفة
|
||||||
|
حاليا
|
||||||
|
بن
|
||||||
|
به
|
||||||
|
ثم
|
||||||
|
اف
|
||||||
|
ان
|
||||||
|
او
|
||||||
|
اي
|
||||||
|
بها
|
||||||
|
جهة
|
||||||
|
صفر
|
||||||
|
حيث
|
||||||
|
اكد
|
||||||
|
الا
|
||||||
|
اما
|
||||||
|
العسكرية
|
||||||
|
العراق
|
||||||
|
العاصمة
|
||||||
|
العربية
|
||||||
|
العراقي
|
||||||
|
العراقية
|
||||||
|
العام
|
||||||
|
العالم
|
||||||
|
العلاقات
|
||||||
|
العمل
|
||||||
|
امس
|
||||||
|
السعودية
|
||||||
|
الساعة
|
||||||
|
السبت
|
||||||
|
السابق
|
||||||
|
روسيا
|
||||||
|
السلطة
|
||||||
|
السلطات
|
||||||
|
السلام
|
||||||
|
التعاون
|
||||||
|
التحرير
|
||||||
|
التى
|
||||||
|
التي
|
||||||
|
اكتوبر
|
||||||
|
دورة
|
||||||
|
اكثر
|
||||||
|
ايار
|
||||||
|
ايضا
|
||||||
|
الجزائر
|
||||||
|
حماس
|
||||||
|
الاسرائيلي
|
||||||
|
الاسرائيلية
|
||||||
|
الاسبوع
|
||||||
|
الاسلحة
|
||||||
|
الاسلامية
|
||||||
|
ذكرت
|
||||||
|
الاتحاد
|
||||||
|
الاتفاق
|
||||||
|
ثلاثة
|
||||||
|
الحرب
|
||||||
|
الاحد
|
||||||
|
الذاتي
|
||||||
|
الشرطة
|
||||||
|
الاربعاء
|
||||||
|
الغربية
|
||||||
|
الخارجية
|
||||||
|
الاردن
|
||||||
|
الشرق
|
||||||
|
ايران
|
||||||
|
الحدود
|
||||||
|
الرئيس
|
||||||
|
الاخيرة
|
||||||
|
الثاني
|
||||||
|
الثانية
|
||||||
|
الاثنين
|
||||||
|
شمال
|
||||||
|
بيان
|
||||||
|
دمشق
|
||||||
|
الذى
|
||||||
|
الذي
|
||||||
|
الان
|
||||||
|
امام
|
||||||
|
ايام
|
||||||
|
خلال
|
||||||
|
الشيخ
|
||||||
|
الجيش
|
||||||
|
الدور
|
||||||
|
الضفة
|
||||||
|
الجمعة
|
||||||
|
بيريز
|
||||||
|
الاوسط
|
||||||
|
الروسي
|
||||||
|
البوسنة
|
||||||
|
الروسية
|
||||||
|
بيروت
|
||||||
|
الانتخابات
|
||||||
|
البلاد
|
||||||
|
الدفاع
|
||||||
|
الثلثاء
|
||||||
|
الانباء
|
||||||
|
الثلاثاء
|
||||||
|
الاوروبي
|
||||||
|
حوالى
|
||||||
|
الذين
|
||||||
|
الدول
|
||||||
|
الحكم
|
||||||
|
الامم
|
||||||
|
الامن
|
||||||
|
الاول
|
||||||
|
الدولة
|
||||||
|
الخليج
|
||||||
|
الخميس
|
||||||
|
الاميركي
|
||||||
|
الاميركية
|
||||||
|
الدولي
|
||||||
|
الاولى
|
||||||
|
الدولية
|
||||||
|
الحكومة
|
||||||
|
بين
|
||||||
|
ذلك
|
||||||
|
دول
|
||||||
|
دون
|
||||||
|
حول
|
||||||
|
حين
|
||||||
|
الف
|
||||||
|
الى
|
||||||
|
انه
|
||||||
|
اول
|
||||||
|
ضمن
|
||||||
|
جنوب
|
||||||
|
دولة
|
||||||
|
انها
|
||||||
|
جميع
|
||||||
|
الوزراء
|
||||||
|
المتحدث
|
||||||
|
المتحدة
|
||||||
|
دولار
|
||||||
|
النار
|
||||||
|
الوضع
|
||||||
|
القدس
|
||||||
|
المحتلة
|
||||||
|
المصدر
|
||||||
|
المباراة
|
||||||
|
المصري
|
||||||
|
الماضي
|
||||||
|
المصرية
|
||||||
|
المرحلة
|
||||||
|
القدم
|
||||||
|
اللجنة
|
||||||
|
المجلس
|
||||||
|
الفرنسي
|
||||||
|
الفرنسية
|
||||||
|
القاهرة
|
||||||
|
المدينة
|
||||||
|
المانيا
|
||||||
|
الوطنية
|
||||||
|
المجموعة
|
||||||
|
الله
|
||||||
|
الفلسطيني
|
||||||
|
الفلسطينية
|
||||||
|
الفلسطينيين
|
||||||
|
الوقت
|
||||||
|
المقرر
|
||||||
|
القوات
|
||||||
|
النهائي
|
||||||
|
المقبل
|
||||||
|
المنطقة
|
||||||
|
الولايات
|
||||||
|
المفاوضات
|
||||||
|
الملك
|
||||||
|
اليمن
|
||||||
|
اليوم
|
||||||
|
ايلول
|
||||||
|
الكويت
|
||||||
|
ـ
|
||||||
|
ف
|
||||||
|
و
|
||||||
|
و6
|
||||||
|
قد
|
||||||
|
لا
|
||||||
|
ما
|
||||||
|
مع
|
||||||
|
وزارة
|
||||||
|
وزير
|
||||||
|
مساء
|
||||||
|
قتل
|
||||||
|
كرة
|
||||||
|
مصر
|
||||||
|
هذا
|
||||||
|
فاز
|
||||||
|
كأس
|
||||||
|
ياسر
|
||||||
|
قرار
|
||||||
|
مصدر
|
||||||
|
واحد
|
||||||
|
قطاع
|
||||||
|
مصادر
|
||||||
|
مباراة
|
||||||
|
مبارك
|
||||||
|
واضاف
|
||||||
|
واضافت
|
||||||
|
فرانس
|
||||||
|
واشنطن
|
||||||
|
فان
|
||||||
|
قبل
|
||||||
|
قال
|
||||||
|
كان
|
||||||
|
لدى
|
||||||
|
نحو
|
||||||
|
هذه
|
||||||
|
وان
|
||||||
|
محمد
|
||||||
|
واكد
|
||||||
|
يذكر
|
||||||
|
مجلس
|
||||||
|
فرنسا
|
||||||
|
كريستوفر
|
||||||
|
كانت
|
||||||
|
واوضح
|
||||||
|
لبنان
|
||||||
|
مايو
|
||||||
|
مدينة
|
||||||
|
مجموعة
|
||||||
|
كانون
|
||||||
|
فى
|
||||||
|
في
|
||||||
|
كل
|
||||||
|
لم
|
||||||
|
لن
|
||||||
|
له
|
||||||
|
من
|
||||||
|
هو
|
||||||
|
هي
|
||||||
|
قوة
|
||||||
|
كما
|
||||||
|
لها
|
||||||
|
منذ
|
||||||
|
وقد
|
||||||
|
ولا
|
||||||
|
نفسه
|
||||||
|
موسكو
|
||||||
|
مقتل
|
||||||
|
لقاء
|
||||||
|
لكرة
|
||||||
|
نقطة
|
||||||
|
قوات
|
||||||
|
مقابل
|
||||||
|
لندن
|
||||||
|
هناك
|
||||||
|
وقال
|
||||||
|
وكان
|
||||||
|
منطقة
|
||||||
|
منظمة
|
||||||
|
نهاية
|
||||||
|
وكالة
|
||||||
|
وقالت
|
||||||
|
وكانت
|
||||||
|
للامم
|
||||||
|
فيه
|
||||||
|
كلم
|
||||||
|
لكن
|
||||||
|
وفي
|
||||||
|
وقف
|
||||||
|
ولم
|
||||||
|
ومن
|
||||||
|
وهو
|
||||||
|
وهي
|
||||||
|
يوم
|
||||||
|
فيها
|
||||||
|
منها
|
||||||
|
مليار
|
||||||
|
لوكالة
|
||||||
|
يكون
|
||||||
|
يمكن
|
||||||
|
كلينتون
|
||||||
|
مليون
|
||||||
|
يوليو
|
||||||
|
يونيو
|
||||||
|
نيويورك
|
|
@ -0,0 +1,106 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the Arabic Normalization Filter
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class TestArabicNormalizationFilter extends TestCase {
|
||||||
|
|
||||||
|
public void testAlifMadda() throws IOException {
|
||||||
|
check("آجن", "اجن");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAlifHamzaAbove() throws IOException {
|
||||||
|
check("أحمد", "احمد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAlifHamzaBelow() throws IOException {
|
||||||
|
check("إعاذ", "اعاذ");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAlifMaksura() throws IOException {
|
||||||
|
check("بنى", "بني");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTehMarbuta() throws IOException {
|
||||||
|
check("فاطمة", "فاطمه");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTatweel() throws IOException {
|
||||||
|
check("روبرـــــت", "روبرت");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFatha() throws IOException {
|
||||||
|
check("مَبنا", "مبنا");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKasra() throws IOException {
|
||||||
|
check("علِي", "علي");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDamma() throws IOException {
|
||||||
|
check("بُوات", "بوات");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFathatan() throws IOException {
|
||||||
|
check("ولداً", "ولدا");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKasratan() throws IOException {
|
||||||
|
check("ولدٍ", "ولد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDammatan() throws IOException {
|
||||||
|
check("ولدٌ", "ولد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSukun() throws IOException {
|
||||||
|
check("نلْسون", "نلسون");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testShaddah() throws IOException {
|
||||||
|
check("هتميّ", "هتمي");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void check(final String input, final String expected) throws IOException {
|
||||||
|
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
|
||||||
|
ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
|
||||||
|
final Token reusableToken = new Token();
|
||||||
|
Token nextToken = filter.next(reusableToken);
|
||||||
|
if (nextToken == null)
|
||||||
|
fail();
|
||||||
|
assertEquals(expected, nextToken.term());
|
||||||
|
filter.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,129 @@
|
||||||
|
package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the Arabic Normalization Filter
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class TestArabicStemFilter extends TestCase {
|
||||||
|
|
||||||
|
public void testAlPrefix() throws IOException {
|
||||||
|
check("الحسن", "حسن");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWalPrefix() throws IOException {
|
||||||
|
check("والحسن", "حسن");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBalPrefix() throws IOException {
|
||||||
|
check("بالحسن", "حسن");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKalPrefix() throws IOException {
|
||||||
|
check("كالحسن", "حسن");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFalPrefix() throws IOException {
|
||||||
|
check("فالحسن", "حسن");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWaPrefix() throws IOException {
|
||||||
|
check("وحسن", "حسن");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAhSuffix() throws IOException {
|
||||||
|
check("زوجها", "زوج");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAnSuffix() throws IOException {
|
||||||
|
check("ساهدان", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAtSuffix() throws IOException {
|
||||||
|
check("ساهدات", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWnSuffix() throws IOException {
|
||||||
|
check("ساهدون", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testYnSuffix() throws IOException {
|
||||||
|
check("ساهدين", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testYhSuffix() throws IOException {
|
||||||
|
check("ساهديه", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testYpSuffix() throws IOException {
|
||||||
|
check("ساهدية", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testHSuffix() throws IOException {
|
||||||
|
check("ساهده", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPSuffix() throws IOException {
|
||||||
|
check("ساهدة", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testYSuffix() throws IOException {
|
||||||
|
check("ساهدي", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testComboPrefSuf() throws IOException {
|
||||||
|
check("وساهدون", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testComboSuf() throws IOException {
|
||||||
|
check("ساهدهات", "ساهد");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testShouldntStem() throws IOException {
|
||||||
|
check("الو", "الو");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNonArabic() throws IOException {
|
||||||
|
check("English", "English");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void check(final String input, final String expected) throws IOException {
|
||||||
|
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
|
||||||
|
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
|
||||||
|
final Token reusableToken = new Token();
|
||||||
|
Token nextToken = filter.next(reusableToken);
|
||||||
|
if (nextToken == null)
|
||||||
|
fail();
|
||||||
|
assertEquals(expected, nextToken.term());
|
||||||
|
filter.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -56,6 +56,31 @@ public class WordlistLoader {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
|
||||||
|
* leading and trailing whitespace). Every line of the file should contain only
|
||||||
|
* one word. The words need to be in lowercase if you make use of an
|
||||||
|
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
|
||||||
|
*
|
||||||
|
* @param wordfile File containing the wordlist
|
||||||
|
* @param comment The comment string to ignore
|
||||||
|
* @return A HashSet with the file's words
|
||||||
|
*/
|
||||||
|
public static HashSet getWordSet(File wordfile, String comment) throws IOException {
|
||||||
|
HashSet result = new HashSet();
|
||||||
|
FileReader reader = null;
|
||||||
|
try {
|
||||||
|
reader = new FileReader(wordfile);
|
||||||
|
result = getWordSet(reader, comment);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
if (reader != null)
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
|
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
|
||||||
* leading and trailing whitespace). Every line of the Reader should contain only
|
* leading and trailing whitespace). Every line of the Reader should contain only
|
||||||
|
@ -86,6 +111,41 @@ public class WordlistLoader {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
|
||||||
|
* leading and trailing whitespace). Every line of the Reader should contain only
|
||||||
|
* one word. The words need to be in lowercase if you make use of an
|
||||||
|
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
|
||||||
|
*
|
||||||
|
* @param reader Reader containing the wordlist
|
||||||
|
* @param comment The string representing a comment.
|
||||||
|
* @return A HashSet with the reader's words
|
||||||
|
*/
|
||||||
|
public static HashSet getWordSet(Reader reader, String comment) throws IOException {
|
||||||
|
HashSet result = new HashSet();
|
||||||
|
BufferedReader br = null;
|
||||||
|
try {
|
||||||
|
if (reader instanceof BufferedReader) {
|
||||||
|
br = (BufferedReader) reader;
|
||||||
|
} else {
|
||||||
|
br = new BufferedReader(reader);
|
||||||
|
}
|
||||||
|
String word = null;
|
||||||
|
while ((word = br.readLine()) != null) {
|
||||||
|
if (word.startsWith(comment) == false){
|
||||||
|
result.add(word.trim());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
if (br != null)
|
||||||
|
br.close();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a stem dictionary. Each line contains:
|
* Reads a stem dictionary. Each line contains:
|
||||||
* <pre>word<b>\t</b>stem</pre>
|
* <pre>word<b>\t</b>stem</pre>
|
||||||
|
|
|
@ -35,7 +35,16 @@ public class TestWordlistLoader extends LuceneTestCase {
|
||||||
HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
|
HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
|
||||||
checkSet(wordSet2);
|
checkSet(wordSet2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testComments() throws Exception {
|
||||||
|
String s = "ONE\n two \nthree\n#comment";
|
||||||
|
HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
|
||||||
|
checkSet(wordSet1);
|
||||||
|
assertFalse(wordSet1.contains("#comment"));
|
||||||
|
assertFalse(wordSet1.contains("comment"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private void checkSet(HashSet wordset) {
|
private void checkSet(HashSet wordset) {
|
||||||
assertEquals(3, wordset.size());
|
assertEquals(3, wordset.size());
|
||||||
assertTrue(wordset.contains("ONE")); // case is not modified
|
assertTrue(wordset.contains("ONE")); // case is not modified
|
||||||
|
|
Loading…
Reference in New Issue