Mirror of https://github.com/apache/lucene.git
LUCENE-1962: Clean up the Persian and Arabic analyzers. Prevent the default stopword list from being loaded more than once.

- replace chains of if blocks with a single switch
- mark private members final where needed
- change protected members to private final in final classes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823180 13f79535-47bb-0310-9956-ffa450edef68
parent 90fc7e18c7
commit 286cb1f9d2
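The "loaded once" guarantee comes from the initialization-on-demand holder idiom: the JVM defers initializing the nested DefaultSetHolder class until its static field is first touched, and class initialization runs exactly once, under the JVM's own lock, so no explicit synchronization is needed in the analyzer. A minimal, self-contained sketch of the idiom (class name and stopword values here are illustrative, not from the patch):

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

public final class HolderIdiomDemo {

  /** Not initialized until DEFAULT_STOP_SET is first accessed. */
  private static class DefaultSetHolder {
    static final Set<String> DEFAULT_STOP_SET;
    static {
      // Runs exactly once, under the class-initialization lock, so the
      // (potentially expensive) load cannot happen twice.
      DEFAULT_STOP_SET = Collections.unmodifiableSet(
          new HashSet<String>(Arrays.asList("a", "an", "the")));
    }
  }

  public static Set<String> getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  public static void main(String[] args) {
    System.out.println(getDefaultStopSet()); // [a, an, the] (order may vary)
  }
}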
ArabicAnalyzer.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Set;
@@ -68,21 +69,51 @@ public final class ArabicAnalyzer extends Analyzer {
    * The comment character in the stopwords file. All lines prefixed with this will be ignored
    */
   public static final String STOPWORDS_COMMENT = "#";
 
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = ArabicAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
   public ArabicAnalyzer() {
-    try {
-      InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
-      reader.close();
-      stream.close();
-    } catch (IOException e) {
-      // TODO: throw IOException
-      throw new RuntimeException(e);
-    }
+    stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
  }
 
   /**
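With the holder in place, new ArabicAnalyzer() no longer re-reads the stopword file from the JAR on every construction; all instances share one unmodifiable set. A hedged usage sketch against the patched contrib API (the demo class itself is hypothetical):

import java.util.Set;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;

public class SharedStopSetDemo {
  public static void main(String[] args) {
    // Both constructions reuse the already-loaded set; no file I/O here.
    ArabicAnalyzer a = new ArabicAnalyzer();
    ArabicAnalyzer b = new ArabicAnalyzer();
    Set<String> stops = ArabicAnalyzer.getDefaultStopSet();
    System.out.println(stops.size());
    try {
      stops.add("extra"); // the exposed set is unmodifiable
    } catch (UnsupportedOperationException expected) {
      System.out.println("default stop set cannot be mutated by callers");
    }
  }
}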
ArabicNormalizationFilter.java

@@ -44,8 +44,7 @@ public final class ArabicNormalizationFilter extends TokenFilter {
       int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
       termAtt.setTermLength(newlen);
       return true;
-    } else {
-      return false;
     }
+    return false;
   }
 }
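This filter change is purely structural: the redundant else around the failure path becomes an unconditional return false after the if, the guard-clause shape. The same rewrite on a plain, Lucene-free class (illustrative only):

import java.util.Iterator;

final class GuardClauseDemo {
  private final Iterator<String> in;
  private String term;

  GuardClauseDemo(Iterator<String> in) { this.in = in; }

  // Early return replaces the old if/else: the success path exits inside
  // the if, so the failure path needs no else branch.
  boolean incrementToken() {
    if (in.hasNext()) {
      term = in.next();
      return true;
    }
    return false;
  }

  String term() { return term; }
}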
ArabicNormalizer.java

@@ -63,21 +63,34 @@ public class ArabicNormalizer {
    * @return length of input buffer after normalization
    */
   public int normalize(char s[], int len) {
 
     for (int i = 0; i < len; i++) {
-      if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
+      switch (s[i]) {
+      case ALEF_MADDA:
+      case ALEF_HAMZA_ABOVE:
+      case ALEF_HAMZA_BELOW:
         s[i] = ALEF;
-
-      if (s[i] == DOTLESS_YEH)
+        break;
+      case DOTLESS_YEH:
         s[i] = YEH;
-
-      if (s[i] == TEH_MARBUTA)
+        break;
+      case TEH_MARBUTA:
         s[i] = HEH;
-
-      if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
-          s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
+        break;
+      case TATWEEL:
+      case KASRATAN:
+      case DAMMATAN:
+      case FATHATAN:
+      case FATHA:
+      case DAMMA:
+      case KASRA:
+      case SHADDA:
+      case SUKUN:
         len = delete(s, i, len);
         i--;
+        break;
+      default:
+        break;
       }
     }
 
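Stacked case labels with fall-through express the old multi-character || tests directly, and let the compiler emit a jump table instead of a chain of comparisons. A runnable sketch of the same shape (the constant values below match the standard Unicode code points; the real definitions live in ArabicNormalizer):

public class SwitchNormalizeDemo {
  static final char ALEF = '\u0627', ALEF_MADDA = '\u0622',
      ALEF_HAMZA_ABOVE = '\u0623', ALEF_HAMZA_BELOW = '\u0625';

  static int normalize(char[] s, int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        case ALEF_MADDA:        // stacked labels replace the old
        case ALEF_HAMZA_ABOVE:  // s[i] == A || s[i] == B || ... test
        case ALEF_HAMZA_BELOW:
          s[i] = ALEF;
          break;
        default:
          break;
      }
    }
    return len;
  }

  public static void main(String[] args) {
    char[] buf = "\u0622\u0628".toCharArray();
    normalize(buf, buf.length);
    System.out.println(buf[0] == ALEF); // true
  }
}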
ArabicStemFilter.java

@@ -30,8 +30,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 public final class ArabicStemFilter extends TokenFilter {
 
-  protected ArabicStemmer stemmer = null;
-  private TermAttribute termAtt;
+  private final ArabicStemmer stemmer;
+  private final TermAttribute termAtt;
 
   public ArabicStemFilter(TokenStream input) {
     super(input);
PersianAnalyzer.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Set;
@@ -58,30 +59,61 @@ public final class PersianAnalyzer extends Analyzer {
   /**
    * Contains the stopwords used with the StopFilter.
    */
-  private Set stoptable = new HashSet();
+  private final Set stoptable;
 
   /**
    * The comment character in the stopwords file. All lines prefixed with this
    * will be ignored
    */
   public static final String STOPWORDS_COMMENT = "#";
 
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = PersianAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
   public PersianAnalyzer() {
-    try {
-      InputStream stream = PersianAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
-      reader.close();
-      stream.close();
-    } catch (IOException e) {
-      // TODO: throw IOException
-      throw new RuntimeException(e);
-    }
+    stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
   /**
@@ -125,7 +157,7 @@ public final class PersianAnalyzer extends Analyzer {
    * the order here is important: the stopword list is normalized with the
    * above!
    */
-    result = new StopFilter(result, stoptable);
+    result = new StopFilter(false, result, stoptable);
 
     return result;
   }
@@ -158,7 +190,7 @@ public final class PersianAnalyzer extends Analyzer {
    * the order here is important: the stopword list is normalized with the
    * above!
    */
-      streams.result = new StopFilter(streams.result, stoptable);
+      streams.result = new StopFilter(false, streams.result, stoptable);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);
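In both tokenStream and reusableTokenStream, the new leading false targets the StopFilter(boolean enablePositionIncrements, TokenStream in, Set stopWords) constructor introduced in Lucene 2.9; false keeps the pre-2.9 behavior of not recording position holes where stopwords were removed. A sketch of the patched Persian chain (filter order assumed from the surrounding context, since this diff does not show the whole method):

import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;

final class PersianChainSketch {
  static TokenStream chain(Reader reader, Set stoptable) {
    TokenStream result = new ArabicLetterTokenizer(reader);
    result = new LowerCaseFilter(result);
    result = new ArabicNormalizationFilter(result);
    result = new PersianNormalizationFilter(result);
    // New in 2.9: the leading boolean is enablePositionIncrements;
    // 'false' preserves the old no-holes behavior this code relied on.
    result = new StopFilter(false, result, stoptable);
    return result;
  }
}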
PersianNormalizationFilter.java

@@ -32,7 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 public final class PersianNormalizationFilter extends TokenFilter {
 
   private final PersianNormalizer normalizer;
-  private TermAttribute termAtt;
+  private final TermAttribute termAtt;
 
   public PersianNormalizationFilter(TokenStream input) {
     super(input);
@@ -42,12 +42,11 @@ public final class PersianNormalizationFilter extends TokenFilter {
 
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
+      final int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
           .termLength());
       termAtt.setTermLength(newlen);
       return true;
-    } else {
-      return false;
-    }
+    }
+    return false;
   }
 }
PersianNormalizer.java

@@ -59,18 +59,24 @@ public class PersianNormalizer {
   public int normalize(char s[], int len) {
 
     for (int i = 0; i < len; i++) {
-      if (s[i] == FARSI_YEH || s[i] == YEH_BARREE)
+      switch (s[i]) {
+      case FARSI_YEH:
+      case YEH_BARREE:
         s[i] = YEH;
-
-      if (s[i] == KEHEH)
+        break;
+      case KEHEH:
         s[i] = KAF;
-
-      if (s[i] == HEH_YEH || s[i] == HEH_GOAL)
+        break;
+      case HEH_YEH:
+      case HEH_GOAL:
         s[i] = HEH;
-
-      if (s[i] == HAMZA_ABOVE) { // necessary for HEH + HAMZA
+        break;
+      case HAMZA_ABOVE: // necessary for HEH + HAMZA
         len = delete(s, i, len);
         i--;
+        break;
+      default:
+        break;
       }
     }
 
@@ -88,7 +94,7 @@ public class PersianNormalizer {
   protected int delete(char s[], int pos, int len) {
     if (pos < len)
       System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
 
     return len - 1;
   }
 
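For reference, delete(s, pos, len) removes the character at pos by shifting the tail left with System.arraycopy and returning the shrunken logical length; this is why the delete cases in the switch above are followed by i--, so the shifted-in character gets examined on the next pass. A quick worked example (the demo class is hypothetical; the method body is copied from the diff):

public class DeleteDemo {
  static int delete(char s[], int pos, int len) {
    if (pos < len)
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
    return len - 1;
  }

  public static void main(String[] args) {
    char[] buf = "abXcd".toCharArray();
    int len = delete(buf, 2, buf.length);        // drop 'X' at index 2
    System.out.println(new String(buf, 0, len)); // prints "abcd"
  }
}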