LUCENE-1962: Cleaned up Persian & Arabic Analyzer. Prevent default stopword list from being loaded more than once.

- replaced chains of if blocks with a single switch statement
- marked private members final where needed
- changed protected members to private final in final classes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@823180 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2009-10-08 13:54:18 +00:00
parent 90fc7e18c7
commit 286cb1f9d2
7 changed files with 130 additions and 50 deletions

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
@ -68,21 +69,51 @@ public final class ArabicAnalyzer extends Analyzer {
* The comment character in the stopwords file. All lines prefixed with this will be ignored
*/
public static final String STOPWORDS_COMMENT = "#";
/**
 * Gives access to the shared default Arabic stop-word set.
 * <p>
 * The set is the one held by {@code DefaultSetHolder}; it is wrapped as an
 * unmodifiable set when it is loaded, so callers cannot alter it.
 *
 * @return the unmodifiable default stop-word set
 */
public static Set<String> getDefaultStopSet() {
  // Touching the holder's field is what triggers the holder class to load.
  final Set<String> defaults = DefaultSetHolder.DEFAULT_STOP_SET;
  return defaults;
}
/**
 * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
 * accesses the static final set for the first time.
 */
private static class DefaultSetHolder {
  /** The default stop-word set, wrapped so that it cannot be modified. */
  static final Set<String> DEFAULT_STOP_SET;

  static {
    try {
      DEFAULT_STOP_SET = loadDefaultStopWordSet();
    } catch (IOException ex) {
      // default set should always be present as it is part of the
      // distribution (JAR); keep the cause so a broken distribution can
      // actually be diagnosed from the stack trace
      throw new RuntimeException("Unable to load default stopword set", ex);
    }
  }

  /**
   * Reads the bundled stop-word resource as UTF-8, skipping lines that start
   * with the comment prefix, and wraps the result as an unmodifiable set.
   *
   * @return an unmodifiable set of the default stop words
   * @throws IOException if the resource is missing or cannot be read
   */
  static Set<String> loadDefaultStopWordSet() throws IOException {
    InputStream stream = ArabicAnalyzer.class
        .getResourceAsStream(DEFAULT_STOPWORD_FILE);
    if (stream == null) {
      // getResourceAsStream signals "not found" with null; fail with a clear
      // message instead of a NullPointerException further down
      throw new IOException("Default stopword file not found: "
          + DEFAULT_STOPWORD_FILE);
    }
    try {
      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
      // make sure it is unmodifiable as we expose it in the outer class
      return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
          STOPWORDS_COMMENT));
    } finally {
      stream.close();
    }
  }
}
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public ArabicAnalyzer() {
try {
InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
reader.close();
stream.close();
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
}
/**

View File

@ -44,8 +44,7 @@ public final class ArabicNormalizationFilter extends TokenFilter {
int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
termAtt.setTermLength(newlen);
return true;
} else {
return false;
}
return false;
}
}

View File

@ -63,21 +63,34 @@ public class ArabicNormalizer {
* @return length of input buffer after normalization
*/
public int normalize(char s[], int len) {
for (int i = 0; i < len; i++) {
if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
switch (s[i]) {
case ALEF_MADDA:
case ALEF_HAMZA_ABOVE:
case ALEF_HAMZA_BELOW:
s[i] = ALEF;
if (s[i] == DOTLESS_YEH)
break;
case DOTLESS_YEH:
s[i] = YEH;
if (s[i] == TEH_MARBUTA)
break;
case TEH_MARBUTA:
s[i] = HEH;
if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
break;
case TATWEEL:
case KASRATAN:
case DAMMATAN:
case FATHATAN:
case FATHA:
case DAMMA:
case KASRA:
case SHADDA:
case SUKUN:
len = delete(s, i, len);
i--;
break;
default:
break;
}
}

View File

@ -30,8 +30,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public final class ArabicStemFilter extends TokenFilter {
protected ArabicStemmer stemmer = null;
private TermAttribute termAtt;
private final ArabicStemmer stemmer;
private final TermAttribute termAtt;
public ArabicStemFilter(TokenStream input) {
super(input);

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
@ -58,30 +59,61 @@ public final class PersianAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the StopFilter.
*/
private Set stoptable = new HashSet();
private final Set stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
*/
public static final String STOPWORDS_COMMENT = "#";
/**
 * Gives access to the shared default Persian stop-word set.
 * <p>
 * The set is the one held by {@code DefaultSetHolder}; it is wrapped as an
 * unmodifiable set when it is loaded, so callers cannot alter it.
 *
 * @return the unmodifiable default stop-word set
 */
public static Set<String> getDefaultStopSet() {
  // Touching the holder's field is what triggers the holder class to load.
  final Set<String> defaults = DefaultSetHolder.DEFAULT_STOP_SET;
  return defaults;
}
/**
 * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
 * accesses the static final set for the first time.
 */
private static class DefaultSetHolder {
  /** The default stop-word set, wrapped so that it cannot be modified. */
  static final Set<String> DEFAULT_STOP_SET;

  static {
    try {
      DEFAULT_STOP_SET = loadDefaultStopWordSet();
    } catch (IOException ex) {
      // default set should always be present as it is part of the
      // distribution (JAR); keep the cause so a broken distribution can
      // actually be diagnosed from the stack trace
      throw new RuntimeException("Unable to load default stopword set", ex);
    }
  }

  /**
   * Reads the bundled stop-word resource as UTF-8, skipping lines that start
   * with the comment prefix, and wraps the result as an unmodifiable set.
   *
   * @return an unmodifiable set of the default stop words
   * @throws IOException if the resource is missing or cannot be read
   */
  static Set<String> loadDefaultStopWordSet() throws IOException {
    InputStream stream = PersianAnalyzer.class
        .getResourceAsStream(DEFAULT_STOPWORD_FILE);
    if (stream == null) {
      // getResourceAsStream signals "not found" with null; fail with a clear
      // message instead of a NullPointerException further down
      throw new IOException("Default stopword file not found: "
          + DEFAULT_STOPWORD_FILE);
    }
    try {
      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
      // make sure it is unmodifiable as we expose it in the outer class
      return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
          STOPWORDS_COMMENT));
    } finally {
      stream.close();
    }
  }
}
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
*/
public PersianAnalyzer() {
try {
InputStream stream = PersianAnalyzer.class
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
reader.close();
stream.close();
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
@ -125,7 +157,7 @@ public final class PersianAnalyzer extends Analyzer {
* the order here is important: the stopword list is normalized with the
* above!
*/
result = new StopFilter(result, stoptable);
result = new StopFilter(false, result, stoptable);
return result;
}
@ -158,7 +190,7 @@ public final class PersianAnalyzer extends Analyzer {
* the order here is important: the stopword list is normalized with the
* above!
*/
streams.result = new StopFilter(streams.result, stoptable);
streams.result = new StopFilter(false, streams.result, stoptable);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);

View File

@ -32,7 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public final class PersianNormalizationFilter extends TokenFilter {
private final PersianNormalizer normalizer;
private TermAttribute termAtt;
private final TermAttribute termAtt;
public PersianNormalizationFilter(TokenStream input) {
super(input);
@ -42,12 +42,11 @@ public final class PersianNormalizationFilter extends TokenFilter {
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
final int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
.termLength());
termAtt.setTermLength(newlen);
return true;
} else {
return false;
}
}
return false;
}
}

View File

@ -59,18 +59,24 @@ public class PersianNormalizer {
public int normalize(char s[], int len) {
for (int i = 0; i < len; i++) {
if (s[i] == FARSI_YEH || s[i] == YEH_BARREE)
switch (s[i]) {
case FARSI_YEH:
case YEH_BARREE:
s[i] = YEH;
if (s[i] == KEHEH)
break;
case KEHEH:
s[i] = KAF;
if (s[i] == HEH_YEH || s[i] == HEH_GOAL)
break;
case HEH_YEH:
case HEH_GOAL:
s[i] = HEH;
if (s[i] == HAMZA_ABOVE) { // necessary for HEH + HAMZA
break;
case HAMZA_ABOVE: // necessary for HEH + HAMZA
len = delete(s, i, len);
i--;
break;
default:
break;
}
}
@ -88,7 +94,7 @@ public class PersianNormalizer {
/**
 * Removes the character at {@code pos} by shifting the remainder of the
 * buffer one slot to the left.
 *
 * @param s   buffer to edit in place
 * @param pos index of the character to drop
 * @param len number of valid characters in {@code s}
 * @return {@code len - 1}, whether or not anything was shifted
 */
protected int delete(char s[], int pos, int len) {
  final int shortened = len - 1;
  if (pos >= len) {
    // nothing to shift, but the reported length still shrinks by one
    return shortened;
  }
  System.arraycopy(s, pos + 1, s, pos, shortened - pos);
  return shortened;
}