diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
index 7c033748756..0f54ee4f6c8 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Set;
@@ -68,21 +69,51 @@ public final class ArabicAnalyzer extends Analyzer {
    * The comment character in the stopwords file. All lines prefixed with this will be ignored
    */
   public static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set loadDefaultStopWordSet() throws IOException {
+      InputStream stream = ArabicAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
 
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
    */
   public ArabicAnalyzer() {
-    try {
-      InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
-      reader.close();
-      stream.close();
-    } catch (IOException e) {
-      // TODO: throw IOException
-      throw new RuntimeException(e);
-    }
+    stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
   /**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
index 2c27cf4f1ab..6c51163a20a 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
@@ -44,8 +44,7 @@ public final class ArabicNormalizationFilter extends TokenFilter {
       int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
       termAtt.setTermLength(newlen);
       return true;
-    } else {
-      return false;
     }
+    return false;
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
index 6693c03b90b..eaf89d259e3 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java
@@ -63,21 +63,34 @@ public class ArabicNormalizer {
    * @return length of input buffer after normalization
    */
   public int normalize(char s[], int len) {
-    
+
     for (int i = 0; i < len; i++) {
-      if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW)
+      switch (s[i]) {
+      case ALEF_MADDA:
+      case ALEF_HAMZA_ABOVE:
+      case ALEF_HAMZA_BELOW:
         s[i] = ALEF;
-
-      if (s[i] == DOTLESS_YEH)
+        break;
+      case DOTLESS_YEH:
         s[i] = YEH;
-
-      if (s[i] == TEH_MARBUTA)
+        break;
+      case TEH_MARBUTA:
         s[i] = HEH;
-
-      if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN ||
-          s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) {
+        break;
+      case TATWEEL:
+      case KASRATAN:
+      case DAMMATAN:
+      case FATHATAN:
+      case FATHA:
+      case DAMMA:
+      case KASRA:
+      case SHADDA:
+      case SUKUN:
         len = delete(s, i, len);
         i--;
+        break;
+      default:
+        break;
       }
     }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
index 75c14b59cc2..c9895695d0c 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
@@ -30,8 +30,8 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 public final class ArabicStemFilter extends TokenFilter {
 
-  protected ArabicStemmer stemmer = null;
-  private TermAttribute termAtt;
+  private final ArabicStemmer stemmer;
+  private final TermAttribute termAtt;
 
   public ArabicStemFilter(TokenStream input) {
     super(input);
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
index 65e463d38ca..743a534752d 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Set;
@@ -58,30 +59,61 @@ public final class PersianAnalyzer extends Analyzer {
   /**
    * Contains the stopwords used with the StopFilter.
    */
-  private Set stoptable = new HashSet();
+  private final Set stoptable;
 
   /**
    * The comment character in the stopwords file. All lines prefixed with this
    * will be ignored
    */
   public static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR)
+        throw new RuntimeException("Unable to load default stopword set");
+      }
+    }
+
+    static Set loadDefaultStopWordSet() throws IOException {
+      InputStream stream = PersianAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close();
+      }
+    }
+  }
+
+
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
    */
   public PersianAnalyzer() {
-    try {
-      InputStream stream = PersianAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-      stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
-      reader.close();
-      stream.close();
-    } catch (IOException e) {
-      // TODO: throw IOException
-      throw new RuntimeException(e);
-    }
+    stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
   /**
@@ -125,7 +157,7 @@ public final class PersianAnalyzer extends Analyzer {
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    result = new StopFilter(result, stoptable);
+    result = new StopFilter(false, result, stoptable);
 
     return result;
   }
@@ -158,7 +190,7 @@ public final class PersianAnalyzer extends Analyzer {
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-      streams.result = new StopFilter(streams.result, stoptable);
+      streams.result = new StopFilter(false, streams.result, stoptable);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
index 0033e80fa47..db0ce041bfc 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
@@ -32,7 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 public final class PersianNormalizationFilter extends TokenFilter {
 
   private final PersianNormalizer normalizer;
-  private TermAttribute termAtt;
+  private final TermAttribute termAtt;
 
   public PersianNormalizationFilter(TokenStream input) {
     super(input);
@@ -42,12 +42,11 @@ public final class PersianNormalizationFilter extends TokenFilter {
 
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
+      final int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
           .termLength());
       termAtt.setTermLength(newlen);
       return true;
-    } else {
-      return false;
-    }
+    }
+    return false;
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
index 53caa6ebbf0..8798b4e3ee0 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
@@ -59,18 +59,24 @@ public class PersianNormalizer {
   public int normalize(char s[], int len) {
 
     for (int i = 0; i < len; i++) {
-      if (s[i] == FARSI_YEH || s[i] == YEH_BARREE)
+      switch (s[i]) {
+      case FARSI_YEH:
+      case YEH_BARREE:
         s[i] = YEH;
-
-      if (s[i] == KEHEH)
+        break;
+      case KEHEH:
         s[i] = KAF;
-
-      if (s[i] == HEH_YEH || s[i] == HEH_GOAL)
+        break;
+      case HEH_YEH:
+      case HEH_GOAL:
         s[i] = HEH;
-
-      if (s[i] == HAMZA_ABOVE) { // necessary for HEH + HAMZA
+        break;
+      case HAMZA_ABOVE: // necessary for HEH + HAMZA
         len = delete(s, i, len);
         i--;
+        break;
+      default:
+        break;
       }
     }
 
@@ -88,7 +94,7 @@ public class PersianNormalizer {
   protected int delete(char s[], int pos, int len) {
     if (pos < len)
       System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
-    
+
     return len - 1;
   }
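The sketch below is not part of the patch; it is a minimal, hypothetical usage example of the getDefaultStopSet() accessor and the lazy DefaultSetHolder introduced above for ArabicAnalyzer and PersianAnalyzer. The class name DefaultStopSetDemo and the printed messages are illustrative only, and the sketch assumes the patched contrib-analyzers classes are on the classpath.

import java.util.Set;

import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;

public class DefaultStopSetDemo {
  public static void main(String[] args) {
    // The first call triggers DefaultSetHolder's static initializer, which
    // reads the stopword file bundled in the JAR once; later calls return
    // the same cached, unmodifiable Set instance.
    Set arabicStops = ArabicAnalyzer.getDefaultStopSet();
    Set persianStops = PersianAnalyzer.getDefaultStopSet();
    System.out.println("Arabic default stopwords:  " + arabicStops.size());
    System.out.println("Persian default stopwords: " + persianStops.size());

    // The sets are wrapped with Collections.unmodifiableSet, so attempts to
    // mutate the shared defaults fail fast.
    try {
      arabicStops.add("should-not-work");
    } catch (UnsupportedOperationException expected) {
      System.out.println("Default stop set is read-only, as intended.");
    }
  }
}

Because the JVM runs a nested class's static initializer at most once, the holder idiom gives lazy, thread-safe loading of the bundled stopword file without explicit synchronization, and the unmodifiable wrapper keeps callers from mutating the shared default set.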