diff --git a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 9782c52ee70..386987d269b 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -17,12 +17,14 @@ package org.apache.lucene.analysis.de; */ import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.File; +import java.io.IOException; import java.io.Reader; import java.io.IOException; import java.util.HashSet; @@ -93,7 +95,7 @@ public class GermanAnalyzer extends Analyzer { * Builds an analyzer with the given stop words. */ public GermanAnalyzer(File stopwords) throws IOException { - stopSet = new HashSet(WordlistLoader.getWordtable(stopwords).keySet()); + stopSet = WordlistLoader.getWordSet(stopwords); } /** @@ -114,19 +116,19 @@ public class GermanAnalyzer extends Analyzer { * Builds an exclusionlist from the words contained in the given file. */ public void setStemExclusionTable(File exclusionlist) throws IOException { - exclusionSet = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet()); + exclusionSet = WordlistLoader.getWordSet(exclusionlist); } /** * Creates a TokenStream which tokenizes all the text in the provided Reader. * * @return A TokenStream build from a StandardTokenizer filtered with - * StandardFilter, StopFilter, GermanStemFilter + * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter */ public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); -// shouldn't there be a lowercaser before stop word filtering? + result = new LowerCaseFilter(result); result = new StopFilter(result, stopSet); result = new GermanStemFilter(result, exclusionSet); return result; diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java index 8e9e171f78f..df05b74ecc0 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java +++ b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java @@ -31,11 +31,6 @@ public class GermanStemmer */ private StringBuffer sb = new StringBuffer(); - /** - * Indicates if a term is handled as a noun. - */ - private boolean uppercase = false; - /** * Amount of characters that are removed with substitute() while stemming. */ @@ -49,8 +44,6 @@ public class GermanStemmer */ protected String stem( String term ) { - // Mark a possible noun. - uppercase = Character.isUpperCase( term.charAt( 0 ) ); // Use lowercase for medium stemming. term = term.toLowerCase(); if ( !isStemmable( term ) ) @@ -115,7 +108,7 @@ public class GermanStemmer buffer.deleteCharAt( buffer.length() - 1 ); } // "t" occurs only as suffix of verbs. - else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) { + else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) { buffer.deleteCharAt( buffer.length() - 1 ); } else { diff --git a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java index a39ae1e6066..0381f86c0b6 100644 --- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java +++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java @@ -20,66 +20,42 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; +import java.util.HashSet; import java.util.Hashtable; +import java.util.Iterator; /** - * Loads a text file and adds every line as an entry to a Hashtable. Every line - * should contain only one word. + * Loader for text files that represent a list of stopwords. * * @author Gerhard Schwarz * @version $Id$ - * @todo refactor to convert to Sets instead of Hashtable + * + * @todo this is not specific to German, it should be moved up */ public class WordlistLoader { - /** - * @param path Path to the wordlist - * @param wordfile Name of the wordlist - */ - public static Hashtable getWordtable(String path, String wordfile) throws IOException { - if (path == null || wordfile == null) { - return new Hashtable(); - } - return getWordtable(new File(path, wordfile)); - } - - /** - * @param wordfile Complete path to the wordlist - */ - public static Hashtable getWordtable(String wordfile) throws IOException { - if (wordfile == null) { - return new Hashtable(); - } - return getWordtable(new File(wordfile)); - } /** + * Loads a text file and adds every line as an entry to a HashSet (omitting + * leading and trailing whitespace). Every line of the file should contain only + * one word. The words need to be in lowercase if you make use of an + * Analyzer which uses LowerCaseFilter (like GermanAnalyzer). + * * @param wordfile File containing the wordlist - * @todo Create a Set version of this method + * @return A HashSet with the file's words */ - public static Hashtable getWordtable(File wordfile) throws IOException { - if (wordfile == null) { - return new Hashtable(); - } - Hashtable result = null; + public static HashSet getWordSet(File wordfile) throws IOException { + HashSet result = new HashSet(); FileReader freader = null; LineNumberReader lnr = null; try { freader = new FileReader(wordfile); lnr = new LineNumberReader(freader); String word = null; - String[] stopwords = new String[100]; - int wordcount = 0; while ((word = lnr.readLine()) != null) { - wordcount++; - if (wordcount == stopwords.length) { - String[] tmp = new String[stopwords.length + 50]; - System.arraycopy(stopwords, 0, tmp, 0, wordcount); - stopwords = tmp; + result.add(word.trim()); } - stopwords[wordcount - 1] = word; } - result = makeWordTable(stopwords, wordcount); - } finally { + finally { if (lnr != null) lnr.close(); if (freader != null) @@ -89,15 +65,46 @@ public class WordlistLoader { } /** - * Builds the wordlist table. - * - * @param words Word that where read - * @param length Amount of words that where read into words + * @param path Path to the wordlist + * @param wordfile Name of the wordlist + * + * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead */ - private static Hashtable makeWordTable(String[] words, int length) { - Hashtable table = new Hashtable(length); - for (int i = 0; i < length; i++) { - table.put(words[i], words[i]); + public static Hashtable getWordtable(String path, String wordfile) throws IOException { + return getWordtable(new File(path, wordfile)); + } + + /** + * @param wordfile Complete path to the wordlist + * + * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead + */ + public static Hashtable getWordtable(String wordfile) throws IOException { + return getWordtable(new File(wordfile)); + } + + /** + * @param wordfile File object that points to the wordlist + * + * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead + */ + public static Hashtable getWordtable(File wordfile) throws IOException { + HashSet wordSet = (HashSet)getWordSet(wordfile); + Hashtable result = makeWordTable(wordSet); + return result; + } + + /** + * Builds a wordlist table, using words as both keys and values + * for backward compatibility. + * + * @param wordSet stopword set + */ + private static Hashtable makeWordTable(HashSet wordSet) { + Hashtable table = new Hashtable(); + for (Iterator iter = wordSet.iterator(); iter.hasNext();) { + String word = (String)iter.next(); + table.put(word, word); } return table; }