diff --git a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 9782c52ee70..386987d269b 100644
--- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -17,12 +17,13 @@ package org.apache.lucene.analysis.de;
*/
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.Reader;
import java.io.IOException;
import java.util.HashSet;
@@ -93,7 +94,7 @@ public class GermanAnalyzer extends Analyzer {
* Builds an analyzer with the given stop words.
*/
public GermanAnalyzer(File stopwords) throws IOException {
- stopSet = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
+ stopSet = WordlistLoader.getWordSet(stopwords);
}
/**
@@ -114,19 +115,19 @@ public class GermanAnalyzer extends Analyzer {
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
- exclusionSet = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
+ exclusionSet = WordlistLoader.getWordSet(exclusionlist);
}
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
* @return A TokenStream build from a StandardTokenizer filtered with
- * StandardFilter, StopFilter, GermanStemFilter
+ * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
-// shouldn't there be a lowercaser before stop word filtering?
+ result = new LowerCaseFilter(result);
result = new StopFilter(result, stopSet);
result = new GermanStemFilter(result, exclusionSet);
return result;
diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
index 8e9e171f78f..df05b74ecc0 100644
--- a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
@@ -31,11 +31,6 @@ public class GermanStemmer
*/
private StringBuffer sb = new StringBuffer();
- /**
- * Indicates if a term is handled as a noun.
- */
- private boolean uppercase = false;
-
/**
* Amount of characters that are removed with substitute() while stemming.
*/
@@ -49,8 +44,6 @@ public class GermanStemmer
*/
protected String stem( String term )
{
- // Mark a possible noun.
- uppercase = Character.isUpperCase( term.charAt( 0 ) );
// Use lowercase for medium stemming.
term = term.toLowerCase();
if ( !isStemmable( term ) )
@@ -115,7 +108,7 @@ public class GermanStemmer
buffer.deleteCharAt( buffer.length() - 1 );
}
// "t" occurs only as suffix of verbs.
- else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
+ else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
buffer.deleteCharAt( buffer.length() - 1 );
}
else {
diff --git a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
index a39ae1e6066..0381f86c0b6 100644
--- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
+++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
@@ -20,66 +20,42 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
+import java.util.HashSet;
import java.util.Hashtable;
+import java.util.Iterator;
/**
- * Loads a text file and adds every line as an entry to a Hashtable. Every line
- * should contain only one word.
+ * Loader for text files that represent a list of stopwords.
*
* @author Gerhard Schwarz
* @version $Id$
- * @todo refactor to convert to Sets instead of Hashtable
+ *
+ * @todo this is not specific to German, it should be moved up
*/
public class WordlistLoader {
- /**
- * @param path Path to the wordlist
- * @param wordfile Name of the wordlist
- */
- public static Hashtable getWordtable(String path, String wordfile) throws IOException {
- if (path == null || wordfile == null) {
- return new Hashtable();
- }
- return getWordtable(new File(path, wordfile));
- }
-
- /**
- * @param wordfile Complete path to the wordlist
- */
- public static Hashtable getWordtable(String wordfile) throws IOException {
- if (wordfile == null) {
- return new Hashtable();
- }
- return getWordtable(new File(wordfile));
- }
/**
+ * Loads a text file and adds every line as an entry to a HashSet (omitting
+ * leading and trailing whitespace). Every line of the file should contain only
+ * one word. The words need to be in lowercase if you make use of an
+ * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
+ *
* @param wordfile File containing the wordlist
- * @todo Create a Set version of this method
+ * @return A HashSet with the file's words
*/
- public static Hashtable getWordtable(File wordfile) throws IOException {
- if (wordfile == null) {
- return new Hashtable();
- }
- Hashtable result = null;
+ public static HashSet getWordSet(File wordfile) throws IOException {
+ HashSet result = new HashSet();
FileReader freader = null;
LineNumberReader lnr = null;
try {
freader = new FileReader(wordfile);
lnr = new LineNumberReader(freader);
String word = null;
- String[] stopwords = new String[100];
- int wordcount = 0;
while ((word = lnr.readLine()) != null) {
- wordcount++;
- if (wordcount == stopwords.length) {
- String[] tmp = new String[stopwords.length + 50];
- System.arraycopy(stopwords, 0, tmp, 0, wordcount);
- stopwords = tmp;
+ result.add(word.trim());
}
- stopwords[wordcount - 1] = word;
}
- result = makeWordTable(stopwords, wordcount);
- } finally {
+ finally {
if (lnr != null)
lnr.close();
if (freader != null)
@@ -89,15 +65,46 @@ public class WordlistLoader {
}
/**
- * Builds the wordlist table.
- *
- * @param words Word that where read
- * @param length Amount of words that where read into words
+ * @param path Path to the wordlist
+ * @param wordfile Name of the wordlist
+ *
+ * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
*/
- private static Hashtable makeWordTable(String[] words, int length) {
- Hashtable table = new Hashtable(length);
- for (int i = 0; i < length; i++) {
- table.put(words[i], words[i]);
+ public static Hashtable getWordtable(String path, String wordfile) throws IOException {
+ return getWordtable(new File(path, wordfile));
+ }
+
+ /**
+ * @param wordfile Complete path to the wordlist
+ *
+ * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
+ */
+ public static Hashtable getWordtable(String wordfile) throws IOException {
+ return getWordtable(new File(wordfile));
+ }
+
+ /**
+ * @param wordfile File object that points to the wordlist
+ *
+ * @deprecated Use {@link #getWordSet(File) getWordSet(File)} instead
+ */
+ public static Hashtable getWordtable(File wordfile) throws IOException {
+ HashSet wordSet = (HashSet)getWordSet(wordfile);
+ Hashtable result = makeWordTable(wordSet);
+ return result;
+ }
+
+ /**
+ * Builds a wordlist table, using words as both keys and values
+ * for backward compatibility.
+ *
+ * @param wordSet stopword set
+ */
+ private static Hashtable makeWordTable(HashSet wordSet) {
+ Hashtable table = new Hashtable();
+ for (Iterator iter = wordSet.iterator(); iter.hasNext();) {
+ String word = (String)iter.next();
+ table.put(word, word);
}
return table;
}