- Applied a patch for bug 18410: http://issues.apache.org/bugzilla/show_bug.cgi?id=18410

PR: 18410 Obtained from: Submitted by: Daniel Naber Reviewed by: git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150267 13f79535-47bb-0310-9956-ffa450edef68
2004-03-30 15:44:58 +00:00 · 2004-03-30 15:44:58 +00:00 · 25f78d7cc2
parent 16146cbd32
commit 25f78d7cc2
3 changed files with 61 additions and 59 deletions
--- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@ -17,12 +17,14 @@ package org.apache.lucene.analysis.de;
 */

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

 import java.io.File;
+import java.io.IOException;
 import java.io.Reader;
 import java.io.IOException;
 import java.util.HashSet;
@ -93,7 +95,7 @@ public class GermanAnalyzer extends Analyzer {
   * Builds an analyzer with the given stop words.
   */
  public GermanAnalyzer(File stopwords) throws IOException {
-    stopSet = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
+    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /**
@ -114,19 +116,19 @@ public class GermanAnalyzer extends Analyzer {
   * Builds an exclusionlist from the words contained in the given file.
   */
  public void setStemExclusionTable(File exclusionlist) throws IOException {
-    exclusionSet = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
+    exclusionSet = WordlistLoader.getWordSet(exclusionlist);
  }

  /**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   *
   * @return A TokenStream build from a StandardTokenizer filtered with
-   *         StandardFilter, StopFilter, GermanStemFilter
+   *         StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
-// shouldn't there be a lowercaser before stop word filtering?
+    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    result = new GermanStemFilter(result, exclusionSet);
    return result;
--- a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
+++ b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
@ -31,11 +31,6 @@ public class GermanStemmer
     */
    private StringBuffer sb = new StringBuffer();

-    /**
-     * Indicates if a term is handled as a noun.
-     */
-    private boolean uppercase = false;
-
    /**
     * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
     */
@ -49,8 +44,6 @@ public class GermanStemmer
     */
    protected String stem( String term )
    {
-	// Mark a possible noun.
-	uppercase = Character.isUpperCase( term.charAt( 0 ) );
 	// Use lowercase for medium stemming.
 	term = term.toLowerCase();
 	if ( !isStemmable( term ) )
@ -115,7 +108,7 @@ public class GermanStemmer
 		buffer.deleteCharAt( buffer.length() - 1 );
 	    }
 	    // "t" occurs only as suffix of verbs.
-	    else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) {
+	    else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
 		buffer.deleteCharAt( buffer.length() - 1 );
 	    }
 	    else {
--- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
+++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
@ -20,66 +20,42 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.LineNumberReader;
+import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.Iterator;

 /**
- * Loads a text file and adds every line as an entry to a Hashtable. Every line
- * should contain only one word.
+ * Loader for text files that represent a list of stopwords.
 *
 * @author Gerhard Schwarz
 * @version $Id$
- * @todo refactor to convert to Sets instead of Hashtable
+ *
+ * @todo this is not specific to German, it should be moved up
 */
 public class WordlistLoader {
-  /**
-   * @param path     Path to the wordlist
-   * @param wordfile Name of the wordlist
-   */
-  public static Hashtable getWordtable(String path, String wordfile) throws IOException {
-    if (path == null || wordfile == null) {
-      return new Hashtable();
-    }
-    return getWordtable(new File(path, wordfile));
-  }
-
-  /**
-   * @param wordfile Complete path to the wordlist
-   */
-  public static Hashtable getWordtable(String wordfile) throws IOException {
-    if (wordfile == null) {
-      return new Hashtable();
-    }
-    return getWordtable(new File(wordfile));
-  }

  /**
+   * Loads a text file and adds every line as an entry to a HashSet (omitting
+   * leading and trailing whitespace). Every line of the file should contain only 
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
+   * 
   * @param wordfile File containing the wordlist
-   * @todo Create a Set version of this method
+   * @return A HashSet with the file's words
   */
-  public static Hashtable getWordtable(File wordfile) throws IOException {
-    if (wordfile == null) {
-      return new Hashtable();
-    }
-    Hashtable result = null;
+  public static HashSet getWordSet(File wordfile) throws IOException {
+    HashSet result = new HashSet();
    FileReader freader = null;
    LineNumberReader lnr = null;
    try {
      freader = new FileReader(wordfile);
      lnr = new LineNumberReader(freader);
      String word = null;
-      String[] stopwords = new String[100];
-      int wordcount = 0;
      while ((word = lnr.readLine()) != null) {
-        wordcount++;
-        if (wordcount == stopwords.length) {
-          String[] tmp = new String[stopwords.length + 50];
-          System.arraycopy(stopwords, 0, tmp, 0, wordcount);
-          stopwords = tmp;
+        result.add(word.trim());
        }
-        stopwords[wordcount - 1] = word;
      }
-      result = makeWordTable(stopwords, wordcount);
-    } finally {
+    finally {
      if (lnr != null)
        lnr.close();
      if (freader != null)
@ -89,15 +65,46 @@ public class WordlistLoader {
  }

  /**
-   * Builds the wordlist table.
+   * @param path      Path to the wordlist
+   * @param wordfile  Name of the wordlist
   * 
-   * @param words  Word that where read
-   * @param length Amount of words that where read into <tt>words</tt>
+   * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
   */
-  private static Hashtable makeWordTable(String[] words, int length) {
-    Hashtable table = new Hashtable(length);
-    for (int i = 0; i < length; i++) {
-      table.put(words[i], words[i]);
+  public static Hashtable getWordtable(String path, String wordfile) throws IOException {
+    return getWordtable(new File(path, wordfile));
+  }
+
+  /**
+   * @param wordfile  Complete path to the wordlist
+   * 
+   * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
+   */
+  public static Hashtable getWordtable(String wordfile) throws IOException {
+    return getWordtable(new File(wordfile));
+  }
+
+  /**
+   * @param wordfile  File object that points to the wordlist
+   *
+   * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
+   */
+  public static Hashtable getWordtable(File wordfile) throws IOException {
+    HashSet wordSet = (HashSet)getWordSet(wordfile);
+    Hashtable result = makeWordTable(wordSet);
+    return result;
+  }
+
+  /**
+   * Builds a wordlist table, using words as both keys and values
+   * for backward compatibility.
+   *
+   * @param wordSet   stopword set
+   */
+  private static Hashtable makeWordTable(HashSet wordSet) {
+    Hashtable table = new Hashtable();
+    for (Iterator iter = wordSet.iterator(); iter.hasNext();) {
+      String word = (String)iter.next();
+      table.put(word, word);
    }
    return table;
  }