diff --git a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index d618a79e7e5..a89e1cb8a1f 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -14,6 +14,8 @@ import java.util.Hashtable; * Analyzer for german language. Supports an external list of stopwords (words that * will not be indexed at all) and an external list of exclusions (word that will * not be stemmed, but indexed). + * A default set of stopwords is used unless an other list is specified, the + * exclusionlist is empty by default. * * @author Gerhard Schwarz * @version $Id$ @@ -35,8 +37,8 @@ public final class GermanAnalyzer extends Analyzer { "als", "für", "von", "mit", "dich", "dir", "mich", "mir", "mein", "sein", "kein", - "durch", "wegen" - }; + "durch", "wegen", "wird" + }; /** * Contains the stopwords used with the StopFilter. @@ -98,9 +100,9 @@ public final class GermanAnalyzer extends Analyzer { * Creates a TokenStream which tokenizes all the text in the provided Reader. * * @return A TokenStream build from a StandardTokenizer filtered with - * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter. + * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter */ - public final TokenStream tokenStream(String fieldName, Reader reader) { + public final TokenStream tokenStream( String fieldName, Reader reader ) { TokenStream result = new StandardTokenizer( reader ); result = new StandardFilter( result ); result = new StopFilter( result, stoptable ); diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java index 17d61489eae..19bc6fb9f0b 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java +++ b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java @@ -8,7 +8,8 @@ import java.util.Hashtable; /** * A filter that stemms german words. It supports a table of words that should - * not be stemmed at all. + * not be stemmed at all. The used stemmer can be changed at runtime after the + * filter object is created (as long as it is a GermanStemmer). * * @author Gerhard Schwarz * @version $Id$ @@ -32,30 +33,44 @@ public final class GermanStemFilter extends TokenFilter { */ public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) { this( in ); - this.exclusions = exclusions; + exclusions = exclusiontable; } /** - * @return Returns the next token in the stream, or null at EOS. + * @return Returns the next token in the stream, or null at EOS */ public final Token next() throws IOException { if ( ( token = input.next() ) == null ) { return null; } - // Check the exclusiontable. + // Check the exclusiontable else if ( exclusions != null && exclusions.contains( token.termText() ) ) { return token; } else { String s = stemmer.stem( token.termText() ); - // If not stemmed, dont waste the time creating a new token. + // If not stemmed, dont waste the time creating a new token if ( !s.equals( token.termText() ) ) { return new Token( s, 0, s.length(), token.type() ); } return token; } } + /** + * Set a alternative/custom GermanStemmer for this filter. + */ + public void setStemmer( GermanStemmer stemmer ) { + if ( stemmer != null ) { + this.stemmer = stemmer; + } + } + /** + * Set an alternative exclusion list for this filter. + */ + public void setExclusionTable( Hashtable exclusiontable ) { + exclusions = exclusiontable; + } } diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java index f594a39fd4b..cffb3a751ac 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java +++ b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java @@ -30,7 +30,7 @@ public class GermanStemmer { /** * Stemms the given term to an unique discriminator. * - * @param word The term that should be stemmed. + * @param term The term that should be stemmed. * @return Discriminator for term */ protected String stem( String term ) { @@ -41,6 +41,9 @@ public class GermanStemmer { if ( Character.isUpperCase( term.charAt( 0 ) ) ) { uppercase = true; } + else { + uppercase = false; + } // Use lowercase for medium stemming. term = term.toLowerCase(); // Reset the StringBuffer. @@ -79,9 +82,10 @@ public class GermanStemmer { sb.setCharAt( sb.length() - 1, 'x' ); } } - // Check the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" for all + // Strip the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" from all // other terms. Adjectives, Verbs and Adverbs have a total of 52 different - // possible suffixes. + // possible suffixes, stripping only the characters from they are build + // does mostly the same else { // Strip base suffixes as long as enough characters remain. boolean doMore = true; @@ -112,10 +116,10 @@ public class GermanStemmer { } } } + sb = resubstitute( sb ); if ( !uppercase ) { sb = removeParticleDenotion( sb ); } - sb = resubstitute( sb ); return sb.toString(); } @@ -127,8 +131,8 @@ public class GermanStemmer { */ private StringBuffer removeParticleDenotion( StringBuffer buffer ) { for ( int c = 0; c < buffer.length(); c++ ) { - // Strip from the beginning of the string to the "ge" inclusive. - if ( c < ( sb.length() - 3 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) { + // Strip from the beginning of the string to the "ge" inclusive + if ( c < ( buffer.length() - 4 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) { buffer.delete( 0, c + 2 ); } } @@ -140,7 +144,7 @@ public class GermanStemmer { * * - Substitute Umlauts with their corresponding vowel: äöü -> aou, * "ß" is substituted by "ss" - * - Substitute an second char of an pair of equal characters with + * - Substitute a second char of an pair of equal characters with * an asterisk: ?? -> ?* * - Substitute some common character combinations with a token: * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! @@ -149,7 +153,7 @@ public class GermanStemmer { */ private StringBuffer substitute( StringBuffer buffer ) { for ( int c = 0; c < buffer.length(); c++ ) { - // Replace the second char of a pair of the equal characters with an asterisk. + // Replace the second char of a pair of the equal characters with an asterisk if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { buffer.setCharAt( c, '*' ); } @@ -163,14 +167,14 @@ public class GermanStemmer { else if ( buffer.charAt( c ) == 'ü' ) { buffer.setCharAt( c, 'u' ); } - // Take care that enough characters at left for search. + // Take care that at least one character is left left side from the current one if ( c < buffer.length() - 1 ) { if ( buffer.charAt( c ) == 'ß' ) { buffer.setCharAt( c, 's' ); buffer.insert( c + 1, 's' ); substCount++; } - // Masking several common character combinations with an token. + // Masking several common character combinations with an token else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) { buffer.setCharAt( c, '$' ); buffer.delete( c + 1, c + 3 ); @@ -240,10 +244,11 @@ public class GermanStemmer { return true; } /** - * Undoes some changes made by substitute(). That are character pairs and - * character combinations. + * Undoes the changes made by substitute(). That are character pairs and + * character combinations. Umlauts will remain as their corresponding vowel, + * as "ß" remains as "ss". * - * @return The term without the not human reaqdable substitutions. + * @return The term without the not human readable substitutions. */ private StringBuffer resubstitute( StringBuffer buffer ) { for ( int c = 0; c < buffer.length(); c++ ) { diff --git a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java index f55b8650891..ce0e4af510c 100644 --- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java +++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java @@ -7,8 +7,9 @@ import java.io.LineNumberReader; import java.util.Hashtable; /** - * Loads a textfile and adds every entry to a Hashtable. If a file is not found - * or on any error, an empty table is returned. + * Loads a textfile and adds every line as an entry to a Hashtable. Every line + * should contain only one word. If a file is not found or on any error, an + * empty table is returned. * * @author Gerhard Schwarz * @version $Id$ @@ -16,15 +17,14 @@ import java.util.Hashtable; public class WordlistLoader { /** - * @param path Path to the wordlist. - * @param wordfile Name of the wordlist. + * @param path Path to the wordlist + * @param wordfile Name of the wordlist */ public static Hashtable getWordtable( String path, String wordfile ) { if ( path == null || wordfile == null ) { return new Hashtable(); } - File absoluteName = new File( path, wordfile ); - return getWordtable( absoluteName ); + return getWordtable( new File( path, wordfile ) ); } /** * @param wordfile Complete path to the wordlist @@ -33,12 +33,11 @@ public class WordlistLoader { if ( wordfile == null ) { return new Hashtable(); } - File absoluteName = new File( wordfile ); - return getWordtable( absoluteName ); + return getWordtable( new File( wordfile ) ); } /** - * @param wordfile File containing the wordlist. + * @param wordfile File containing the wordlist */ public static Hashtable getWordtable( File wordfile ) { if ( wordfile == null ) { @@ -57,11 +56,11 @@ public class WordlistLoader { System.arraycopy( stopwords, 0, tmp, 0, wordcount ); stopwords = tmp; } - stopwords[wordcount] = word; + stopwords[wordcount-1] = word; } result = makeWordTable( stopwords, wordcount ); } - // On error, use an empty table. + // On error, use an empty table catch ( IOException e ) { result = new Hashtable(); } @@ -71,8 +70,8 @@ public class WordlistLoader { /** * Builds the wordlist table. * - * @param words Word that where read. - * @param length Amount of words that where read into words. + * @param words Word that where read + * @param length Amount of words that where read into words */ private static Hashtable makeWordTable( String[] words, int length ) { Hashtable table = new Hashtable( length );