diff --git a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java new file mode 100644 index 00000000000..d618a79e7e5 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -0,0 +1,113 @@ +package org.apache.lucene.analysis.de; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import java.io.File; +import java.io.Reader; +import java.util.Hashtable; + +/** + * Analyzer for german language. Supports an external list of stopwords (words that + * will not be indexed at all) and an external list of exclusions (word that will + * not be stemmed, but indexed). + * + * @author Gerhard Schwarz + * @version $Id$ + */ +public final class GermanAnalyzer extends Analyzer { + + /** + * List of typical german stopwords. + */ + private String[] GERMAN_STOP_WORDS = { + "einer", "eine", "eines", "einem", "einen", + "der", "die", "das", "dass", "daß", + "du", "er", "sie", "es", + "was", "wer", "wie", "wir", + "und", "oder", "ohne", "mit", + "am", "im", "in", "aus", "auf", + "ist", "sein", "war", "wird", + "ihr", "ihre", "ihres", + "als", "für", "von", "mit", + "dich", "dir", "mich", "mir", + "mein", "sein", "kein", + "durch", "wegen" + }; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Hashtable stoptable = new Hashtable(); + /** + * Contains words that should be indexed but not stemmed. + */ + private Hashtable excltable = new Hashtable(); + + /** + * Builds an analyzer. + */ + public GermanAnalyzer() { + stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS ); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer( String[] stopwords ) { + stoptable = StopFilter.makeStopTable( stopwords ); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer( Hashtable stopwords ) { + stoptable = stopwords; + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer( File stopwords ) { + stoptable = WordlistLoader.getWordtable( stopwords ); + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable( String[] exclusionlist ) { + excltable = StopFilter.makeStopTable( exclusionlist ); + } + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable( Hashtable exclusionlist ) { + excltable = exclusionlist; + } + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable( File exclusionlist ) { + excltable = WordlistLoader.getWordtable( exclusionlist ); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter. + */ + public final TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer( reader ); + result = new StandardFilter( result ); + result = new StopFilter( result, stoptable ); + result = new GermanStemFilter( result, excltable ); + // Convert to lowercase after stemming! + result = new LowerCaseFilter( result ); + return result; + } +} + diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java new file mode 100644 index 00000000000..17d61489eae --- /dev/null +++ b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.de; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; +import java.util.Hashtable; + +/** + * A filter that stemms german words. It supports a table of words that should + * not be stemmed at all. + * + * @author Gerhard Schwarz + * @version $Id$ + */ +public final class GermanStemFilter extends TokenFilter { + + /** + * The actual token in the input stream. + */ + private Token token = null; + private GermanStemmer stemmer = null; + private Hashtable exclusions = null; + + public GermanStemFilter( TokenStream in ) { + stemmer = new GermanStemmer(); + input = in; + } + + /** + * Builds a GermanStemFilter that uses an exclusiontable. + */ + public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) { + this( in ); + this.exclusions = exclusions; + } + + /** + * @return Returns the next token in the stream, or null at EOS. + */ + public final Token next() + throws IOException { + if ( ( token = input.next() ) == null ) { + return null; + } + // Check the exclusiontable. + else if ( exclusions != null && exclusions.contains( token.termText() ) ) { + return token; + } + else { + String s = stemmer.stem( token.termText() ); + // If not stemmed, dont waste the time creating a new token. + if ( !s.equals( token.termText() ) ) { + return new Token( s, 0, s.length(), token.type() ); + } + return token; + } + } +} + + diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java new file mode 100644 index 00000000000..f594a39fd4b --- /dev/null +++ b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java @@ -0,0 +1,282 @@ +package org.apache.lucene.analysis.de; + +/** + * A stemmer for german words. The algorithm is based on the report + * "A Fast and Simple Stemming Algorithm for German Words" by Jörg + * Caumanns (joerg.caumanns@isst.fhg.de). + * + * @author Gerhard Schwarz + * @version $Id$ + */ + +public class GermanStemmer { + + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + /** + * Indicates if a term is handled as a noun. + */ + private boolean uppercase = false; + /** + * Amount of characters that are removed with substitute() while stemming. + */ + private int substCount = 0; + + public GermanStemmer() { + } + + /** + * Stemms the given term to an unique discriminator. + * + * @param word The term that should be stemmed. + * @return Discriminator for term + */ + protected String stem( String term ) { + if ( !isStemmable( term ) ) { + return term; + } + // Mark a possible noun. + if ( Character.isUpperCase( term.charAt( 0 ) ) ) { + uppercase = true; + } + // Use lowercase for medium stemming. + term = term.toLowerCase(); + // Reset the StringBuffer. + sb.delete( 0, sb.length() ); + sb.insert( 0, term ); + sb = substitute( sb ); + // Nouns have only seven possible suffixes. + if ( uppercase && sb.length() > 3 ) { + if ( sb.substring( sb.length() - 3, sb.length() ).equals( "ern" ) ) { + sb.delete( sb.length() - 3, sb.length() ); + } + else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "en" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "es" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.charAt( sb.length() - 1 ) == 'e' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 'n' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 's' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + // Additional step for female plurals of professions and inhabitants. + if ( sb.length() > 5 && sb.substring( sb.length() - 3, sb.length() ).equals( "erin*" ) ) { + sb.deleteCharAt( sb.length() -1 ); + } + // Additional step for irregular plural nouns like "Matrizen -> Matrix". + if ( sb.charAt( sb.length() - 1 ) == ( 'z' ) ) { + sb.setCharAt( sb.length() - 1, 'x' ); + } + } + // Check the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" for all + // other terms. Adjectives, Verbs and Adverbs have a total of 52 different + // possible suffixes. + else { + // Strip base suffixes as long as enough characters remain. + boolean doMore = true; + while ( sb.length() > 3 && doMore ) { + if ( ( sb.length() + substCount > 5 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "nd" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "em" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.charAt( sb.length() - 1 ) == 't' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 'n' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 's' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 'e' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else { + doMore = false; + } + } + } + if ( !uppercase ) { + sb = removeParticleDenotion( sb ); + } + sb = resubstitute( sb ); + return sb.toString(); + } + + /** + * Removes a particle denotion ("ge") from a term, but only if at least 3 + * characters will remain. + * + * @return The term without particle denotion, if there was one. + */ + private StringBuffer removeParticleDenotion( StringBuffer buffer ) { + for ( int c = 0; c < buffer.length(); c++ ) { + // Strip from the beginning of the string to the "ge" inclusive. + if ( c < ( sb.length() - 3 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) { + buffer.delete( 0, c + 2 ); + } + } + return sb; + } + + /** + * Do some substitutions for the term to reduce overstemming: + * + * - Substitute Umlauts with their corresponding vowel: äöü -> aou, + * "ß" is substituted by "ss" + * - Substitute an second char of an pair of equal characters with + * an asterisk: ?? -> ?* + * - Substitute some common character combinations with a token: + * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! + * + * @return The term with all needed substitutions. + */ + private StringBuffer substitute( StringBuffer buffer ) { + for ( int c = 0; c < buffer.length(); c++ ) { + // Replace the second char of a pair of the equal characters with an asterisk. + if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { + buffer.setCharAt( c, '*' ); + } + // Substitute Umlauts. + else if ( buffer.charAt( c ) == 'ä' ) { + buffer.setCharAt( c, 'a' ); + } + else if ( buffer.charAt( c ) == 'ö' ) { + buffer.setCharAt( c, 'o' ); + } + else if ( buffer.charAt( c ) == 'ü' ) { + buffer.setCharAt( c, 'u' ); + } + // Take care that enough characters at left for search. + if ( c < buffer.length() - 1 ) { + if ( buffer.charAt( c ) == 'ß' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 's' ); + substCount++; + } + // Masking several common character combinations with an token. + else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) { + buffer.setCharAt( c, '$' ); + buffer.delete( c + 1, c + 3 ); + substCount =+ 2; + } + else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { + buffer.setCharAt( c, '§' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { + buffer.setCharAt( c, '%' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { + buffer.setCharAt( c, '&' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { + buffer.setCharAt( c, '#' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { + buffer.setCharAt( c, '!' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + } + } + return buffer; + } + + /** + * Checks a term if it can be processed correctly. + * + * @return true if, and only if, the given term consists in letters. + */ + private boolean isStemmable( String term ) { + boolean upper = false; + int first = -1; + for ( int c = 0; c < term.length(); c++ ) { + // Discard terms that contain non-letter characters. + if ( !Character.isLetter( term.charAt( c ) ) ) { + return false; + } + // Discard terms that contain multiple uppercase letters. + if ( Character.isUpperCase( term.charAt( c ) ) ) { + if ( upper ) { + return false; + } + // First encountered uppercase letter, set flag and save + // position. + else { + first = c; + upper = true; + } + } + } + // Discard the term if it contains a single uppercase letter that + // is not starting the term. + if ( first > 0 ) { + return false; + } + return true; + } + /** + * Undoes some changes made by substitute(). That are character pairs and + * character combinations. + * + * @return The term without the not human reaqdable substitutions. + */ + private StringBuffer resubstitute( StringBuffer buffer ) { + for ( int c = 0; c < buffer.length(); c++ ) { + if ( buffer.charAt( c ) == '*' ) { + char x = buffer.charAt( c - 1 ); + buffer.setCharAt( c, x ); + } + else if ( buffer.charAt( c ) == '$' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); + } + else if ( buffer.charAt( c ) == '§' ) { + buffer.setCharAt( c, 'c' ); + buffer.insert( c + 1, 'h' ); + } + else if ( buffer.charAt( c ) == '%' ) { + buffer.setCharAt( c, 'e' ); + buffer.insert( c + 1, 'i' ); + } + else if ( buffer.charAt( c ) == '&' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'e' ); + } + else if ( buffer.charAt( c ) == '#' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'g' ); + } + else if ( buffer.charAt( c ) == '!' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 't' ); + } + } + return buffer; + } +} + diff --git a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java new file mode 100644 index 00000000000..f55b8650891 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java @@ -0,0 +1,85 @@ +package org.apache.lucene.analysis.de; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.LineNumberReader; +import java.util.Hashtable; + +/** + * Loads a textfile and adds every entry to a Hashtable. If a file is not found + * or on any error, an empty table is returned. + * + * @author Gerhard Schwarz + * @version $Id$ + */ +public class WordlistLoader { + + /** + * @param path Path to the wordlist. + * @param wordfile Name of the wordlist. + */ + public static Hashtable getWordtable( String path, String wordfile ) { + if ( path == null || wordfile == null ) { + return new Hashtable(); + } + File absoluteName = new File( path, wordfile ); + return getWordtable( absoluteName ); + } + /** + * @param wordfile Complete path to the wordlist + */ + public static Hashtable getWordtable( String wordfile ) { + if ( wordfile == null ) { + return new Hashtable(); + } + File absoluteName = new File( wordfile ); + return getWordtable( absoluteName ); + } + + /** + * @param wordfile File containing the wordlist. + */ + public static Hashtable getWordtable( File wordfile ) { + if ( wordfile == null ) { + return new Hashtable(); + } + Hashtable result = null; + try { + LineNumberReader lnr = new LineNumberReader( new FileReader( wordfile ) ); + String word = null; + String[] stopwords = new String[100]; + int wordcount = 0; + while ( ( word = lnr.readLine() ) != null ) { + wordcount++; + if ( wordcount == stopwords.length ) { + String[] tmp = new String[stopwords.length + 50]; + System.arraycopy( stopwords, 0, tmp, 0, wordcount ); + stopwords = tmp; + } + stopwords[wordcount] = word; + } + result = makeWordTable( stopwords, wordcount ); + } + // On error, use an empty table. + catch ( IOException e ) { + result = new Hashtable(); + } + return result; + } + + /** + * Builds the wordlist table. + * + * @param words Word that where read. + * @param length Amount of words that where read into words. + */ + private static Hashtable makeWordTable( String[] words, int length ) { + Hashtable table = new Hashtable( length ); + for ( int i = 0; i < length; i++ ) { + table.put( words[i], words[i] ); + } + return table; + } +} +