diff --git a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 3df11f65a03..8d5146b129b 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -74,94 +74,104 @@ import java.util.Hashtable; * @author Gerhard Schwarz * @version $Id$ */ -public class GermanAnalyzer extends Analyzer { +public class GermanAnalyzer extends Analyzer +{ + /** + * List of typical german stopwords. + */ + private String[] GERMAN_STOP_WORDS = { + "einer", "eine", "eines", "einem", "einen", + "der", "die", "das", "dass", "daß", + "du", "er", "sie", "es", + "was", "wer", "wie", "wir", + "und", "oder", "ohne", "mit", + "am", "im", "in", "aus", "auf", + "ist", "sein", "war", "wird", + "ihr", "ihre", "ihres", + "als", "für", "von", "mit", + "dich", "dir", "mich", "mir", + "mein", "sein", "kein", + "durch", "wegen", "wird" + }; - /** - * List of typical german stopwords. - */ - private String[] GERMAN_STOP_WORDS = { - "einer", "eine", "eines", "einem", "einen", - "der", "die", "das", "dass", "daß", - "du", "er", "sie", "es", - "was", "wer", "wie", "wir", - "und", "oder", "ohne", "mit", - "am", "im", "in", "aus", "auf", - "ist", "sein", "war", "wird", - "ihr", "ihre", "ihres", - "als", "für", "von", "mit", - "dich", "dir", "mich", "mir", - "mein", "sein", "kein", - "durch", "wegen", "wird" - }; - - /** - * Contains the stopwords used with the StopFilter. - */ - private Hashtable stoptable = new Hashtable(); - /** - * Contains words that should be indexed but not stemmed. - */ - private Hashtable excltable = new Hashtable(); - - /** - * Builds an analyzer. - */ - public GermanAnalyzer() { - stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS ); - } + /** + * Contains the stopwords used with the StopFilter. + */ + private Hashtable stoptable = new Hashtable(); - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( String[] stopwords ) { - stoptable = StopFilter.makeStopTable( stopwords ); - } + /** + * Contains words that should be indexed but not stemmed. + */ + private Hashtable excltable = new Hashtable(); - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( Hashtable stopwords ) { - stoptable = stopwords; - } + /** + * Builds an analyzer. + */ + public GermanAnalyzer() + { + stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS ); + } - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( File stopwords ) { - stoptable = WordlistLoader.getWordtable( stopwords ); - } + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer( String[] stopwords ) + { + stoptable = StopFilter.makeStopTable( stopwords ); + } - /** - * Builds an exclusionlist from an array of Strings. - */ - public void setStemExclusionTable( String[] exclusionlist ) { - excltable = StopFilter.makeStopTable( exclusionlist ); - } - /** - * Builds an exclusionlist from a Hashtable. - */ - public void setStemExclusionTable( Hashtable exclusionlist ) { - excltable = exclusionlist; - } - /** - * Builds an exclusionlist from the words contained in the given file. - */ - public void setStemExclusionTable( File exclusionlist ) { - excltable = WordlistLoader.getWordtable( exclusionlist ); - } - - /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. - * - * @return A TokenStream build from a StandardTokenizer filtered with - * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter - */ - public TokenStream tokenStream( String fieldName, Reader reader ) { - TokenStream result = new StandardTokenizer( reader ); - result = new StandardFilter( result ); - result = new StopFilter( result, stoptable ); - result = new GermanStemFilter( result, excltable ); - return result; - } + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer( Hashtable stopwords ) + { + stoptable = stopwords; + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer( File stopwords ) + { + stoptable = WordlistLoader.getWordtable( stopwords ); + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable( String[] exclusionlist ) + { + excltable = StopFilter.makeStopTable( exclusionlist ); + } + + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable( Hashtable exclusionlist ) + { + excltable = exclusionlist; + } + + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable( File exclusionlist ) + { + excltable = WordlistLoader.getWordtable( exclusionlist ); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter + */ + public TokenStream tokenStream( String fieldName, Reader reader ) + { + TokenStream result = new StandardTokenizer( reader ); + result = new StandardFilter( result ); + result = new StopFilter( result, stoptable ); + result = new GermanStemFilter( result, excltable ); + return result; + } } - diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java index 1b421d62e71..2dd78ebcf1e 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java +++ b/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java @@ -68,62 +68,69 @@ import java.util.Hashtable; * @author Gerhard Schwarz * @version $Id$ */ -public final class GermanStemFilter extends TokenFilter { +public final class GermanStemFilter extends TokenFilter +{ + /** + * The actual token in the input stream. + */ + private Token token = null; + private GermanStemmer stemmer = null; + private Hashtable exclusions = null; + + public GermanStemFilter( TokenStream in ) + { + stemmer = new GermanStemmer(); + input = in; + } + + /** + * Builds a GermanStemFilter that uses an exclusiontable. + */ + public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) + { + this( in ); + exclusions = exclusiontable; + } + + /** + * @return Returns the next token in the stream, or null at EOS + */ + public final Token next() + throws IOException + { + if ( ( token = input.next() ) == null ) { + return null; + } + // Check the exclusiontable + else if ( exclusions != null && exclusions.contains( token.termText() ) ) { + return token; + } + else { + String s = stemmer.stem( token.termText() ); + // If not stemmed, dont waste the time creating a new token + if ( !s.equals( token.termText() ) ) { + return new Token( s, token.startOffset(), + token.endOffset(), token.type() ); + } + return token; + } + } - /** - * The actual token in the input stream. - */ - private Token token = null; - private GermanStemmer stemmer = null; - private Hashtable exclusions = null; - - public GermanStemFilter( TokenStream in ) { - stemmer = new GermanStemmer(); - input = in; - } - - /** - * Builds a GermanStemFilter that uses an exclusiontable. - */ - public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) { - this( in ); - exclusions = exclusiontable; + /** + * Set a alternative/custom GermanStemmer for this filter. + */ + public void setStemmer( GermanStemmer stemmer ) + { + if ( stemmer != null ) { + this.stemmer = stemmer; } + } - /** - * @return Returns the next token in the stream, or null at EOS - */ - public final Token next() - throws IOException { - if ( ( token = input.next() ) == null ) { - return null; - } - // Check the exclusiontable - else if ( exclusions != null && exclusions.contains( token.termText() ) ) { - return token; - } - else { - String s = stemmer.stem( token.termText() ); - // If not stemmed, dont waste the time creating a new token - if ( !s.equals( token.termText() ) ) { - return new Token( s, token.startOffset(), - token.endOffset(), token.type() ); - } - return token; - } - } - /** - * Set a alternative/custom GermanStemmer for this filter. - */ - public void setStemmer( GermanStemmer stemmer ) { - if ( stemmer != null ) { - this.stemmer = stemmer; - } - } - /** - * Set an alternative exclusion list for this filter. - */ - public void setExclusionTable( Hashtable exclusiontable ) { - exclusions = exclusiontable; - } + /** + * Set an alternative exclusion list for this filter. + */ + public void setExclusionTable( Hashtable exclusiontable ) + { + exclusions = exclusiontable; + } } diff --git a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java index b9bdb969ebd..3260aee3404 100644 --- a/src/java/org/apache/lucene/analysis/de/GermanStemmer.java +++ b/src/java/org/apache/lucene/analysis/de/GermanStemmer.java @@ -62,17 +62,18 @@ package org.apache.lucene.analysis.de; * @author Gerhard Schwarz * @version $Id$ */ - -public class GermanStemmer { - +public class GermanStemmer +{ /** * Buffer for the terms while stemming them. */ private StringBuffer sb = new StringBuffer(); - /** - * Indicates if a term is handled as a noun. + + /** + * Indicates if a term is handled as a noun. */ private boolean uppercase = false; + /** * Amount of characters that are removed with substitute() while stemming. */ @@ -84,22 +85,24 @@ public class GermanStemmer { * @param term The term that should be stemmed. * @return Discriminator for term */ - protected String stem( String term ) { - // Mark a possible noun. - uppercase = Character.isUpperCase( term.charAt( 0 ) ); - // Use lowercase for medium stemming. - term = term.toLowerCase(); - if ( !isStemmable( term ) ) return term; - // Reset the StringBuffer. - sb.delete( 0, sb.length() ); - sb.insert( 0, term ); - // Stemming starts here... - substitute( sb ); - strip( sb ); - optimize( sb ); - resubstitute( sb ); - removeParticleDenotion( sb ); - return sb.toString(); + protected String stem( String term ) + { + // Mark a possible noun. + uppercase = Character.isUpperCase( term.charAt( 0 ) ); + // Use lowercase for medium stemming. + term = term.toLowerCase(); + if ( !isStemmable( term ) ) + return term; + // Reset the StringBuffer. + sb.delete( 0, sb.length() ); + sb.insert( 0, term ); + // Stemming starts here... + substitute( sb ); + strip( sb ); + optimize( sb ); + resubstitute( sb ); + removeParticleDenotion( sb ); + return sb.toString(); } /** @@ -107,82 +110,90 @@ public class GermanStemmer { * * @return true if, and only if, the given term consists in letters. */ - private boolean isStemmable( String term ) { - for ( int c = 0; c < term.length(); c++ ) { - if ( !Character.isLetter( term.charAt( c ) ) ) return false; - } - return true; + private boolean isStemmable( String term ) + { + for ( int c = 0; c < term.length(); c++ ) { + if ( !Character.isLetter( term.charAt( c ) ) ) return false; + } + return true; } - /** - * suffix stripping (stemming) on the current term. The stripping is reduced - * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd", - * from which all regular suffixes are build of. The simplification causes - * some overstemming, and way more irregular stems, but still provides unique. - * discriminators in the most of those cases. - * The algorithm is context free, except of the length restrictions. - */ - private void strip( StringBuffer buffer ) { - boolean doMore = true; - while ( doMore && buffer.length() > 3 ) { - if ( ( buffer.length() + substCount > 5 ) && buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) ) { - buffer.delete( buffer.length() - 2, buffer.length() ); - } - else if ( ( buffer.length() + substCount > 4 ) && buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) { - buffer.delete( buffer.length() - 2, buffer.length() ); - } - else if ( ( buffer.length() + substCount > 4 ) && buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) { - buffer.delete( buffer.length() - 2, buffer.length() ); - } - else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - // "t" occurs only as suffix of verbs. - else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - else { - doMore = false; - } - } + /** + * suffix stripping (stemming) on the current term. The stripping is reduced + * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd", + * from which all regular suffixes are build of. The simplification causes + * some overstemming, and way more irregular stems, but still provides unique. + * discriminators in the most of those cases. + * The algorithm is context free, except of the length restrictions. + */ + private void strip( StringBuffer buffer ) + { + boolean doMore = true; + while ( doMore && buffer.length() > 3 ) { + if ( ( buffer.length() + substCount > 5 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) ) + { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + // "t" occurs only as suffix of verbs. + else if ( buffer.charAt( buffer.length() - 1 ) == 't' && !uppercase ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else { + doMore = false; + } } + } - /** - * Does some optimizations on the term. This optimisations are - * contextual. - * - * @return The term with the optimizations applied. - */ - private void optimize( StringBuffer buffer ) { - // Additional step for female plurals of professions and inhabitants. - if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) { - buffer.deleteCharAt( buffer.length() -1 ); - strip( buffer ); - } - // Additional step for irregular plural nouns like "Matrizen -> Matrix". - if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) { - buffer.setCharAt( buffer.length() - 1, 'x' ); - } + /** + * Does some optimizations on the term. This optimisations are + * contextual. + * + * @return The term with the optimizations applied. + */ + private void optimize( StringBuffer buffer ) + { + // Additional step for female plurals of professions and inhabitants. + if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) { + buffer.deleteCharAt( buffer.length() -1 ); + strip( buffer ); } + // Additional step for irregular plural nouns like "Matrizen -> Matrix". + if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) { + buffer.setCharAt( buffer.length() - 1, 'x' ); + } + } /** * Removes a particle denotion ("ge") from a term. */ - private void removeParticleDenotion( StringBuffer buffer ) { - if ( buffer.length() > 4 ) { - for ( int c = 0; c < buffer.length() - 3; c++ ) { - if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) { - buffer.delete( c, c + 2 ); - return; - } - } + private void removeParticleDenotion( StringBuffer buffer ) + { + if ( buffer.length() > 4 ) { + for ( int c = 0; c < buffer.length() - 3; c++ ) { + if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) { + buffer.delete( c, c + 2 ); + return; } + } + } } /** @@ -195,63 +206,66 @@ public class GermanStemmer { * - Substitute some common character combinations with a token: * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! */ - private void substitute( StringBuffer buffer ) { - substCount = 0; - for ( int c = 0; c < buffer.length(); c++ ) { - // Replace the second char of a pair of the equal characters with an asterisk - if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { - buffer.setCharAt( c, '*' ); - } - // Substitute Umlauts. - else if ( buffer.charAt( c ) == 'ä' ) { - buffer.setCharAt( c, 'a' ); - } - else if ( buffer.charAt( c ) == 'ö' ) { - buffer.setCharAt( c, 'o' ); - } - else if ( buffer.charAt( c ) == 'ü' ) { - buffer.setCharAt( c, 'u' ); - } - // Take care that at least one character is left left side from the current one - if ( c < buffer.length() - 1 ) { - if ( buffer.charAt( c ) == 'ß' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, 's' ); - substCount++; - } - // Masking several common character combinations with an token - else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) { - buffer.setCharAt( c, '$' ); - buffer.delete( c + 1, c + 3 ); - substCount =+ 2; - } - else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { - buffer.setCharAt( c, '§' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { - buffer.setCharAt( c, '%' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { - buffer.setCharAt( c, '&' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { - buffer.setCharAt( c, '#' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { - buffer.setCharAt( c, '!' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - } + private void substitute( StringBuffer buffer ) + { + substCount = 0; + for ( int c = 0; c < buffer.length(); c++ ) { + // Replace the second char of a pair of the equal characters with an asterisk + if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { + buffer.setCharAt( c, '*' ); + } + // Substitute Umlauts. + else if ( buffer.charAt( c ) == 'ä' ) { + buffer.setCharAt( c, 'a' ); + } + else if ( buffer.charAt( c ) == 'ö' ) { + buffer.setCharAt( c, 'o' ); + } + else if ( buffer.charAt( c ) == 'ü' ) { + buffer.setCharAt( c, 'u' ); + } + // Take care that at least one character is left left side from the current one + if ( c < buffer.length() - 1 ) { + if ( buffer.charAt( c ) == 'ß' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 's' ); + substCount++; } + // Masking several common character combinations with an token + else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && + buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) + { + buffer.setCharAt( c, '$' ); + buffer.delete( c + 1, c + 3 ); + substCount =+ 2; + } + else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { + buffer.setCharAt( c, '§' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { + buffer.setCharAt( c, '%' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { + buffer.setCharAt( c, '&' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { + buffer.setCharAt( c, '#' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { + buffer.setCharAt( c, '!' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + } + } } /** @@ -259,36 +273,37 @@ public class GermanStemmer { * character combinations. Umlauts will remain as their corresponding vowel, * as "ß" remains as "ss". */ - private void resubstitute( StringBuffer buffer ) { - for ( int c = 0; c < buffer.length(); c++ ) { - if ( buffer.charAt( c ) == '*' ) { - char x = buffer.charAt( c - 1 ); - buffer.setCharAt( c, x ); - } - else if ( buffer.charAt( c ) == '$' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); - } - else if ( buffer.charAt( c ) == '§' ) { - buffer.setCharAt( c, 'c' ); - buffer.insert( c + 1, 'h' ); - } - else if ( buffer.charAt( c ) == '%' ) { - buffer.setCharAt( c, 'e' ); - buffer.insert( c + 1, 'i' ); - } - else if ( buffer.charAt( c ) == '&' ) { - buffer.setCharAt( c, 'i' ); - buffer.insert( c + 1, 'e' ); - } - else if ( buffer.charAt( c ) == '#' ) { - buffer.setCharAt( c, 'i' ); - buffer.insert( c + 1, 'g' ); - } - else if ( buffer.charAt( c ) == '!' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, 't' ); - } - } + private void resubstitute( StringBuffer buffer ) + { + for ( int c = 0; c < buffer.length(); c++ ) { + if ( buffer.charAt( c ) == '*' ) { + char x = buffer.charAt( c - 1 ); + buffer.setCharAt( c, x ); + } + else if ( buffer.charAt( c ) == '$' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); + } + else if ( buffer.charAt( c ) == '§' ) { + buffer.setCharAt( c, 'c' ); + buffer.insert( c + 1, 'h' ); + } + else if ( buffer.charAt( c ) == '%' ) { + buffer.setCharAt( c, 'e' ); + buffer.insert( c + 1, 'i' ); + } + else if ( buffer.charAt( c ) == '&' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'e' ); + } + else if ( buffer.charAt( c ) == '#' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'g' ); + } + else if ( buffer.charAt( c ) == '!' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 't' ); + } + } } } diff --git a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java index 78f43638057..4b20b39fd16 100644 --- a/src/java/org/apache/lucene/analysis/de/WordlistLoader.java +++ b/src/java/org/apache/lucene/analysis/de/WordlistLoader.java @@ -68,71 +68,71 @@ import java.util.Hashtable; * @author Gerhard Schwarz * @version $Id$ */ -public class WordlistLoader { +public class WordlistLoader +{ + /** + * @param path Path to the wordlist + * @param wordfile Name of the wordlist + */ + public static Hashtable getWordtable( String path, String wordfile ) { + if ( path == null || wordfile == null ) { + return new Hashtable(); + } + return getWordtable( new File( path, wordfile ) ); + } - /** - * @param path Path to the wordlist - * @param wordfile Name of the wordlist - */ - public static Hashtable getWordtable( String path, String wordfile ) { - if ( path == null || wordfile == null ) { - return new Hashtable(); - } - return getWordtable( new File( path, wordfile ) ); - } - /** - * @param wordfile Complete path to the wordlist - */ - public static Hashtable getWordtable( String wordfile ) { - if ( wordfile == null ) { - return new Hashtable(); - } - return getWordtable( new File( wordfile ) ); + /** + * @param wordfile Complete path to the wordlist + */ + public static Hashtable getWordtable( String wordfile ) { + if ( wordfile == null ) { + return new Hashtable(); } + return getWordtable( new File( wordfile ) ); + } - /** - * @param wordfile File containing the wordlist - */ - public static Hashtable getWordtable( File wordfile ) { - if ( wordfile == null ) { - return new Hashtable(); - } - Hashtable result = null; - try { - LineNumberReader lnr = new LineNumberReader( new FileReader( wordfile ) ); - String word = null; - String[] stopwords = new String[100]; - int wordcount = 0; - while ( ( word = lnr.readLine() ) != null ) { - wordcount++; - if ( wordcount == stopwords.length ) { - String[] tmp = new String[stopwords.length + 50]; - System.arraycopy( stopwords, 0, tmp, 0, wordcount ); - stopwords = tmp; - } - stopwords[wordcount-1] = word; - } - result = makeWordTable( stopwords, wordcount ); - } - // On error, use an empty table - catch ( IOException e ) { - result = new Hashtable(); - } - return result; + /** + * @param wordfile File containing the wordlist + */ + public static Hashtable getWordtable( File wordfile ) { + if ( wordfile == null ) { + return new Hashtable(); } + Hashtable result = null; + try { + LineNumberReader lnr = new LineNumberReader( new FileReader( wordfile ) ); + String word = null; + String[] stopwords = new String[100]; + int wordcount = 0; + while ( ( word = lnr.readLine() ) != null ) { + wordcount++; + if ( wordcount == stopwords.length ) { + String[] tmp = new String[stopwords.length + 50]; + System.arraycopy( stopwords, 0, tmp, 0, wordcount ); + stopwords = tmp; + } + stopwords[wordcount-1] = word; + } + result = makeWordTable( stopwords, wordcount ); + } + // On error, use an empty table + catch ( IOException e ) { + result = new Hashtable(); + } + return result; + } - /** - * Builds the wordlist table. - * - * @param words Word that where read - * @param length Amount of words that where read into words - */ - private static Hashtable makeWordTable( String[] words, int length ) { - Hashtable table = new Hashtable( length ); - for ( int i = 0; i < length; i++ ) { - table.put( words[i], words[i] ); - } - return table; + /** + * Builds the wordlist table. + * + * @param words Word that where read + * @param length Amount of words that where read into words + */ + private static Hashtable makeWordTable( String[] words, int length ) { + Hashtable table = new Hashtable( length ); + for ( int i = 0; i < length; i++ ) { + table.put( words[i], words[i] ); } + return table; + } } -