From c4dd210b0f4248097c090f28bbfe699faeb6f144 Mon Sep 17 00:00:00 2001 From: Erik Hatcher Date: Thu, 11 Mar 2004 03:05:36 +0000 Subject: [PATCH] bringing sandbox analyzers up to date with changes to the core StopFilter and migrating away from using Hashtable git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150964 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/LengthFilter.java | 2 +- .../lucene/analysis/br/BrazilianAnalyzer.java | 19 +- .../analysis/br/BrazilianStemFilter.java | 10 +- .../lucene/analysis/cjk/CJKAnalyzer.java | 7 +- .../lucene/analysis/cz/CzechAnalyzer.java | 21 +- .../lucene/analysis/fr/FrenchAnalyzer.java | 22 +- .../lucene/analysis/fr/FrenchStemFilter.java | 12 +- .../lucene/analysis/nl/DutchAnalyzer.java | 237 +++--- .../lucene/analysis/nl/DutchStemFilter.java | 162 ++-- .../lucene/analysis/nl/DutchStemmer.java | 744 ++++++++---------- .../lucene/analysis/nl/WordlistLoader.java | 203 +++-- 11 files changed, 689 insertions(+), 750 deletions(-) diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java index 3400e336f65..ad88d1f4d23 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java @@ -35,7 +35,7 @@ public final class LengthFilter extends TokenFilter { */ public LengthFilter(TokenStream in, int min, int max) { - input = in; + super(in); this.min = min; this.max =max; } diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 9bd24f7d32f..287b0438b9b 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -64,6 +64,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.File; import java.io.Reader; import java.util.Hashtable; +import java.util.HashSet; /** * Analyzer for brazilian language. Supports an external list of stopwords (words that @@ -102,57 +103,57 @@ public final class BrazilianAnalyzer extends Analyzer { /** * Contains the stopwords used with the StopFilter. */ - private Hashtable stoptable = new Hashtable(); + private HashSet stoptable = new HashSet(); /** * Contains words that should be indexed but not stemmed. */ - private Hashtable excltable = new Hashtable(); + private HashSet excltable = new HashSet(); /** * Builds an analyzer. */ public BrazilianAnalyzer() { - stoptable = StopFilter.makeStopTable( BRAZILIAN_STOP_WORDS ); + stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS ); } /** * Builds an analyzer with the given stop words. */ public BrazilianAnalyzer( String[] stopwords ) { - stoptable = StopFilter.makeStopTable( stopwords ); + stoptable = StopFilter.makeStopSet( stopwords ); } /** * Builds an analyzer with the given stop words. */ public BrazilianAnalyzer( Hashtable stopwords ) { - stoptable = stopwords; + stoptable = new HashSet(stopwords.keySet()); } /** * Builds an analyzer with the given stop words. 
*/ public BrazilianAnalyzer( File stopwords ) { - stoptable = WordlistLoader.getWordtable( stopwords ); + stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet()); } /** * Builds an exclusionlist from an array of Strings. */ public void setStemExclusionTable( String[] exclusionlist ) { - excltable = StopFilter.makeStopTable( exclusionlist ); + excltable = StopFilter.makeStopSet( exclusionlist ); } /** * Builds an exclusionlist from a Hashtable. */ public void setStemExclusionTable( Hashtable exclusionlist ) { - excltable = exclusionlist; + excltable = new HashSet(exclusionlist.keySet()); } /** * Builds an exclusionlist from the words contained in the given file. */ public void setStemExclusionTable( File exclusionlist ) { - excltable = WordlistLoader.getWordtable( exclusionlist ); + excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet()); } /** diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java index 8fe1f81eb43..ebc48ef3ca1 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java @@ -59,6 +59,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import java.io.IOException; import java.util.Hashtable; +import java.util.HashSet; /** * Based on (copied) the GermanStemFilter @@ -79,7 +80,7 @@ public final class BrazilianStemFilter extends TokenFilter { */ private Token token = null; private BrazilianStemmer stemmer = null; - private Hashtable exclusions = null; + private HashSet exclusions = null; public BrazilianStemFilter( TokenStream in ) { super(in); @@ -88,8 +89,15 @@ public final class BrazilianStemFilter extends TokenFilter { /** * Builds a BrazilianStemFilter that uses an exclusiontable. + * + * @deprecated */ public BrazilianStemFilter( TokenStream in, Hashtable exclusiontable ) { + this( in ); + this.exclusions = new HashSet(exclusiontable.keySet()); + } + + public BrazilianStemFilter( TokenStream in, HashSet exclusiontable ) { this( in ); this.exclusions = exclusiontable; } diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java index 9c8594d1c02..ed050bc1be6 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java @@ -63,6 +63,7 @@ import org.apache.lucene.analysis.TokenStream; import java.io.Reader; import java.util.Hashtable; +import java.util.HashSet; /** @@ -91,7 +92,7 @@ public class CJKAnalyzer extends Analyzer { //~ Instance fields -------------------------------------------------------- /** stop word list */ - private Hashtable stopTable; + private HashSet stopTable; //~ Constructors ----------------------------------------------------------- @@ -99,7 +100,7 @@ public class CJKAnalyzer extends Analyzer { * Builds an analyzer which removes words in STOP_WORDS. 
*/ public CJKAnalyzer() { - stopTable = StopFilter.makeStopTable(stopWords); + stopTable = StopFilter.makeStopSet(stopWords); } /** @@ -108,7 +109,7 @@ public class CJKAnalyzer extends Analyzer { * @param stopWords stop word array */ public CJKAnalyzer(String[] stopWords) { - stopTable = StopFilter.makeStopTable(stopWords); + stopTable = StopFilter.makeStopSet(stopWords); } //~ Methods ---------------------------------------------------------------- diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java index ee3d0fd0ae6..9cd7fe4a52a 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java @@ -64,6 +64,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.*; import java.util.Hashtable; +import java.util.HashSet; /** * Analyzer for Czech language. Supports an external list of stopwords (words that @@ -102,26 +103,32 @@ public final class CzechAnalyzer extends Analyzer { /** * Contains the stopwords used with the StopFilter. */ - private Hashtable stoptable = new Hashtable(); + private HashSet stoptable; /** * Builds an analyzer. */ public CzechAnalyzer() { - stoptable = StopFilter.makeStopTable( STOP_WORDS ); + stoptable = StopFilter.makeStopSet( STOP_WORDS ); } /** * Builds an analyzer with the given stop words. */ public CzechAnalyzer( String[] stopwords ) { - stoptable = StopFilter.makeStopTable( stopwords ); + stoptable = StopFilter.makeStopSet( stopwords ); } /** * Builds an analyzer with the given stop words. + * + * @deprecated */ public CzechAnalyzer( Hashtable stopwords ) { + stoptable = new HashSet(stopwords.keySet()); + } + + public CzechAnalyzer( HashSet stopwords ) { stoptable = stopwords; } @@ -129,7 +136,7 @@ public final class CzechAnalyzer extends Analyzer { * Builds an analyzer with the given stop words. 
*/ public CzechAnalyzer( File stopwords ) { - stoptable = WordlistLoader.getWordtable( stopwords ); + stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet()); } /** @@ -139,12 +146,12 @@ public final class CzechAnalyzer extends Analyzer { */ public void loadStopWords( InputStream wordfile, String encoding ) { if ( wordfile == null ) { - stoptable = new Hashtable(); + stoptable = new HashSet(); return; } try { // clear any previous table (if present) - stoptable = new Hashtable(); + stoptable = new HashSet(); InputStreamReader isr; if (encoding == null) @@ -156,7 +163,7 @@ public final class CzechAnalyzer extends Analyzer { LineNumberReader lnr = new LineNumberReader(isr); String word; while ( ( word = lnr.readLine() ) != null ) { - stoptable.put(word, word); + stoptable.add(word); } } catch ( IOException e ) { diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java index 269e6e981f3..4bf82676f74 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java @@ -63,6 +63,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.File; import java.io.Reader; import java.util.Hashtable; +import java.util.HashSet; + import org.apache.lucene.analysis.de.WordlistLoader; /** @@ -108,57 +110,59 @@ public final class FrenchAnalyzer extends Analyzer { /** * Contains the stopwords used with the StopFilter. */ - private Hashtable stoptable = new Hashtable(); + private HashSet stoptable = new HashSet(); /** * Contains words that should be indexed but not stemmed. */ - private Hashtable excltable = new Hashtable(); + private HashSet excltable = new HashSet(); /** * Builds an analyzer. */ public FrenchAnalyzer() { - stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS ); + stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS ); } /** * Builds an analyzer with the given stop words. */ public FrenchAnalyzer( String[] stopwords ) { - stoptable = StopFilter.makeStopTable( stopwords ); + stoptable = StopFilter.makeStopSet( stopwords ); } /** * Builds an analyzer with the given stop words. + * + * @deprecated */ public FrenchAnalyzer( Hashtable stopwords ) { - stoptable = stopwords; + stoptable = new HashSet(stopwords.keySet()); } /** * Builds an analyzer with the given stop words. */ public FrenchAnalyzer( File stopwords ) { - stoptable = WordlistLoader.getWordtable( stopwords ); + stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet()); } /** * Builds an exclusionlist from an array of Strings. */ public void setStemExclusionTable( String[] exclusionlist ) { - excltable = StopFilter.makeStopTable( exclusionlist ); + excltable = StopFilter.makeStopSet( exclusionlist ); } /** * Builds an exclusionlist from a Hashtable. */ public void setStemExclusionTable( Hashtable exclusionlist ) { - excltable = exclusionlist; + excltable = new HashSet(exclusionlist.keySet()); } /** * Builds an exclusionlist from the words contained in the given file. 
*/ public void setStemExclusionTable( File exclusionlist ) { - excltable = WordlistLoader.getWordtable( exclusionlist ); + excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet()); } /** diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java index 80d1811b119..cc43b197c02 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java @@ -59,6 +59,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import java.io.IOException; import java.util.Hashtable; +import java.util.HashSet; /** * A filter that stemms french words. It supports a table of words that should @@ -74,7 +75,7 @@ public final class FrenchStemFilter extends TokenFilter { */ private Token token = null; private FrenchStemmer stemmer = null; - private Hashtable exclusions = null; + private HashSet exclusions = null; public FrenchStemFilter( TokenStream in ) { super(in); @@ -83,8 +84,15 @@ public final class FrenchStemFilter extends TokenFilter { /** * Builds a FrenchStemFilter that uses an exclusiontable. + * + * @deprecated */ public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) { + this( in ); + exclusions = new HashSet(exclusiontable.keySet()); + } + + public FrenchStemFilter( TokenStream in, HashSet exclusiontable ) { this( in ); exclusions = exclusiontable; } @@ -122,7 +130,7 @@ public final class FrenchStemFilter extends TokenFilter { * Set an alternative exclusion list for this filter. */ public void setExclusionTable( Hashtable exclusiontable ) { - exclusions = exclusiontable; + exclusions = new HashSet(exclusiontable.keySet()); } } diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java index a73f9613150..03b9d806577 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java @@ -21,148 +21,137 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.Token; + import java.io.File; -import java.io.*; import java.io.Reader; -import java.util.Hashtable; +import java.util.HashMap; +import java.util.HashSet; /** - * * @author Edwin de Jonge - * - * Analyzer for Dutch language. Supports an external list of stopwords (words that - * will not be indexed at all), an external list of exclusions (word that will - * not be stemmed, but indexed) and an external list of word-stem pairs that overrule - * the algorithm (dictionary stemming). - * A default set of stopwords is used unless an alternative list is specified, the - * exclusion list is empty by default. - * As start for the Analyzer the German Analyzer was used. The stemming algorithm - * implemented can be found at @link + *
+ * Analyzer for Dutch language. Supports an external list of stopwords (words that + * will not be indexed at all), an external list of exclusions (word that will + * not be stemmed, but indexed) and an external list of word-stem pairs that overrule + * the algorithm (dictionary stemming). + * A default set of stopwords is used unless an alternative list is specified, the + * exclusion list is empty by default. + * As start for the Analyzer the German Analyzer was used. The stemming algorithm + * implemented can be found at @link */ -public class DutchAnalyzer extends Analyzer -{ - /** - * List of typical Dutch stopwords. - */ - private String[] DUTCH_STOP_WORDS = - { - "de","en","van","ik","te","dat","die","in","een", - "hij","het","niet","zijn","is","was","op","aan","met","als","voor","had", - "er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo", - "door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar", - "haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal", - "me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren", - "veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus", - "alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt", - "wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets", - "uw","iemand","geweest","andere" - }; +public class DutchAnalyzer extends Analyzer { + /** + * List of typical Dutch stopwords. + */ + private String[] DUTCH_STOP_WORDS = + { + "de", "en", "van", "ik", "te", "dat", "die", "in", "een", + "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had", + "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo", + "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar", + "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal", + "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren", + "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus", + "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt", + "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets", + "uw", "iemand", "geweest", "andere" + }; - /** - * Contains the stopwords used with the StopFilter. - */ - private Hashtable stoptable = new Hashtable(); + /** + * Contains the stopwords used with the StopFilter. + */ + private HashSet stoptable = new HashSet(); - /** - * Contains words that should be indexed but not stemmed. - */ - private Hashtable excltable = new Hashtable(); + /** + * Contains words that should be indexed but not stemmed. + */ + private HashSet excltable = new HashSet(); - private Hashtable _stemdict = new Hashtable(); + private HashMap _stemdict = new HashMap(); - /** - * Builds an analyzer. - */ - public DutchAnalyzer() - { - stoptable = StopFilter.makeStopTable( DUTCH_STOP_WORDS ); - _stemdict.put("fiets","fiets"); //otherwise fiet - _stemdict.put("bromfiets","bromfiets"); //otherwise bromfiet - _stemdict.put("ei","eier"); - _stemdict.put("kind","kinder"); - } + /** + * Builds an analyzer. + */ + public DutchAnalyzer() { + stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS); + _stemdict.put("fiets", "fiets"); //otherwise fiet + _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet + _stemdict.put("ei", "eier"); + _stemdict.put("kind", "kinder"); + } - /** - * Builds an analyzer with the given stop words. 
- * - * @param stopwords - */ - public DutchAnalyzer( String[] stopwords ) - { - stoptable = StopFilter.makeStopTable( stopwords ); - } + /** + * Builds an analyzer with the given stop words. + * + * @param stopwords + */ + public DutchAnalyzer(String[] stopwords) { + stoptable = StopFilter.makeStopSet(stopwords); + } - /** - * Builds an analyzer with the given stop words. - * - * @param stopwords - */ - public DutchAnalyzer( Hashtable stopwords ) - { - stoptable = stopwords; - } + /** + * Builds an analyzer with the given stop words. + * + * @param stopwords + */ + public DutchAnalyzer(HashSet stopwords) { + stoptable = stopwords; + } - /** - * Builds an analyzer with the given stop words. - * - * @param stopwords - */ - public DutchAnalyzer( File stopwords ) - { - stoptable = WordlistLoader.getWordtable( stopwords ); - } + /** + * Builds an analyzer with the given stop words. + * + * @param stopwords + */ + public DutchAnalyzer(File stopwords) { + stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet()); + } - /** - * Builds an exclusionlist from an array of Strings. - * - * @param exclusionlist - */ - public void setStemExclusionTable( String[] exclusionlist ) - { - excltable = StopFilter.makeStopTable( exclusionlist ); - } + /** + * Builds an exclusionlist from an array of Strings. + * + * @param exclusionlist + */ + public void setStemExclusionTable(String[] exclusionlist) { + excltable = StopFilter.makeStopSet(exclusionlist); + } - /** - * Builds an exclusionlist from a Hashtable. - */ - public void setStemExclusionTable( Hashtable exclusionlist ) - { - excltable = exclusionlist; - } + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable(HashSet exclusionlist) { + excltable = exclusionlist; + } - /** - * Builds an exclusionlist from the words contained in the given file. - */ - public void setStemExclusionTable(File exclusionlist) - { - excltable = WordlistLoader.getWordtable(exclusionlist); - } + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable(File exclusionlist) { + excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet()); + } - /** - * Reads a stemdictionary file , that overrules the stemming algorithm - * This is a textfile that contains per line - * word\tstem - * i.e: tabseperated - */ - public void setStemDictionary(File stemdict) - { - _stemdict = WordlistLoader.getStemDict(stemdict); - } + /** + * Reads a stemdictionary file , that overrules the stemming algorithm + * This is a textfile that contains per line + * word\tstem + * i.e: tabseperated + */ + public void setStemDictionary(File stemdict) { + _stemdict = WordlistLoader.getStemDict(stemdict); + } - /** - * Creates a TokenStream which tokenizes all the text in the provided TextReader. - * - * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter - */ - public TokenStream tokenStream(String fieldName, Reader reader) - { - TokenStream result = new StandardTokenizer( reader ); - result = new StandardFilter( result ); - result = new StopFilter( result, stoptable ); - result = new DutchStemFilter( result, excltable, _stemdict); - return result; - } + /** + * Creates a TokenStream which tokenizes all the text in the provided TextReader. 
+ * + * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(reader); + result = new StandardFilter(result); + result = new StopFilter(result, stoptable); + result = new DutchStemFilter(result, excltable, _stemdict); + return result; + } } diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java index cf8b7c71c31..156497f5056 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java @@ -19,105 +19,91 @@ package org.apache.lucene.analysis.nl; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; + import java.io.IOException; -import java.util.Hashtable; +import java.util.HashMap; +import java.util.HashSet; /** - * * @author Edwin de Jonge - * - * A filter that stems Dutch words. It supports a table of words that should - * not be stemmed at all. The stemmer used can be changed at runtime after the - * filter object is created (as long as it is a DutchStemmer). + *
+ * A filter that stems Dutch words. It supports a table of words that should + * not be stemmed at all. The stemmer used can be changed at runtime after the + * filter object is created (as long as it is a DutchStemmer). */ -public final class DutchStemFilter extends TokenFilter -{ - /** - * The actual token in the input stream. - */ - private Token token = null; - private DutchStemmer stemmer = null; - private Hashtable exclusions = null; +public final class DutchStemFilter extends TokenFilter { + /** + * The actual token in the input stream. + */ + private Token token = null; + private DutchStemmer stemmer = null; + private HashSet exclusions = null; - public DutchStemFilter( TokenStream _in ) - { - super(_in); - stemmer = new DutchStemmer(); - } + public DutchStemFilter(TokenStream _in) { + super(_in); + stemmer = new DutchStemmer(); + } - /** - * Builds a DutchStemFilter that uses an exclusiontable. - */ - public DutchStemFilter( TokenStream _in, Hashtable exclusiontable ) - { - this(_in); - exclusions = exclusiontable; - } + /** + * Builds a DutchStemFilter that uses an exclusiontable. + */ + public DutchStemFilter(TokenStream _in, HashSet exclusiontable) { + this(_in); + exclusions = exclusiontable; + } - /** - * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm - */ - public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary) - { - this(_in, exclusiontable); - stemmer.setStemDictionary(stemdictionary); - } + /** + * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm + */ + public DutchStemFilter(TokenStream _in, HashSet exclusiontable, HashMap stemdictionary) { + this(_in, exclusiontable); + stemmer.setStemDictionary(stemdictionary); + } - /** - * @return Returns the next token in the stream, or null at EOS - */ - public Token next() throws IOException + /** + * @return Returns the next token in the stream, or null at EOS + */ + public Token next() throws IOException { + if ((token = input.next()) == null) { + return null; + } - { - if ( ( token = input.next() ) == null ) - { - return null; - } + // Check the exclusiontable + else if (exclusions != null && exclusions.contains(token.termText())) { + return token; + } else { + String s = stemmer.stem(token.termText()); + // If not stemmed, dont waste the time creating a new token + if (!s.equals(token.termText())) { + return new Token(s, token.startOffset(), + token.endOffset(), token.type()); + } + return token; + } + } - // Check the exclusiontable - else if ( exclusions != null && exclusions.contains( token.termText() ) ) - { - return token; - } - else - { - String s = stemmer.stem( token.termText() ); - // If not stemmed, dont waste the time creating a new token - if ( !s.equals( token.termText() ) ) - { - return new Token( s, token.startOffset(), - token.endOffset(), token.type() ); - } - return token; - } - } + /** + * Set a alternative/custom DutchStemmer for this filter. + */ + public void setStemmer(DutchStemmer stemmer) { + if (stemmer != null) { + this.stemmer = stemmer; + } + } - /** - * Set a alternative/custom DutchStemmer for this filter. - */ - public void setStemmer( DutchStemmer stemmer ) - { - if ( stemmer != null ) - { - this.stemmer = stemmer; - } - } + /** + * Set an alternative exclusion list for this filter. + */ + public void setExclusionTable(HashSet exclusiontable) { + exclusions = exclusiontable; + } - /** - * Set an alternative exclusion list for this filter. 
- */ - public void setExclusionTable( Hashtable exclusiontable ) - { - exclusions = exclusiontable; - } - - /** - * Set dictionary for stemming, this dictionary overrules the algorithm, - * so you can correct for a particular unwanted word-stem pair. - */ - public void setStemDictionary(Hashtable dict) - { - if (stemmer != null) - stemmer.setStemDictionary(dict); - } + /** + * Set dictionary for stemming, this dictionary overrules the algorithm, + * so you can correct for a particular unwanted word-stem pair. + */ + public void setStemDictionary(HashMap dict) { + if (stemmer != null) + stemmer.setStemDictionary(dict); + } } \ No newline at end of file diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java index c3810ce00c0..2c059c09667 100644 --- a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java @@ -16,9 +16,8 @@ package org.apache.lucene.analysis.nl; * limitations under the License. */ -import java.util.Hashtable; -import java.util.ArrayList; -import java.io.*; +import java.util.HashMap; + /* * @author Edwin de Jonge (ejne@cbs.nl) * @@ -26,427 +25,382 @@ import java.io.*; * the dutch stemming * algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?): */ -public class DutchStemmer -{ - /** - * Buffer for the terms while stemming them. - */ - private StringBuffer sb = new StringBuffer(); - private boolean _removedE; - private Hashtable _stemDict; - private int _R1; - private int _R2; +public class DutchStemmer { + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + private boolean _removedE; + private HashMap _stemDict; - //TODO convert to internal - /* - * Stemms the given term to an unique discriminator. - * - * @param term The term that should be stemmed. - * @return Discriminator for term - */ - public String stem( String term ) - { - term = term.toLowerCase(); - if ( !isStemmable( term ) ) - return term; - if (_stemDict != null && _stemDict.contains(term)) - if (_stemDict.get(term) instanceof String) - return (String)_stemDict.get(term); - else return null; + private int _R1; + private int _R2; - // Reset the StringBuffer. - sb.delete(0, sb.length()); - sb.insert(0, term); - // Stemming starts here... - substitute(sb); - storeYandI(sb); - _R1 = getRIndex(sb, 0); - _R1 = Math.max(3,_R1); - step1(sb); - step2(sb); - _R2 = getRIndex(sb, _R1); - step3a(sb); - step3b(sb); - step4(sb); - reStoreYandI(sb); - return sb.toString(); - } + //TODO convert to internal + /* + * Stemms the given term to an unique discriminator. + * + * @param term The term that should be stemmed. 
+ * @return Discriminator for term + */ + public String stem(String term) { + term = term.toLowerCase(); + if (!isStemmable(term)) + return term; + if (_stemDict != null && _stemDict.containsKey(term)) + if (_stemDict.get(term) instanceof String) + return (String) _stemDict.get(term); + else + return null; - private boolean enEnding(StringBuffer sb) - { - String[] enend = new String[]{"ene","en"}; - for (int i = 0; i < enend.length; i++) - { - String end = enend[i]; - String s = sb.toString(); - int index = s.length() - end.length(); - if ( s.endsWith(end) && - index >= _R1 && - isValidEnEnding(sb,index-1) - ) - { - sb.delete(index, index + end.length()); - unDouble(sb,index); - return true; - } - } - return false; - } + // Reset the StringBuffer. + sb.delete(0, sb.length()); + sb.insert(0, term); + // Stemming starts here... + substitute(sb); + storeYandI(sb); + _R1 = getRIndex(sb, 0); + _R1 = Math.max(3, _R1); + step1(sb); + step2(sb); + _R2 = getRIndex(sb, _R1); + step3a(sb); + step3b(sb); + step4(sb); + reStoreYandI(sb); + return sb.toString(); + } + + private boolean enEnding(StringBuffer sb) { + String[] enend = new String[]{"ene", "en"}; + for (int i = 0; i < enend.length; i++) { + String end = enend[i]; + String s = sb.toString(); + int index = s.length() - end.length(); + if (s.endsWith(end) && + index >= _R1 && + isValidEnEnding(sb, index - 1) + ) { + sb.delete(index, index + end.length()); + unDouble(sb, index); + return true; + } + } + return false; + } - private void step1(StringBuffer sb) - { - if (_R1 >= sb.length()) - return; + private void step1(StringBuffer sb) { + if (_R1 >= sb.length()) + return; - String s = sb.toString(); - int lengthR1 = sb.length() - _R1; - int index; + String s = sb.toString(); + int lengthR1 = sb.length() - _R1; + int index; - if (s.endsWith("heden")) - { - sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid")); - return; - } + if (s.endsWith("heden")) { + sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid")); + return; + } - if (enEnding(sb)) - return; + if (enEnding(sb)) + return; - if (s.endsWith("se") && - (index = s.length() - 2) >= _R1 && - isValidSEnding(sb, index -1) - ) - { - sb.delete(index, index + 2); - return; - } - if (s.endsWith("s") && - (index = s.length() - 1) >= _R1 && - isValidSEnding(sb, index - 1)) - { - sb.delete(index, index + 1); - } - } + if (s.endsWith("se") && + (index = s.length() - 2) >= _R1 && + isValidSEnding(sb, index - 1) + ) { + sb.delete(index, index + 2); + return; + } + if (s.endsWith("s") && + (index = s.length() - 1) >= _R1 && + isValidSEnding(sb, index - 1)) { + sb.delete(index, index + 1); + } + } - /** - * Delete suffix e if in R1 and - * preceded by a non-vowel, and then undouble the ending - * - * @param sb String being stemmed - */ - private void step2(StringBuffer sb) - { - _removedE = false; - if (_R1 >= sb.length()) - return; - String s = sb.toString(); - int index = s.length() - 1; - if ( index >= _R1 && - s.endsWith("e") && - !isVowel(sb.charAt(index-1))) - { - sb.delete(index, index + 1); - unDouble(sb); - _removedE = true; - } - } + /** + * Delete suffix e if in R1 and + * preceded by a non-vowel, and then undouble the ending + * + * @param sb String being stemmed + */ + private void step2(StringBuffer sb) { + _removedE = false; + if (_R1 >= sb.length()) + return; + String s = sb.toString(); + int index = s.length() - 1; + if (index >= _R1 && + s.endsWith("e") && + !isVowel(sb.charAt(index - 1))) { + 
sb.delete(index, index + 1); + unDouble(sb); + _removedE = true; + } + } - /** - * Delete "heid" - * - * @param sb String being stemmed - */ - private void step3a(StringBuffer sb) - { - if (_R2 >= sb.length()) - return; - String s = sb.toString(); - int index = s.length() - 4; - if (s.endsWith("heid")&& index >= _R2 && sb.charAt(index - 1) != 'c') - { - sb.delete(index, index + 4); //remove heid - enEnding(sb); - } - } + /** + * Delete "heid" + * + * @param sb String being stemmed + */ + private void step3a(StringBuffer sb) { + if (_R2 >= sb.length()) + return; + String s = sb.toString(); + int index = s.length() - 4; + if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') { + sb.delete(index, index + 4); //remove heid + enEnding(sb); + } + } - /** - *
A d-suffix, or derivational suffix, enables a new word, - * often with a different grammatical category, or with a different - * sense, to be built from another word. Whether a d-suffix can be - * attached is discovered not from the rules of grammar, but by - * referring to a dictionary. So in English, ness can be added to - * certain adjectives to form corresponding nouns (littleness, - * kindness, foolishness ...) but not to all adjectives - * (not for example, to big, cruel, wise ...) d-suffixes can be - * used to change meaning, often in rather exotic ways.
- * Remove "ing", "end", "ig", "lijk", "baar" and "bar" - * - * @param sb String being stemmed - */ - private void step3b(StringBuffer sb) - { - if (_R2 >= sb.length()) - return; - String s = sb.toString(); - int index; + /** + *
A d-suffix, or derivational suffix, enables a new word, + * often with a different grammatical category, or with a different + * sense, to be built from another word. Whether a d-suffix can be + * attached is discovered not from the rules of grammar, but by + * referring to a dictionary. So in English, ness can be added to + * certain adjectives to form corresponding nouns (littleness, + * kindness, foolishness ...) but not to all adjectives + * (not for example, to big, cruel, wise ...) d-suffixes can be + * used to change meaning, often in rather exotic ways.
+ * Remove "ing", "end", "ig", "lijk", "baar" and "bar" + * + * @param sb String being stemmed + */ + private void step3b(StringBuffer sb) { + if (_R2 >= sb.length()) + return; + String s = sb.toString(); + int index = 0; - if ((s.endsWith("end") || s.endsWith("ing")) && - (index = s.length() - 3) >= _R2) - { - sb.delete(index, index + 3); - if (sb.charAt(index - 2) == 'i' && - sb.charAt(index - 1) == 'g') - { - if (sb.charAt(index - 3) != 'e' & index-2 >= _R2) - { - index -= 2; - sb.delete(index, index + 2); - } - } - else - { - unDouble(sb,index); - } - return; - } - if ( s.endsWith("ig") && - (index = s.length() - 2) >= _R2 - ) - { - if (sb.charAt(index - 1) != 'e') - sb.delete(index, index + 2); - return; - } - if (s.endsWith("lijk") && - (index = s.length() - 4) >= _R2 - ) - { - sb.delete(index, index + 4); - step2(sb); - return; - } - if (s.endsWith("baar") && - (index = s.length() - 4) >= _R2 - ) - { - sb.delete(index, index + 4); - return; - } - if (s.endsWith("bar") && - (index = s.length() - 3) >= _R2 - ) - { - if (_removedE) - sb.delete(index, index + 3); - return; - } - } + if ((s.endsWith("end") || s.endsWith("ing")) && + (index = s.length() - 3) >= _R2) { + sb.delete(index, index + 3); + if (sb.charAt(index - 2) == 'i' && + sb.charAt(index - 1) == 'g') { + if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) { + index -= 2; + sb.delete(index, index + 2); + } + } else { + unDouble(sb, index); + } + return; + } + if (s.endsWith("ig") && + (index = s.length() - 2) >= _R2 + ) { + if (sb.charAt(index - 1) != 'e') + sb.delete(index, index + 2); + return; + } + if (s.endsWith("lijk") && + (index = s.length() - 4) >= _R2 + ) { + sb.delete(index, index + 4); + step2(sb); + return; + } + if (s.endsWith("baar") && + (index = s.length() - 4) >= _R2 + ) { + sb.delete(index, index + 4); + return; + } + if (s.endsWith("bar") && + (index = s.length() - 3) >= _R2 + ) { + if (_removedE) + sb.delete(index, index + 3); + return; + } + } - /** - * undouble vowel - * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod). - * - * @param sb String being stemmed - */ - private void step4(StringBuffer sb) - { - if (sb.length() < 4) - return; - String end = sb.substring(sb.length() - 4, sb.length()); - char c = end.charAt(0); - char v1 = end.charAt(1); - char v2 = end.charAt(2); - char d = end.charAt(3); - if (v1 == v2 && - d != 'I' && - v1 != 'i' && - isVowel(v1) && - !isVowel(d) && - !isVowel(c)) - { - sb.delete(sb.length() - 2, sb.length() - 1); - } - } + /** + * undouble vowel + * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod). + * + * @param sb String being stemmed + */ + private void step4(StringBuffer sb) { + if (sb.length() < 4) + return; + String end = sb.substring(sb.length() - 4, sb.length()); + char c = end.charAt(0); + char v1 = end.charAt(1); + char v2 = end.charAt(2); + char d = end.charAt(3); + if (v1 == v2 && + d != 'I' && + v1 != 'i' && + isVowel(v1) && + !isVowel(d) && + !isVowel(c)) { + sb.delete(sb.length() - 2, sb.length() - 1); + } + } - /** - * Checks if a term could be stemmed. - * - * @return true if, and only if, the given term consists in letters. 
- */ - private boolean isStemmable( String term ) - { - for ( int c = 0; c < term.length(); c++ ) - { - if ( !Character.isLetter(term.charAt(c))) return false; - } - return true; - } + /** + * Checks if a term could be stemmed. + * + * @return true if, and only if, the given term consists in letters. + */ + private boolean isStemmable(String term) { + for (int c = 0; c < term.length(); c++) { + if (!Character.isLetter(term.charAt(c))) return false; + } + return true; + } - /** - * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú - */ - private void substitute( StringBuffer buffer ) - { - for ( int i = 0; i < buffer.length(); i++ ) - { - switch (buffer.charAt(i)) - { - case 'ä': - case 'á': - { - buffer.setCharAt(i, 'a'); - break; - } - case 'ë': - case 'é': - { - buffer.setCharAt(i, 'e'); - break; - } - case 'ü': - case 'ú': - { - buffer.setCharAt(i, 'u'); - break; - } - case 'ï': - case 'i': - { - buffer.setCharAt(i, 'i'); - break; - } - case 'ö': - case 'ó': - { - buffer.setCharAt(i, 'o'); - break; - } - } - } - } + /** + * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú + */ + private void substitute(StringBuffer buffer) { + for (int i = 0; i < buffer.length(); i++) { + switch (buffer.charAt(i)) { + case 'ä': + case 'á': + { + buffer.setCharAt(i, 'a'); + break; + } + case 'ë': + case 'é': + { + buffer.setCharAt(i, 'e'); + break; + } + case 'ü': + case 'ú': + { + buffer.setCharAt(i, 'u'); + break; + } + case 'ï': + case 'i': + { + buffer.setCharAt(i, 'i'); + break; + } + case 'ö': + case 'ó': + { + buffer.setCharAt(i, 'o'); + break; + } + } + } + } - private boolean isValidSEnding(StringBuffer sb) - { - return isValidSEnding(sb,sb.length() - 1); - } + private boolean isValidSEnding(StringBuffer sb) { + return isValidSEnding(sb, sb.length() - 1); + } - private boolean isValidSEnding(StringBuffer sb, int index) - { - char c = sb.charAt(index); - if (isVowel(c) || c == 'j') - return false; - return true; - } + private boolean isValidSEnding(StringBuffer sb, int index) { + char c = sb.charAt(index); + if (isVowel(c) || c == 'j') + return false; + return true; + } - private boolean isValidEnEnding(StringBuffer sb) - { - return isValidEnEnding(sb,sb.length() - 1); - } + private boolean isValidEnEnding(StringBuffer sb) { + return isValidEnEnding(sb, sb.length() - 1); + } - private boolean isValidEnEnding(StringBuffer sb, int index) - { - char c = sb.charAt(index); - if (isVowel(c)) - return false; - if (c < 3) - return false; - // ends with "gem"? - if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index-1) == 'e') - return false; - return true; - } + private boolean isValidEnEnding(StringBuffer sb, int index) { + char c = sb.charAt(index); + if (isVowel(c)) + return false; + if (c < 3) + return false; + // ends with "gem"? 
+ if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e') + return false; + return true; + } - private void unDouble(StringBuffer sb) - { - unDouble(sb, sb.length()); - } + private void unDouble(StringBuffer sb) { + unDouble(sb, sb.length()); + } - private void unDouble(StringBuffer sb, int endIndex) - { - String s = sb.substring(0, endIndex); - if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn")|| s.endsWith("mm")|| s.endsWith("ff")) - { - sb.delete(endIndex-1, endIndex); - } - } + private void unDouble(StringBuffer sb, int endIndex) { + String s = sb.substring(0, endIndex); + if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) { + sb.delete(endIndex - 1, endIndex); + } + } - private int getRIndex(StringBuffer sb, int start) - { - if (start == 0) - start = 1; - int i = start; - for (; i < sb.length(); i++) - { - //first non-vowel preceded by a vowel - if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i-1))) - { - return i + 1; - } - } - return i + 1; - } + private int getRIndex(StringBuffer sb, int start) { + if (start == 0) + start = 1; + int i = start; + for (; i < sb.length(); i++) { + //first non-vowel preceded by a vowel + if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) { + return i + 1; + } + } + return i + 1; + } - private void storeYandI(StringBuffer sb) - { - if (sb.charAt(0) == 'y') - sb.setCharAt(0, 'Y'); + private void storeYandI(StringBuffer sb) { + if (sb.charAt(0) == 'y') + sb.setCharAt(0, 'Y'); - char c; - int last = sb.length() - 1; + char c; + int last = sb.length() - 1; - for (int i = 1; i < last; i++) - { - switch (sb.charAt(i)) - { - case 'i': - { - if (isVowel(sb.charAt(i-1)) && - isVowel(sb.charAt(i+1)) - ) - sb.setCharAt(i, 'I'); - break; - } - case 'y': - { - if (isVowel(sb.charAt(i-1))) - sb.setCharAt(i, 'Y'); - break; - } - } - } - if (last > 0 && sb.charAt(last)=='y' && isVowel(sb.charAt(last-1))) - sb.setCharAt(last, 'Y'); - } + for (int i = 1; i < last; i++) { + switch (sb.charAt(i)) { + case 'i': + { + if (isVowel(sb.charAt(i - 1)) && + isVowel(sb.charAt(i + 1)) + ) + sb.setCharAt(i, 'I'); + break; + } + case 'y': + { + if (isVowel(sb.charAt(i - 1))) + sb.setCharAt(i, 'Y'); + break; + } + } + } + if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1))) + sb.setCharAt(last, 'Y'); + } - private void reStoreYandI(StringBuffer sb) - { - String tmp = sb.toString(); - sb.delete(0, sb.length()); - sb.insert(0, tmp.replaceAll("I","i").replaceAll("Y","y")); - } + private void reStoreYandI(StringBuffer sb) { + String tmp = sb.toString(); + sb.delete(0, sb.length()); + sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y")); + } - private boolean isVowel(char c) - { - switch (c) - { - case 'e': - case 'a': - case 'o': - case 'i': - case 'u': - case 'y': - case 'è': - { - return true; - } - } - return false; - } + private boolean isVowel(char c) { + switch (c) { + case 'e': + case 'a': + case 'o': + case 'i': + case 'u': + case 'y': + case 'è': + { + return true; + } + } + return false; + } - void setStemDictionary(Hashtable dict) - { - _stemDict = dict; - } + void setStemDictionary(HashMap dict) { + _stemDict = dict; + } } \ No newline at end of file diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java index 6f4d2041e72..7fc3d26ba46 100644 --- 
a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java @@ -20,123 +20,104 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; -import java.util.Hashtable; +import java.util.HashMap; /** - * * @author Gerhard Schwarz - * - * Loads a text file and adds every line as an entry to a Hashtable. Every line - * should contain only one word. If the file is not found or on any error, an - * empty table is returned. + *
+ * Loads a text file and adds every line as an entry to a Hashtable. Every line + * should contain only one word. If the file is not found or on any error, an + * empty table is returned. */ -public class WordlistLoader -{ - /** - * @param path Path to the wordlist - * @param wordfile Name of the wordlist - */ - public static Hashtable getWordtable( String path, String wordfile ) - { - if ( path == null || wordfile == null ) - { - return new Hashtable(); - } - return getWordtable(new File(path, wordfile)); - } +public class WordlistLoader { + /** + * @param path Path to the wordlist + * @param wordfile Name of the wordlist + */ + public static HashMap getWordtable(String path, String wordfile) { + if (path == null || wordfile == null) { + return new HashMap(); + } + return getWordtable(new File(path, wordfile)); + } - /** - * @param wordfile Complete path to the wordlist - */ - public static Hashtable getWordtable( String wordfile ) - { - if ( wordfile == null ) - { - return new Hashtable(); - } - return getWordtable( new File( wordfile ) ); - } + /** + * @param wordfile Complete path to the wordlist + */ + public static HashMap getWordtable(String wordfile) { + if (wordfile == null) { + return new HashMap(); + } + return getWordtable(new File(wordfile)); + } - /** - * Reads a stemsdictionary. Each line contains: - * word \t stem - * i.e. tab seperated) - * - * @return Stem dictionary that overrules, the stemming algorithm - */ - public static Hashtable getStemDict( File wordstemfile) - { - if ( wordstemfile == null ) - { - return new Hashtable(); - } - Hashtable result = new Hashtable(); - try - { - LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile)); - String line; - String[] wordstem; - while ((line = lnr.readLine()) != null) - { - wordstem = line.split("\t", 2); - result.put(wordstem[0], wordstem[1]); - } - } - catch (IOException e) - {} - return result; - } + /** + * Reads a stemsdictionary. Each line contains: + * word \t stem + * i.e. 
tab seperated) + * + * @return Stem dictionary that overrules, the stemming algorithm + */ + public static HashMap getStemDict(File wordstemfile) { + if (wordstemfile == null) { + return new HashMap(); + } + HashMap result = new HashMap(); + try { + LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile)); + String line; + String[] wordstem; + while ((line = lnr.readLine()) != null) { + wordstem = line.split("\t", 2); + result.put(wordstem[0], wordstem[1]); + } + } catch (IOException e) { + } + return result; + } - /** - * @param wordfile File containing the wordlist - */ - public static Hashtable getWordtable( File wordfile ) - { - if ( wordfile == null ) - { - return new Hashtable(); - } - Hashtable result = null; - try - { - LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile)); - String word = null; - String[] stopwords = new String[100]; - int wordcount = 0; - while ( ( word = lnr.readLine() ) != null ) - { - wordcount++; - if ( wordcount == stopwords.length ) - { - String[] tmp = new String[stopwords.length + 50]; - System.arraycopy( stopwords, 0, tmp, 0, wordcount ); - stopwords = tmp; - } - stopwords[wordcount-1] = word; - } - result = makeWordTable( stopwords, wordcount ); - } - // On error, use an empty table - catch (IOException e) - { - result = new Hashtable(); - } - return result; - } + /** + * @param wordfile File containing the wordlist + */ + public static HashMap getWordtable(File wordfile) { + if (wordfile == null) { + return new HashMap(); + } + HashMap result = null; + try { + LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile)); + String word = null; + String[] stopwords = new String[100]; + int wordcount = 0; + while ((word = lnr.readLine()) != null) { + wordcount++; + if (wordcount == stopwords.length) { + String[] tmp = new String[stopwords.length + 50]; + System.arraycopy(stopwords, 0, tmp, 0, wordcount); + stopwords = tmp; + } + stopwords[wordcount - 1] = word; + } + result = makeWordTable(stopwords, wordcount); + } + // On error, use an empty table + catch (IOException e) { + result = new HashMap(); + } + return result; + } - /** - * Builds the wordlist table. - * - * @param words Word that where read - * @param length Amount of words that where read into words - */ - private static Hashtable makeWordTable( String[] words, int length ) - { - Hashtable table = new Hashtable( length ); - for ( int i = 0; i < length; i++ ) - { - table.put(words[i], words[i]); - } - return table; - } + /** + * Builds the wordlist table. + * + * @param words Word that where read + * @param length Amount of words that where read into words + */ + private static HashMap makeWordTable(String[] words, int length) { + HashMap table = new HashMap(length); + for (int i = 0; i < length; i++) { + table.put(words[i], words[i]); + } + return table; + } } \ No newline at end of file
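
Usage note (illustrative sketch, not part of the patch): the code below shows how client code drives the migrated sandbox analyzers after this change, passing stop words as a HashSet instead of a Hashtable. It relies only on members visible in the hunks above (the DutchAnalyzer(HashSet) constructor, setStemExclusionTable(String[]), tokenStream, TokenStream.next(), Token.termText()); the field name "contents" and the sample sentence are made up for the example.

import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.nl.DutchAnalyzer;

public class DutchAnalyzerUsageSketch {
    public static void main(String[] args) throws Exception {
        // Stop words now travel as a HashSet; the Hashtable-based constructors are deprecated or removed.
        HashSet stopWords = new HashSet();
        stopWords.add("de");
        stopWords.add("en");
        stopWords.add("van");

        DutchAnalyzer analyzer = new DutchAnalyzer(stopWords);

        // Words in the exclusion set are indexed but skip the stemmer.
        analyzer.setStemExclusionTable(new String[]{"fiets"});

        // Illustrative field name and text; any Reader will do.
        Reader reader = new StringReader("de fiets van het kind");
        TokenStream stream = analyzer.tokenStream("contents", reader);

        // TokenStream.next() of this era returns one Token per call and null at end of stream.
        for (Token token = stream.next(); token != null; token = stream.next()) {
            System.out.println(token.termText());
        }
    }
}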
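
A second sketch, equally illustrative, for the dictionary-stemming hook that this patch converts from Hashtable to HashMap: WordlistLoader.getStemDict(File) reads a tab-separated "word<TAB>stem" file into a HashMap, and DutchAnalyzer.setStemDictionary(File) feeds the same file format through to the stemmer so those entries overrule the algorithm. The file name dutch-stems.txt is hypothetical.

import java.io.File;
import java.util.HashMap;

import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.nl.WordlistLoader;

public class StemDictionarySketch {
    public static void main(String[] args) {
        // Hypothetical override file: one "word<TAB>stem" pair per line.
        File overrides = new File("dutch-stems.txt");

        // getStemDict returns a HashMap of word -> stem; on any I/O error it returns an empty map.
        HashMap stemDict = WordlistLoader.getStemDict(overrides);
        System.out.println("loaded " + stemDict.size() + " stem overrides");

        // The analyzer reads the same file and hands the resulting map to DutchStemFilter/DutchStemmer.
        DutchAnalyzer analyzer = new DutchAnalyzer();
        analyzer.setStemDictionary(overrides);
    }
}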
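
Finally, a minimal sketch of the stemmer-level override itself. DutchStemmer.setStemDictionary(HashMap) has default access in the hunk above, so the sketch declares the same package; the expectation that the bare algorithm would reduce "fiets" to "fiet" comes from the patch's own comment ("otherwise fiet"), and the second output line is printed without asserting its value.

package org.apache.lucene.analysis.nl; // same package, because setStemDictionary is package-private

import java.util.HashMap;

public class DutchStemmerSketch {
    public static void main(String[] args) {
        DutchStemmer stemmer = new DutchStemmer();

        // Dictionary entries overrule the algorithm, mirroring what DutchAnalyzer's constructor seeds.
        HashMap dict = new HashMap();
        dict.put("fiets", "fiets");
        stemmer.setStemDictionary(dict);

        System.out.println(stemmer.stem("fiets"));   // returned straight from the dictionary
        System.out.println(stemmer.stem("fietsen")); // algorithmic result, not asserted here
    }
}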