bringing sandbox analyzers up to date with changes to the core StopFilter and migrating away from using Hashtable

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150964 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2004-03-11 03:05:36 +00:00
parent badeb8cd5a
commit c4dd210b0f
11 changed files with 689 additions and 750 deletions

View File

@ -35,7 +35,7 @@ public final class LengthFilter extends TokenFilter {
*/
public LengthFilter(TokenStream in, int min, int max)
{
input = in;
super(in);
this.min = min;
this.max =max;
}

View File

@ -64,6 +64,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.Reader;
import java.util.Hashtable;
import java.util.HashSet;
/**
* Analyzer for brazilian language. Supports an external list of stopwords (words that
@ -102,57 +103,57 @@ public final class BrazilianAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the StopFilter.
*/
private Hashtable stoptable = new Hashtable();
private HashSet stoptable = new HashSet();
/**
* Contains words that should be indexed but not stemmed.
*/
private Hashtable excltable = new Hashtable();
private HashSet excltable = new HashSet();
/**
* Builds an analyzer.
*/
public BrazilianAnalyzer() {
stoptable = StopFilter.makeStopTable( BRAZILIAN_STOP_WORDS );
stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( String[] stopwords ) {
stoptable = StopFilter.makeStopTable( stopwords );
stoptable = StopFilter.makeStopSet( stopwords );
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( Hashtable stopwords ) {
stoptable = stopwords;
stoptable = new HashSet(stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( File stopwords ) {
stoptable = WordlistLoader.getWordtable( stopwords );
stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
}
/**
* Builds an exclusionlist from an array of Strings.
*/
public void setStemExclusionTable( String[] exclusionlist ) {
excltable = StopFilter.makeStopTable( exclusionlist );
excltable = StopFilter.makeStopSet( exclusionlist );
}
/**
* Builds an exclusionlist from a Hashtable.
*/
public void setStemExclusionTable( Hashtable exclusionlist ) {
excltable = exclusionlist;
excltable = new HashSet(exclusionlist.keySet());
}
/**
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable( File exclusionlist ) {
excltable = WordlistLoader.getWordtable( exclusionlist );
excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
}
/**

View File

@ -59,6 +59,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.HashSet;
/**
* Based on (copied) the GermanStemFilter
@ -79,7 +80,7 @@ public final class BrazilianStemFilter extends TokenFilter {
*/
private Token token = null;
private BrazilianStemmer stemmer = null;
private Hashtable exclusions = null;
private HashSet exclusions = null;
public BrazilianStemFilter( TokenStream in ) {
super(in);
@ -88,8 +89,15 @@ public final class BrazilianStemFilter extends TokenFilter {
/**
* Builds a BrazilianStemFilter that uses an exclusiontable.
*
* @deprecated
*/
public BrazilianStemFilter( TokenStream in, Hashtable exclusiontable ) {
this( in );
this.exclusions = new HashSet(exclusiontable.keySet());
}
public BrazilianStemFilter( TokenStream in, HashSet exclusiontable ) {
this( in );
this.exclusions = exclusiontable;
}

View File

@ -63,6 +63,7 @@ import org.apache.lucene.analysis.TokenStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.HashSet;
/**
@ -91,7 +92,7 @@ public class CJKAnalyzer extends Analyzer {
//~ Instance fields --------------------------------------------------------
/** stop word list */
private Hashtable stopTable;
private HashSet stopTable;
//~ Constructors -----------------------------------------------------------
@ -99,7 +100,7 @@ public class CJKAnalyzer extends Analyzer {
* Builds an analyzer which removes words in STOP_WORDS.
*/
public CJKAnalyzer() {
stopTable = StopFilter.makeStopTable(stopWords);
stopTable = StopFilter.makeStopSet(stopWords);
}
/**
@ -108,7 +109,7 @@ public class CJKAnalyzer extends Analyzer {
* @param stopWords stop word array
*/
public CJKAnalyzer(String[] stopWords) {
stopTable = StopFilter.makeStopTable(stopWords);
stopTable = StopFilter.makeStopSet(stopWords);
}
//~ Methods ----------------------------------------------------------------

View File

@ -64,6 +64,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.*;
import java.util.Hashtable;
import java.util.HashSet;
/**
* Analyzer for Czech language. Supports an external list of stopwords (words that
@ -102,26 +103,32 @@ public final class CzechAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the StopFilter.
*/
private Hashtable stoptable = new Hashtable();
private HashSet stoptable;
/**
* Builds an analyzer.
*/
public CzechAnalyzer() {
stoptable = StopFilter.makeStopTable( STOP_WORDS );
stoptable = StopFilter.makeStopSet( STOP_WORDS );
}
/**
* Builds an analyzer with the given stop words.
*/
public CzechAnalyzer( String[] stopwords ) {
stoptable = StopFilter.makeStopTable( stopwords );
stoptable = StopFilter.makeStopSet( stopwords );
}
/**
* Builds an analyzer with the given stop words.
*
* @deprecated
*/
public CzechAnalyzer( Hashtable stopwords ) {
stoptable = new HashSet(stopwords.keySet());
}
public CzechAnalyzer( HashSet stopwords ) {
stoptable = stopwords;
}
@ -129,7 +136,7 @@ public final class CzechAnalyzer extends Analyzer {
* Builds an analyzer with the given stop words.
*/
public CzechAnalyzer( File stopwords ) {
stoptable = WordlistLoader.getWordtable( stopwords );
stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
}
/**
@ -139,12 +146,12 @@ public final class CzechAnalyzer extends Analyzer {
*/
public void loadStopWords( InputStream wordfile, String encoding ) {
if ( wordfile == null ) {
stoptable = new Hashtable();
stoptable = new HashSet();
return;
}
try {
// clear any previous table (if present)
stoptable = new Hashtable();
stoptable = new HashSet();
InputStreamReader isr;
if (encoding == null)
@ -156,7 +163,7 @@ public final class CzechAnalyzer extends Analyzer {
LineNumberReader lnr = new LineNumberReader(isr);
String word;
while ( ( word = lnr.readLine() ) != null ) {
stoptable.put(word, word);
stoptable.add(word);
}
} catch ( IOException e ) {

View File

@ -63,6 +63,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.Reader;
import java.util.Hashtable;
import java.util.HashSet;
import org.apache.lucene.analysis.de.WordlistLoader;
/**
@ -108,57 +110,59 @@ public final class FrenchAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the StopFilter.
*/
private Hashtable stoptable = new Hashtable();
private HashSet stoptable = new HashSet();
/**
* Contains words that should be indexed but not stemmed.
*/
private Hashtable excltable = new Hashtable();
private HashSet excltable = new HashSet();
/**
* Builds an analyzer.
*/
public FrenchAnalyzer() {
stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS );
stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS );
}
/**
* Builds an analyzer with the given stop words.
*/
public FrenchAnalyzer( String[] stopwords ) {
stoptable = StopFilter.makeStopTable( stopwords );
stoptable = StopFilter.makeStopSet( stopwords );
}
/**
* Builds an analyzer with the given stop words.
*
* @deprecated
*/
public FrenchAnalyzer( Hashtable stopwords ) {
stoptable = stopwords;
stoptable = new HashSet(stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
*/
public FrenchAnalyzer( File stopwords ) {
stoptable = WordlistLoader.getWordtable( stopwords );
stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
}
/**
* Builds an exclusionlist from an array of Strings.
*/
public void setStemExclusionTable( String[] exclusionlist ) {
excltable = StopFilter.makeStopTable( exclusionlist );
excltable = StopFilter.makeStopSet( exclusionlist );
}
/**
* Builds an exclusionlist from a Hashtable.
*/
public void setStemExclusionTable( Hashtable exclusionlist ) {
excltable = exclusionlist;
excltable = new HashSet(exclusionlist.keySet());
}
/**
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable( File exclusionlist ) {
excltable = WordlistLoader.getWordtable( exclusionlist );
excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
}
/**

View File

@ -59,6 +59,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.HashSet;
/**
* A filter that stemms french words. It supports a table of words that should
@ -74,7 +75,7 @@ public final class FrenchStemFilter extends TokenFilter {
*/
private Token token = null;
private FrenchStemmer stemmer = null;
private Hashtable exclusions = null;
private HashSet exclusions = null;
public FrenchStemFilter( TokenStream in ) {
super(in);
@ -83,8 +84,15 @@ public final class FrenchStemFilter extends TokenFilter {
/**
* Builds a FrenchStemFilter that uses an exclusiontable.
*
* @deprecated
*/
public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) {
this( in );
exclusions = new HashSet(exclusiontable.keySet());
}
public FrenchStemFilter( TokenStream in, HashSet exclusiontable ) {
this( in );
exclusions = exclusiontable;
}
@ -122,7 +130,7 @@ public final class FrenchStemFilter extends TokenFilter {
* Set an alternative exclusion list for this filter.
*/
public void setExclusionTable( Hashtable exclusiontable ) {
exclusions = exclusiontable;
exclusions = new HashSet(exclusiontable.keySet());
}
}

View File

@ -21,148 +21,137 @@ import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.Token;
import java.io.File;
import java.io.*;
import java.io.Reader;
import java.util.Hashtable;
import java.util.HashMap;
import java.util.HashSet;
/**
*
* @author Edwin de Jonge
*
* Analyzer for Dutch language. Supports an external list of stopwords (words that
* will not be indexed at all), an external list of exclusions (word that will
* not be stemmed, but indexed) and an external list of word-stem pairs that overrule
* the algorithm (dictionary stemming).
* A default set of stopwords is used unless an alternative list is specified, the
* exclusion list is empty by default.
* As start for the Analyzer the German Analyzer was used. The stemming algorithm
* implemented can be found at @link
* <p/>
* Analyzer for Dutch language. Supports an external list of stopwords (words that
* will not be indexed at all), an external list of exclusions (word that will
* not be stemmed, but indexed) and an external list of word-stem pairs that overrule
* the algorithm (dictionary stemming).
* A default set of stopwords is used unless an alternative list is specified, the
* exclusion list is empty by default.
* As start for the Analyzer the German Analyzer was used. The stemming algorithm
* implemented can be found at @link
*/
public class DutchAnalyzer extends Analyzer
{
/**
* List of typical Dutch stopwords.
*/
private String[] DUTCH_STOP_WORDS =
{
"de","en","van","ik","te","dat","die","in","een",
"hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
"er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
"door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
"haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
"me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
"veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
"alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
"wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
"uw","iemand","geweest","andere"
};
public class DutchAnalyzer extends Analyzer {
/**
* List of typical Dutch stopwords.
*/
private String[] DUTCH_STOP_WORDS =
{
"de", "en", "van", "ik", "te", "dat", "die", "in", "een",
"hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
"er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
"door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
"haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
"me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
"veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
"alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
"wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
"uw", "iemand", "geweest", "andere"
};
/**
* Contains the stopwords used with the StopFilter.
*/
private Hashtable stoptable = new Hashtable();
/**
* Contains the stopwords used with the StopFilter.
*/
private HashSet stoptable = new HashSet();
/**
* Contains words that should be indexed but not stemmed.
*/
private Hashtable excltable = new Hashtable();
/**
* Contains words that should be indexed but not stemmed.
*/
private HashSet excltable = new HashSet();
private Hashtable _stemdict = new Hashtable();
private HashMap _stemdict = new HashMap();
/**
* Builds an analyzer.
*/
public DutchAnalyzer()
{
stoptable = StopFilter.makeStopTable( DUTCH_STOP_WORDS );
_stemdict.put("fiets","fiets"); //otherwise fiet
_stemdict.put("bromfiets","bromfiets"); //otherwise bromfiet
_stemdict.put("ei","eier");
_stemdict.put("kind","kinder");
}
/**
* Builds an analyzer.
*/
public DutchAnalyzer() {
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
_stemdict.put("fiets", "fiets"); //otherwise fiet
_stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
_stemdict.put("ei", "eier");
_stemdict.put("kind", "kinder");
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
*/
public DutchAnalyzer( String[] stopwords )
{
stoptable = StopFilter.makeStopTable( stopwords );
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
*/
public DutchAnalyzer(String[] stopwords) {
stoptable = StopFilter.makeStopSet(stopwords);
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
*/
public DutchAnalyzer( Hashtable stopwords )
{
stoptable = stopwords;
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
*/
public DutchAnalyzer(HashSet stopwords) {
stoptable = stopwords;
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
*/
public DutchAnalyzer( File stopwords )
{
stoptable = WordlistLoader.getWordtable( stopwords );
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
*/
public DutchAnalyzer(File stopwords) {
stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
}
/**
* Builds an exclusionlist from an array of Strings.
*
* @param exclusionlist
*/
public void setStemExclusionTable( String[] exclusionlist )
{
excltable = StopFilter.makeStopTable( exclusionlist );
}
/**
* Builds an exclusionlist from an array of Strings.
*
* @param exclusionlist
*/
public void setStemExclusionTable(String[] exclusionlist) {
excltable = StopFilter.makeStopSet(exclusionlist);
}
/**
* Builds an exclusionlist from a Hashtable.
*/
public void setStemExclusionTable( Hashtable exclusionlist )
{
excltable = exclusionlist;
}
/**
* Builds an exclusionlist from a Hashtable.
*/
public void setStemExclusionTable(HashSet exclusionlist) {
excltable = exclusionlist;
}
/**
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable(File exclusionlist)
{
excltable = WordlistLoader.getWordtable(exclusionlist);
}
/**
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable(File exclusionlist) {
excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
}
/**
* Reads a stemdictionary file , that overrules the stemming algorithm
* This is a textfile that contains per line
* word\tstem
* i.e: tabseperated
*/
public void setStemDictionary(File stemdict)
{
_stemdict = WordlistLoader.getStemDict(stemdict);
}
/**
* Reads a stemdictionary file , that overrules the stemming algorithm
* This is a textfile that contains per line
* word\tstem
* i.e: tabseperated
*/
public void setStemDictionary(File stemdict) {
_stemdict = WordlistLoader.getStemDict(stemdict);
}
/**
* Creates a TokenStream which tokenizes all the text in the provided TextReader.
*
* @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream result = new StandardTokenizer( reader );
result = new StandardFilter( result );
result = new StopFilter( result, stoptable );
result = new DutchStemFilter( result, excltable, _stemdict);
return result;
}
/**
* Creates a TokenStream which tokenizes all the text in the provided TextReader.
*
* @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new StopFilter(result, stoptable);
result = new DutchStemFilter(result, excltable, _stemdict);
return result;
}
}

View File

@ -19,105 +19,91 @@ package org.apache.lucene.analysis.nl;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.HashMap;
import java.util.HashSet;
/**
*
* @author Edwin de Jonge
*
* A filter that stems Dutch words. It supports a table of words that should
* not be stemmed at all. The stemmer used can be changed at runtime after the
* filter object is created (as long as it is a DutchStemmer).
* <p/>
* A filter that stems Dutch words. It supports a table of words that should
* not be stemmed at all. The stemmer used can be changed at runtime after the
* filter object is created (as long as it is a DutchStemmer).
*/
public final class DutchStemFilter extends TokenFilter
{
/**
* The actual token in the input stream.
*/
private Token token = null;
private DutchStemmer stemmer = null;
private Hashtable exclusions = null;
public final class DutchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
*/
private Token token = null;
private DutchStemmer stemmer = null;
private HashSet exclusions = null;
public DutchStemFilter( TokenStream _in )
{
super(_in);
stemmer = new DutchStemmer();
}
public DutchStemFilter(TokenStream _in) {
super(_in);
stemmer = new DutchStemmer();
}
/**
* Builds a DutchStemFilter that uses an exclusiontable.
*/
public DutchStemFilter( TokenStream _in, Hashtable exclusiontable )
{
this(_in);
exclusions = exclusiontable;
}
/**
* Builds a DutchStemFilter that uses an exclusiontable.
*/
public DutchStemFilter(TokenStream _in, HashSet exclusiontable) {
this(_in);
exclusions = exclusiontable;
}
/**
* @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
*/
public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary)
{
this(_in, exclusiontable);
stemmer.setStemDictionary(stemdictionary);
}
/**
* @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
*/
public DutchStemFilter(TokenStream _in, HashSet exclusiontable, HashMap stemdictionary) {
this(_in, exclusiontable);
stemmer.setStemDictionary(stemdictionary);
}
/**
* @return Returns the next token in the stream, or null at EOS
*/
public Token next() throws IOException
/**
* @return Returns the next token in the stream, or null at EOS
*/
public Token next() throws IOException {
if ((token = input.next()) == null) {
return null;
}
{
if ( ( token = input.next() ) == null )
{
return null;
}
// Check the exclusiontable
else if (exclusions != null && exclusions.contains(token.termText())) {
return token;
} else {
String s = stemmer.stem(token.termText());
// If not stemmed, dont waste the time creating a new token
if (!s.equals(token.termText())) {
return new Token(s, token.startOffset(),
token.endOffset(), token.type());
}
return token;
}
}
// Check the exclusiontable
else if ( exclusions != null && exclusions.contains( token.termText() ) )
{
return token;
}
else
{
String s = stemmer.stem( token.termText() );
// If not stemmed, dont waste the time creating a new token
if ( !s.equals( token.termText() ) )
{
return new Token( s, token.startOffset(),
token.endOffset(), token.type() );
}
return token;
}
}
/**
* Set a alternative/custom DutchStemmer for this filter.
*/
public void setStemmer(DutchStemmer stemmer) {
if (stemmer != null) {
this.stemmer = stemmer;
}
}
/**
* Set a alternative/custom DutchStemmer for this filter.
*/
public void setStemmer( DutchStemmer stemmer )
{
if ( stemmer != null )
{
this.stemmer = stemmer;
}
}
/**
* Set an alternative exclusion list for this filter.
*/
public void setExclusionTable(HashSet exclusiontable) {
exclusions = exclusiontable;
}
/**
* Set an alternative exclusion list for this filter.
*/
public void setExclusionTable( Hashtable exclusiontable )
{
exclusions = exclusiontable;
}
/**
* Set dictionary for stemming, this dictionary overrules the algorithm,
* so you can correct for a particular unwanted word-stem pair.
*/
public void setStemDictionary(Hashtable dict)
{
if (stemmer != null)
stemmer.setStemDictionary(dict);
}
/**
* Set dictionary for stemming, this dictionary overrules the algorithm,
* so you can correct for a particular unwanted word-stem pair.
*/
public void setStemDictionary(HashMap dict) {
if (stemmer != null)
stemmer.setStemDictionary(dict);
}
}

View File

@ -16,9 +16,8 @@ package org.apache.lucene.analysis.nl;
* limitations under the License.
*/
import java.util.Hashtable;
import java.util.ArrayList;
import java.io.*;
import java.util.HashMap;
/*
* @author Edwin de Jonge (ejne@cbs.nl)
*
@ -26,427 +25,382 @@ import java.io.*;
* the <see cref="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
* algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?):
*/
public class DutchStemmer
{
/**
* Buffer for the terms while stemming them.
*/
private StringBuffer sb = new StringBuffer();
private boolean _removedE;
private Hashtable _stemDict;
private int _R1;
private int _R2;
public class DutchStemmer {
/**
* Buffer for the terms while stemming them.
*/
private StringBuffer sb = new StringBuffer();
private boolean _removedE;
private HashMap _stemDict;
//TODO convert to internal
/*
* Stemms the given term to an unique <tt>discriminator</tt>.
*
* @param term The term that should be stemmed.
* @return Discriminator for <tt>term</tt>
*/
public String stem( String term )
{
term = term.toLowerCase();
if ( !isStemmable( term ) )
return term;
if (_stemDict != null && _stemDict.contains(term))
if (_stemDict.get(term) instanceof String)
return (String)_stemDict.get(term);
else return null;
private int _R1;
private int _R2;
// Reset the StringBuffer.
sb.delete(0, sb.length());
sb.insert(0, term);
// Stemming starts here...
substitute(sb);
storeYandI(sb);
_R1 = getRIndex(sb, 0);
_R1 = Math.max(3,_R1);
step1(sb);
step2(sb);
_R2 = getRIndex(sb, _R1);
step3a(sb);
step3b(sb);
step4(sb);
reStoreYandI(sb);
return sb.toString();
}
//TODO convert to internal
/*
* Stemms the given term to an unique <tt>discriminator</tt>.
*
* @param term The term that should be stemmed.
* @return Discriminator for <tt>term</tt>
*/
public String stem(String term) {
term = term.toLowerCase();
if (!isStemmable(term))
return term;
if (_stemDict != null && _stemDict.containsKey(term))
if (_stemDict.get(term) instanceof String)
return (String) _stemDict.get(term);
else
return null;
private boolean enEnding(StringBuffer sb)
{
String[] enend = new String[]{"ene","en"};
for (int i = 0; i < enend.length; i++)
{
String end = enend[i];
String s = sb.toString();
int index = s.length() - end.length();
if ( s.endsWith(end) &&
index >= _R1 &&
isValidEnEnding(sb,index-1)
)
{
sb.delete(index, index + end.length());
unDouble(sb,index);
return true;
}
}
return false;
}
// Reset the StringBuffer.
sb.delete(0, sb.length());
sb.insert(0, term);
// Stemming starts here...
substitute(sb);
storeYandI(sb);
_R1 = getRIndex(sb, 0);
_R1 = Math.max(3, _R1);
step1(sb);
step2(sb);
_R2 = getRIndex(sb, _R1);
step3a(sb);
step3b(sb);
step4(sb);
reStoreYandI(sb);
return sb.toString();
}
private boolean enEnding(StringBuffer sb) {
String[] enend = new String[]{"ene", "en"};
for (int i = 0; i < enend.length; i++) {
String end = enend[i];
String s = sb.toString();
int index = s.length() - end.length();
if (s.endsWith(end) &&
index >= _R1 &&
isValidEnEnding(sb, index - 1)
) {
sb.delete(index, index + end.length());
unDouble(sb, index);
return true;
}
}
return false;
}
private void step1(StringBuffer sb)
{
if (_R1 >= sb.length())
return;
private void step1(StringBuffer sb) {
if (_R1 >= sb.length())
return;
String s = sb.toString();
int lengthR1 = sb.length() - _R1;
int index;
String s = sb.toString();
int lengthR1 = sb.length() - _R1;
int index;
if (s.endsWith("heden"))
{
sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
return;
}
if (s.endsWith("heden")) {
sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
return;
}
if (enEnding(sb))
return;
if (enEnding(sb))
return;
if (s.endsWith("se") &&
(index = s.length() - 2) >= _R1 &&
isValidSEnding(sb, index -1)
)
{
sb.delete(index, index + 2);
return;
}
if (s.endsWith("s") &&
(index = s.length() - 1) >= _R1 &&
isValidSEnding(sb, index - 1))
{
sb.delete(index, index + 1);
}
}
if (s.endsWith("se") &&
(index = s.length() - 2) >= _R1 &&
isValidSEnding(sb, index - 1)
) {
sb.delete(index, index + 2);
return;
}
if (s.endsWith("s") &&
(index = s.length() - 1) >= _R1 &&
isValidSEnding(sb, index - 1)) {
sb.delete(index, index + 1);
}
}
/**
* Delete suffix e if in R1 and
* preceded by a non-vowel, and then undouble the ending
*
* @param sb String being stemmed
*/
private void step2(StringBuffer sb)
{
_removedE = false;
if (_R1 >= sb.length())
return;
String s = sb.toString();
int index = s.length() - 1;
if ( index >= _R1 &&
s.endsWith("e") &&
!isVowel(sb.charAt(index-1)))
{
sb.delete(index, index + 1);
unDouble(sb);
_removedE = true;
}
}
/**
* Delete suffix e if in R1 and
* preceded by a non-vowel, and then undouble the ending
*
* @param sb String being stemmed
*/
private void step2(StringBuffer sb) {
_removedE = false;
if (_R1 >= sb.length())
return;
String s = sb.toString();
int index = s.length() - 1;
if (index >= _R1 &&
s.endsWith("e") &&
!isVowel(sb.charAt(index - 1))) {
sb.delete(index, index + 1);
unDouble(sb);
_removedE = true;
}
}
/**
* Delete "heid"
*
* @param sb String being stemmed
*/
private void step3a(StringBuffer sb)
{
if (_R2 >= sb.length())
return;
String s = sb.toString();
int index = s.length() - 4;
if (s.endsWith("heid")&& index >= _R2 && sb.charAt(index - 1) != 'c')
{
sb.delete(index, index + 4); //remove heid
enEnding(sb);
}
}
/**
* Delete "heid"
*
* @param sb String being stemmed
*/
private void step3a(StringBuffer sb) {
if (_R2 >= sb.length())
return;
String s = sb.toString();
int index = s.length() - 4;
if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
sb.delete(index, index + 4); //remove heid
enEnding(sb);
}
}
/**
* <p>A d-suffix, or derivational suffix, enables a new word,
* often with a different grammatical category, or with a different
* sense, to be built from another word. Whether a d-suffix can be
* attached is discovered not from the rules of grammar, but by
* referring to a dictionary. So in English, ness can be added to
* certain adjectives to form corresponding nouns (littleness,
* kindness, foolishness ...) but not to all adjectives
* (not for example, to big, cruel, wise ...) d-suffixes can be
* used to change meaning, often in rather exotic ways.</p>
* Remove "ing", "end", "ig", "lijk", "baar" and "bar"
*
* @param sb String being stemmed
*/
private void step3b(StringBuffer sb)
{
if (_R2 >= sb.length())
return;
String s = sb.toString();
int index;
/**
* <p>A d-suffix, or derivational suffix, enables a new word,
* often with a different grammatical category, or with a different
* sense, to be built from another word. Whether a d-suffix can be
* attached is discovered not from the rules of grammar, but by
* referring to a dictionary. So in English, ness can be added to
* certain adjectives to form corresponding nouns (littleness,
* kindness, foolishness ...) but not to all adjectives
* (not for example, to big, cruel, wise ...) d-suffixes can be
* used to change meaning, often in rather exotic ways.</p>
* Remove "ing", "end", "ig", "lijk", "baar" and "bar"
*
* @param sb String being stemmed
*/
private void step3b(StringBuffer sb) {
if (_R2 >= sb.length())
return;
String s = sb.toString();
int index = 0;
if ((s.endsWith("end") || s.endsWith("ing")) &&
(index = s.length() - 3) >= _R2)
{
sb.delete(index, index + 3);
if (sb.charAt(index - 2) == 'i' &&
sb.charAt(index - 1) == 'g')
{
if (sb.charAt(index - 3) != 'e' & index-2 >= _R2)
{
index -= 2;
sb.delete(index, index + 2);
}
}
else
{
unDouble(sb,index);
}
return;
}
if ( s.endsWith("ig") &&
(index = s.length() - 2) >= _R2
)
{
if (sb.charAt(index - 1) != 'e')
sb.delete(index, index + 2);
return;
}
if (s.endsWith("lijk") &&
(index = s.length() - 4) >= _R2
)
{
sb.delete(index, index + 4);
step2(sb);
return;
}
if (s.endsWith("baar") &&
(index = s.length() - 4) >= _R2
)
{
sb.delete(index, index + 4);
return;
}
if (s.endsWith("bar") &&
(index = s.length() - 3) >= _R2
)
{
if (_removedE)
sb.delete(index, index + 3);
return;
}
}
if ((s.endsWith("end") || s.endsWith("ing")) &&
(index = s.length() - 3) >= _R2) {
sb.delete(index, index + 3);
if (sb.charAt(index - 2) == 'i' &&
sb.charAt(index - 1) == 'g') {
if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
index -= 2;
sb.delete(index, index + 2);
}
} else {
unDouble(sb, index);
}
return;
}
if (s.endsWith("ig") &&
(index = s.length() - 2) >= _R2
) {
if (sb.charAt(index - 1) != 'e')
sb.delete(index, index + 2);
return;
}
if (s.endsWith("lijk") &&
(index = s.length() - 4) >= _R2
) {
sb.delete(index, index + 4);
step2(sb);
return;
}
if (s.endsWith("baar") &&
(index = s.length() - 4) >= _R2
) {
sb.delete(index, index + 4);
return;
}
if (s.endsWith("bar") &&
(index = s.length() - 3) >= _R2
) {
if (_removedE)
sb.delete(index, index + 3);
return;
}
}
/**
* undouble vowel
* If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
*
* @param sb String being stemmed
*/
private void step4(StringBuffer sb)
{
if (sb.length() < 4)
return;
String end = sb.substring(sb.length() - 4, sb.length());
char c = end.charAt(0);
char v1 = end.charAt(1);
char v2 = end.charAt(2);
char d = end.charAt(3);
if (v1 == v2 &&
d != 'I' &&
v1 != 'i' &&
isVowel(v1) &&
!isVowel(d) &&
!isVowel(c))
{
sb.delete(sb.length() - 2, sb.length() - 1);
}
}
/**
 * Undoubles a trailing vowel pair.
 * If the word ends CVD, where C is a non-vowel, D is a non-vowel other
 * than I, and V is a doubled a, e, o or u, remove one of the vowels from
 * V (for example, maan -> man, brood -> brod).
 *
 * @param sb String being stemmed (modified in place)
 */
private void step4(StringBuffer sb) {
if (sb.length() < 4)
return;
// inspect the last four characters: C, V, V, D
String end = sb.substring(sb.length() - 4, sb.length());
char c = end.charAt(0);
char v1 = end.charAt(1);
char v2 = end.charAt(2);
char d = end.charAt(3);
if (v1 == v2 &&
d != 'I' &&
v1 != 'i' &&
isVowel(v1) &&
!isVowel(d) &&
!isVowel(c)) {
// delete the second vowel of the doubled pair
sb.delete(sb.length() - 2, sb.length() - 1);
}
}
/**
 * Tells whether a term is a candidate for stemming: only terms made up
 * entirely of letters are stemmable.
 *
 * @return true if, and only if, the given term consists in letters.
 */
private boolean isStemmable( String term )
{
    int length = term.length();
    for ( int i = 0; i < length; i++ )
    {
        // any non-letter (digit, punctuation, ...) disqualifies the term
        if ( !Character.isLetter( term.charAt( i ) ) )
        {
            return false;
        }
    }
    return true;
}
/**
 * Checks if a term could be stemmed.
 * Only terms consisting entirely of letters qualify.
 *
 * @return true if, and only if, the given term consists in letters.
 */
private boolean isStemmable(String term) {
for (int c = 0; c < term.length(); c++) {
// any non-letter (digit, punctuation, ...) makes the term unstemmable
if (!Character.isLetter(term.charAt(c))) return false;
}
return true;
}
/**
 * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
 * Accented vowels are normalized to their unaccented counterparts so
 * later stemming steps only have to deal with plain a, e, i, o and u.
 *
 * @param buffer term being normalized (modified in place)
 */
private void substitute( StringBuffer buffer )
{
    for ( int i = 0; i < buffer.length(); i++ )
    {
        switch (buffer.charAt(i))
        {
            case 'ä':
            case 'á':
            {
                buffer.setCharAt(i, 'a');
                break;
            }
            case 'ë':
            case 'é':
            {
                buffer.setCharAt(i, 'e');
                break;
            }
            case 'ü':
            case 'ú':
            {
                buffer.setCharAt(i, 'u');
                break;
            }
            case 'ï':
            // was "case 'i':", a no-op that left í unsubstituted,
            // contradicting the javadoc above
            case 'í':
            {
                buffer.setCharAt(i, 'i');
                break;
            }
            case 'ö':
            case 'ó':
            {
                buffer.setCharAt(i, 'o');
                break;
            }
        }
    }
}
/**
 * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
 * Accented vowels are normalized to their unaccented counterparts so
 * later stemming steps only have to deal with plain a, e, i, o and u.
 *
 * @param buffer term being normalized (modified in place)
 */
private void substitute(StringBuffer buffer) {
    for (int i = 0; i < buffer.length(); i++) {
        switch (buffer.charAt(i)) {
            case 'ä':
            case 'á':
            {
                buffer.setCharAt(i, 'a');
                break;
            }
            case 'ë':
            case 'é':
            {
                buffer.setCharAt(i, 'e');
                break;
            }
            case 'ü':
            case 'ú':
            {
                buffer.setCharAt(i, 'u');
                break;
            }
            case 'ï':
            // was "case 'i':", a no-op that left í unsubstituted,
            // contradicting the javadoc above
            case 'í':
            {
                buffer.setCharAt(i, 'i');
                break;
            }
            case 'ö':
            case 'ó':
            {
                buffer.setCharAt(i, 'o');
                break;
            }
        }
    }
}
/**
 * Convenience overload: applies the s-ending rule to the buffer's
 * final character.
 */
private boolean isValidSEnding(StringBuffer sb)
{
    int lastIndex = sb.length() - 1;
    return isValidSEnding( sb, lastIndex );
}
/**
 * Checks whether the buffer's final character is a valid s-ending.
 */
private boolean isValidSEnding(StringBuffer sb) {
return isValidSEnding(sb, sb.length() - 1);
}
/**
 * A valid s-ending is a character that is neither a vowel nor 'j'.
 *
 * @param index position of the character to test
 */
private boolean isValidSEnding(StringBuffer sb, int index)
{
    char ch = sb.charAt(index);
    return !isVowel(ch) && ch != 'j';
}
/**
 * Checks whether the character at the given index is a valid s-ending,
 * i.e. neither a vowel nor 'j'.
 */
private boolean isValidSEnding(StringBuffer sb, int index) {
char c = sb.charAt(index);
if (isVowel(c) || c == 'j')
return false;
return true;
}
/**
 * Convenience overload: applies the en-ending rule to the buffer's
 * final character.
 */
private boolean isValidEnEnding(StringBuffer sb)
{
    int lastIndex = sb.length() - 1;
    return isValidEnEnding( sb, lastIndex );
}
/**
 * Checks whether the buffer's final character is a valid en-ending.
 */
private boolean isValidEnEnding(StringBuffer sb) {
return isValidEnEnding(sb, sb.length() - 1);
}
/**
 * A valid en-ending is a non-vowel that does not complete a trailing
 * "gem".
 *
 * @param index position of the character to test
 */
private boolean isValidEnEnding(StringBuffer sb, int index)
{
    char c = sb.charAt(index);
    if (isVowel(c))
        return false;
    // Guard the look-behind below. The original test was "c < 3", which
    // compared the character code instead of the index; it never fired
    // for letters, so charAt(index - 2) could run out of bounds on very
    // short buffers ending in 'm'.
    if (index < 2)
        return true;
    // ends with "gem"?
    if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index-1) == 'e')
        return false;
    return true;
}
/**
 * A valid en-ending is a non-vowel that does not complete a trailing
 * "gem".
 *
 * @param index position of the character to test
 */
private boolean isValidEnEnding(StringBuffer sb, int index) {
    char c = sb.charAt(index);
    if (isVowel(c))
        return false;
    // Guard the look-behind below. The original test was "c < 3", which
    // compared the character code instead of the index; it never fired
    // for letters, so charAt(index - 2) could run out of bounds on very
    // short buffers ending in 'm'.
    if (index < 2)
        return true;
    // ends with "gem"?
    if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
        return false;
    return true;
}
/**
 * Undoubles a doubled consonant at the very end of the buffer.
 */
private void unDouble(StringBuffer sb)
{
    int end = sb.length();
    unDouble( sb, end );
}
/**
 * Undoubles a doubled consonant at the very end of the buffer.
 */
private void unDouble(StringBuffer sb) {
unDouble(sb, sb.length());
}
/**
 * If the text before <code>endIndex</code> ends in a doubled k, t, d,
 * n, m or f, removes the final character of the pair
 * (e.g. "katt" -> "kat").
 */
private void unDouble(StringBuffer sb, int endIndex)
{
    // fewer than two characters can never hold a doubled consonant
    if (endIndex < 2)
        return;
    char last = sb.charAt(endIndex - 1);
    if (last == sb.charAt(endIndex - 2) && "ktdnmf".indexOf(last) >= 0)
    {
        sb.delete(endIndex - 1, endIndex);
    }
}
/**
 * If the text before <code>endIndex</code> ends in a doubled k, t, d,
 * n, m or f, removes the last character of that pair
 * (e.g. "katt" -> "kat").
 */
private void unDouble(StringBuffer sb, int endIndex) {
String s = sb.substring(0, endIndex);
if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
sb.delete(endIndex - 1, endIndex);
}
}
/**
 * Finds the R-region boundary: the position just after the first
 * non-vowel that follows a vowel, scanning from <code>start</code>
 * (a start of 0 is treated as 1 so there is always a predecessor).
 * Returns sb.length() + 1 when no such position exists.
 */
private int getRIndex(StringBuffer sb, int start)
{
    int i = (start == 0) ? 1 : start;
    while (i < sb.length())
    {
        // first non-vowel preceded by a vowel
        boolean vowelThenConsonant =
                isVowel(sb.charAt(i - 1)) && !isVowel(sb.charAt(i));
        if (vowelThenConsonant)
            break;
        i++;
    }
    return i + 1;
}
/**
 * Finds the R-region boundary: the position just after the first
 * non-vowel that follows a vowel, scanning from <code>start</code>
 * (a start of 0 is treated as 1 so there is always a predecessor).
 * Returns sb.length() + 1 when no such position exists.
 */
private int getRIndex(StringBuffer sb, int start) {
if (start == 0)
start = 1;
int i = start;
for (; i < sb.length(); i++) {
//first non-vowel preceded by a vowel
if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
return i + 1;
}
}
return i + 1;
}
private void storeYandI(StringBuffer sb)
{
if (sb.charAt(0) == 'y')
sb.setCharAt(0, 'Y');
private void storeYandI(StringBuffer sb) {
if (sb.charAt(0) == 'y')
sb.setCharAt(0, 'Y');
char c;
int last = sb.length() - 1;
char c;
int last = sb.length() - 1;
for (int i = 1; i < last; i++)
{
switch (sb.charAt(i))
{
case 'i':
{
if (isVowel(sb.charAt(i-1)) &&
isVowel(sb.charAt(i+1))
)
sb.setCharAt(i, 'I');
break;
}
case 'y':
{
if (isVowel(sb.charAt(i-1)))
sb.setCharAt(i, 'Y');
break;
}
}
}
if (last > 0 && sb.charAt(last)=='y' && isVowel(sb.charAt(last-1)))
sb.setCharAt(last, 'Y');
}
for (int i = 1; i < last; i++) {
switch (sb.charAt(i)) {
case 'i':
{
if (isVowel(sb.charAt(i - 1)) &&
isVowel(sb.charAt(i + 1))
)
sb.setCharAt(i, 'I');
break;
}
case 'y':
{
if (isVowel(sb.charAt(i - 1)))
sb.setCharAt(i, 'Y');
break;
}
}
}
if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
sb.setCharAt(last, 'Y');
}
/**
 * Restores the consonant markers set by storeYandI: every 'I' becomes
 * 'i' again and every 'Y' becomes 'y'.
 */
private void reStoreYandI(StringBuffer sb)
{
    for (int i = 0; i < sb.length(); i++)
    {
        char ch = sb.charAt(i);
        if (ch == 'I')
            sb.setCharAt(i, 'i');
        else if (ch == 'Y')
            sb.setCharAt(i, 'y');
    }
}
/**
 * Restores the consonant markers set by storeYandI: every 'I' becomes
 * 'i' again and every 'Y' becomes 'y'.
 */
private void reStoreYandI(StringBuffer sb) {
String tmp = sb.toString();
sb.delete(0, sb.length());
sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
}
/**
 * Tells whether the character counts as a vowel for this stemmer
 * (a, e, i, o, u, y or è).
 */
private boolean isVowel(char c)
{
    return "eaoiuyè".indexOf(c) >= 0;
}
/**
 * Tells whether the character counts as a vowel for this stemmer
 * (a, e, i, o, u, y or è).
 */
private boolean isVowel(char c) {
switch (c) {
case 'e':
case 'a':
case 'o':
case 'i':
case 'u':
case 'y':
case 'è':
{
return true;
}
}
return false;
}
/**
 * Installs a stem dictionary that overrules the stemming algorithm.
 * Package-private: intended to be called only from the analyzer.
 */
void setStemDictionary(Hashtable dict)
{
_stemDict = dict;
}
/**
 * Installs a stem dictionary that overrules the stemming algorithm.
 * Package-private: intended to be called only from the analyzer.
 */
void setStemDictionary(HashMap dict) {
_stemDict = dict;
}
}

View File

@ -20,123 +20,104 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.Hashtable;
import java.util.HashMap;
/**
*
* @author Gerhard Schwarz
*
* Loads a text file and adds every line as an entry to a Hashtable. Every line
* should contain only one word. If the file is not found or on any error, an
* empty table is returned.
* <p/>
* Loads a text file and adds every line as an entry to a Hashtable. Every line
* should contain only one word. If the file is not found or on any error, an
* empty table is returned.
*/
public class WordlistLoader
{
/**
 * Loads the word table from the named wordlist file.
 *
 * @param path Path to the wordlist
 * @param wordfile Name of the wordlist
 */
public static Hashtable getWordtable( String path, String wordfile )
{
    if ( path != null && wordfile != null )
    {
        return getWordtable( new File( path, wordfile ) );
    }
    // a missing argument means "no wordlist": empty table
    return new Hashtable();
}
public class WordlistLoader {
/**
 * Loads the word table from the named wordlist file.
 *
 * @param path Path to the wordlist
 * @param wordfile Name of the wordlist
 * @return table with one entry per word; empty if either argument is null
 */
public static HashMap getWordtable(String path, String wordfile) {
// a missing argument means "no wordlist": return an empty table
if (path == null || wordfile == null) {
return new HashMap();
}
return getWordtable(new File(path, wordfile));
}
/**
 * Loads the word table from the given wordlist path.
 *
 * @param wordfile Complete path to the wordlist
 */
public static Hashtable getWordtable( String wordfile )
{
    // a null path means "no wordlist": empty table
    return ( wordfile == null )
            ? new Hashtable()
            : getWordtable( new File( wordfile ) );
}
/**
 * Loads the word table from the given wordlist path.
 *
 * @param wordfile Complete path to the wordlist
 * @return table with one entry per word; empty if the path is null
 */
public static HashMap getWordtable(String wordfile) {
// a null path means "no wordlist": return an empty table
if (wordfile == null) {
return new HashMap();
}
return getWordtable(new File(wordfile));
}
/**
 * Reads a stem dictionary. Each line contains:
 * word \t stem
 * (i.e. two tab separated entries per line)
 *
 * @return Stem dictionary that overrules the stemming algorithm;
 *         possibly partial if a read error occurs
 */
public static Hashtable getStemDict( File wordstemfile)
{
    Hashtable result = new Hashtable();
    if ( wordstemfile == null )
    {
        return result;
    }
    LineNumberReader lnr = null;
    try
    {
        lnr = new LineNumberReader(new FileReader(wordstemfile));
        String line;
        while ((line = lnr.readLine()) != null)
        {
            String[] wordstem = line.split("\t", 2);
            // a line without a tab yields a one-element array; skip it
            // instead of failing on wordstem[1]
            if (wordstem.length == 2)
            {
                result.put(wordstem[0], wordstem[1]);
            }
        }
    }
    catch (IOException e)
    {
        // best effort: return whatever was read before the error
    }
    finally
    {
        // the original leaked the reader; always close it
        if (lnr != null)
        {
            try
            {
                lnr.close();
            }
            catch (IOException e)
            {
                // ignore failure to close
            }
        }
    }
    return result;
}
/**
 * Reads a stem dictionary. Each line contains:
 * word \t stem
 * (i.e. two tab separated entries per line)
 *
 * @return Stem dictionary that overrules the stemming algorithm;
 *         possibly partial if a read error occurs
 */
public static HashMap getStemDict(File wordstemfile) {
    HashMap result = new HashMap();
    if (wordstemfile == null) {
        return result;
    }
    LineNumberReader lnr = null;
    try {
        lnr = new LineNumberReader(new FileReader(wordstemfile));
        String line;
        while ((line = lnr.readLine()) != null) {
            String[] wordstem = line.split("\t", 2);
            // a line without a tab yields a one-element array; skip it
            // instead of failing on wordstem[1]
            if (wordstem.length == 2) {
                result.put(wordstem[0], wordstem[1]);
            }
        }
    } catch (IOException e) {
        // best effort: return whatever was read before the error
    } finally {
        // the original leaked the reader; always close it
        if (lnr != null) {
            try {
                lnr.close();
            } catch (IOException e) {
                // ignore failure to close
            }
        }
    }
    return result;
}
/**
 * Loads the word table from the given wordlist file, one word per line.
 *
 * @param wordfile File containing the wordlist
 * @return table with one entry per line; empty on null input or I/O error
 */
public static Hashtable getWordtable( File wordfile )
{
    if ( wordfile == null )
    {
        return new Hashtable();
    }
    Hashtable result = null;
    LineNumberReader lnr = null;
    try
    {
        lnr = new LineNumberReader(new FileReader(wordfile));
        String word = null;
        String[] stopwords = new String[100];
        int wordcount = 0;
        while ( ( word = lnr.readLine() ) != null )
        {
            wordcount++;
            // grow the buffer when it fills up
            if ( wordcount == stopwords.length )
            {
                String[] tmp = new String[stopwords.length + 50];
                System.arraycopy( stopwords, 0, tmp, 0, wordcount );
                stopwords = tmp;
            }
            stopwords[wordcount-1] = word;
        }
        result = makeWordTable( stopwords, wordcount );
    }
    // On error, use an empty table
    catch (IOException e)
    {
        result = new Hashtable();
    }
    finally
    {
        // the original leaked the reader; always close it
        if (lnr != null)
        {
            try
            {
                lnr.close();
            }
            catch (IOException e)
            {
                // ignore failure to close
            }
        }
    }
    return result;
}
/**
 * Loads the word table from the given wordlist file, one word per line.
 *
 * @param wordfile File containing the wordlist
 * @return table with one entry per line; empty on null input or I/O error
 */
public static HashMap getWordtable(File wordfile) {
    if (wordfile == null) {
        return new HashMap();
    }
    HashMap result = null;
    LineNumberReader lnr = null;
    try {
        lnr = new LineNumberReader(new FileReader(wordfile));
        String word = null;
        String[] stopwords = new String[100];
        int wordcount = 0;
        while ((word = lnr.readLine()) != null) {
            wordcount++;
            // grow the buffer when it fills up
            if (wordcount == stopwords.length) {
                String[] tmp = new String[stopwords.length + 50];
                System.arraycopy(stopwords, 0, tmp, 0, wordcount);
                stopwords = tmp;
            }
            stopwords[wordcount - 1] = word;
        }
        result = makeWordTable(stopwords, wordcount);
    }
    // On error, use an empty table
    catch (IOException e) {
        result = new HashMap();
    } finally {
        // the original leaked the reader; always close it
        if (lnr != null) {
            try {
                lnr.close();
            } catch (IOException e) {
                // ignore failure to close
            }
        }
    }
    return result;
}
/**
 * Builds the wordlist table, mapping each word onto itself so the
 * table can be used as a membership set.
 *
 * @param words Words that were read
 * @param length Number of valid entries in <tt>words</tt>
 */
private static Hashtable makeWordTable( String[] words, int length )
{
    Hashtable table = new Hashtable( length );
    int i = 0;
    while ( i < length )
    {
        String word = words[i];
        table.put( word, word );
        i++;
    }
    return table;
}
/**
 * Builds the wordlist table, mapping each word onto itself so the
 * table can be used as a membership set.
 *
 * @param words Words that were read
 * @param length Number of words that were read into <tt>words</tt>
 */
private static HashMap makeWordTable(String[] words, int length) {
HashMap table = new HashMap(length);
for (int i = 0; i < length; i++) {
table.put(words[i], words[i]);
}
return table;
}
}