mirror of
https://github.com/apache/lucene.git
synced 2025-02-23 02:35:02 +00:00
bringing sandbox analyzers up to date with changes to the core StopFilter and migrating away from using Hashtable
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150964 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
badeb8cd5a
commit
c4dd210b0f
@ -35,7 +35,7 @@ public final class LengthFilter extends TokenFilter {
|
||||
*/
|
||||
public LengthFilter(TokenStream in, int min, int max)
|
||||
{
|
||||
input = in;
|
||||
super(in);
|
||||
this.min = min;
|
||||
this.max =max;
|
||||
}
|
||||
|
@ -64,6 +64,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* Analyzer for the Brazilian language. Supports an external list of stopwords (words that
|
||||
@ -102,57 +103,57 @@ public final class BrazilianAnalyzer extends Analyzer {
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Hashtable stoptable = new Hashtable();
|
||||
private HashSet stoptable = new HashSet();
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Hashtable excltable = new Hashtable();
|
||||
private HashSet excltable = new HashSet();
|
||||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public BrazilianAnalyzer() {
|
||||
stoptable = StopFilter.makeStopTable( BRAZILIAN_STOP_WORDS );
|
||||
stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BrazilianAnalyzer( String[] stopwords ) {
|
||||
stoptable = StopFilter.makeStopTable( stopwords );
|
||||
stoptable = StopFilter.makeStopSet( stopwords );
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BrazilianAnalyzer( Hashtable stopwords ) {
|
||||
stoptable = stopwords;
|
||||
stoptable = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BrazilianAnalyzer( File stopwords ) {
|
||||
stoptable = WordlistLoader.getWordtable( stopwords );
|
||||
stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
*/
|
||||
public void setStemExclusionTable( String[] exclusionlist ) {
|
||||
excltable = StopFilter.makeStopTable( exclusionlist );
|
||||
excltable = StopFilter.makeStopSet( exclusionlist );
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from a Hashtable.
|
||||
*/
|
||||
public void setStemExclusionTable( Hashtable exclusionlist ) {
|
||||
excltable = exclusionlist;
|
||||
excltable = new HashSet(exclusionlist.keySet());
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
*/
|
||||
public void setStemExclusionTable( File exclusionlist ) {
|
||||
excltable = WordlistLoader.getWordtable( exclusionlist );
|
||||
excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -59,6 +59,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* Based on (copied) the GermanStemFilter
|
||||
@ -79,7 +80,7 @@ public final class BrazilianStemFilter extends TokenFilter {
|
||||
*/
|
||||
private Token token = null;
|
||||
private BrazilianStemmer stemmer = null;
|
||||
private Hashtable exclusions = null;
|
||||
private HashSet exclusions = null;
|
||||
|
||||
public BrazilianStemFilter( TokenStream in ) {
|
||||
super(in);
|
||||
@ -88,8 +89,15 @@ public final class BrazilianStemFilter extends TokenFilter {
|
||||
|
||||
/**
|
||||
* Builds a BrazilianStemFilter that uses an exclusiontable.
|
||||
*
|
||||
* @deprecated
|
||||
*/
|
||||
public BrazilianStemFilter( TokenStream in, Hashtable exclusiontable ) {
|
||||
this( in );
|
||||
this.exclusions = new HashSet(exclusiontable.keySet());
|
||||
}
|
||||
|
||||
public BrazilianStemFilter( TokenStream in, HashSet exclusiontable ) {
|
||||
this( in );
|
||||
this.exclusions = exclusiontable;
|
||||
}
|
||||
|
@ -63,6 +63,7 @@ import org.apache.lucene.analysis.TokenStream;
|
||||
import java.io.Reader;
|
||||
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashSet;
|
||||
|
||||
|
||||
/**
|
||||
@ -91,7 +92,7 @@ public class CJKAnalyzer extends Analyzer {
|
||||
//~ Instance fields --------------------------------------------------------
|
||||
|
||||
/** stop word list */
|
||||
private Hashtable stopTable;
|
||||
private HashSet stopTable;
|
||||
|
||||
//~ Constructors -----------------------------------------------------------
|
||||
|
||||
@ -99,7 +100,7 @@ public class CJKAnalyzer extends Analyzer {
|
||||
* Builds an analyzer which removes words in STOP_WORDS.
|
||||
*/
|
||||
public CJKAnalyzer() {
|
||||
stopTable = StopFilter.makeStopTable(stopWords);
|
||||
stopTable = StopFilter.makeStopSet(stopWords);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -108,7 +109,7 @@ public class CJKAnalyzer extends Analyzer {
|
||||
* @param stopWords stop word array
|
||||
*/
|
||||
public CJKAnalyzer(String[] stopWords) {
|
||||
stopTable = StopFilter.makeStopTable(stopWords);
|
||||
stopTable = StopFilter.makeStopSet(stopWords);
|
||||
}
|
||||
|
||||
//~ Methods ----------------------------------------------------------------
|
||||
|
@ -64,6 +64,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* Analyzer for Czech language. Supports an external list of stopwords (words that
|
||||
@ -102,26 +103,32 @@ public final class CzechAnalyzer extends Analyzer {
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Hashtable stoptable = new Hashtable();
|
||||
private HashSet stoptable;
|
||||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public CzechAnalyzer() {
|
||||
stoptable = StopFilter.makeStopTable( STOP_WORDS );
|
||||
stoptable = StopFilter.makeStopSet( STOP_WORDS );
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public CzechAnalyzer( String[] stopwords ) {
|
||||
stoptable = StopFilter.makeStopTable( stopwords );
|
||||
stoptable = StopFilter.makeStopSet( stopwords );
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @deprecated
|
||||
*/
|
||||
public CzechAnalyzer( Hashtable stopwords ) {
|
||||
stoptable = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
public CzechAnalyzer( HashSet stopwords ) {
|
||||
stoptable = stopwords;
|
||||
}
|
||||
|
||||
@ -129,7 +136,7 @@ public final class CzechAnalyzer extends Analyzer {
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public CzechAnalyzer( File stopwords ) {
|
||||
stoptable = WordlistLoader.getWordtable( stopwords );
|
||||
stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
@ -139,12 +146,12 @@ public final class CzechAnalyzer extends Analyzer {
|
||||
*/
|
||||
public void loadStopWords( InputStream wordfile, String encoding ) {
|
||||
if ( wordfile == null ) {
|
||||
stoptable = new Hashtable();
|
||||
stoptable = new HashSet();
|
||||
return;
|
||||
}
|
||||
try {
|
||||
// clear any previous table (if present)
|
||||
stoptable = new Hashtable();
|
||||
stoptable = new HashSet();
|
||||
|
||||
InputStreamReader isr;
|
||||
if (encoding == null)
|
||||
@ -156,7 +163,7 @@ public final class CzechAnalyzer extends Analyzer {
|
||||
LineNumberReader lnr = new LineNumberReader(isr);
|
||||
String word;
|
||||
while ( ( word = lnr.readLine() ) != null ) {
|
||||
stoptable.put(word, word);
|
||||
stoptable.add(word);
|
||||
}
|
||||
|
||||
} catch ( IOException e ) {
|
||||
|
@ -63,6 +63,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.lucene.analysis.de.WordlistLoader;
|
||||
|
||||
/**
|
||||
@ -108,57 +110,59 @@ public final class FrenchAnalyzer extends Analyzer {
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Hashtable stoptable = new Hashtable();
|
||||
private HashSet stoptable = new HashSet();
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Hashtable excltable = new Hashtable();
|
||||
private HashSet excltable = new HashSet();
|
||||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public FrenchAnalyzer() {
|
||||
stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS );
|
||||
stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS );
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public FrenchAnalyzer( String[] stopwords ) {
|
||||
stoptable = StopFilter.makeStopTable( stopwords );
|
||||
stoptable = StopFilter.makeStopSet( stopwords );
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @deprecated
|
||||
*/
|
||||
public FrenchAnalyzer( Hashtable stopwords ) {
|
||||
stoptable = stopwords;
|
||||
stoptable = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public FrenchAnalyzer( File stopwords ) {
|
||||
stoptable = WordlistLoader.getWordtable( stopwords );
|
||||
stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
*/
|
||||
public void setStemExclusionTable( String[] exclusionlist ) {
|
||||
excltable = StopFilter.makeStopTable( exclusionlist );
|
||||
excltable = StopFilter.makeStopSet( exclusionlist );
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from a Hashtable.
|
||||
*/
|
||||
public void setStemExclusionTable( Hashtable exclusionlist ) {
|
||||
excltable = exclusionlist;
|
||||
excltable = new HashSet(exclusionlist.keySet());
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
*/
|
||||
public void setStemExclusionTable( File exclusionlist ) {
|
||||
excltable = WordlistLoader.getWordtable( exclusionlist );
|
||||
excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -59,6 +59,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* A filter that stems French words. It supports a table of words that should
|
||||
@ -74,7 +75,7 @@ public final class FrenchStemFilter extends TokenFilter {
|
||||
*/
|
||||
private Token token = null;
|
||||
private FrenchStemmer stemmer = null;
|
||||
private Hashtable exclusions = null;
|
||||
private HashSet exclusions = null;
|
||||
|
||||
public FrenchStemFilter( TokenStream in ) {
|
||||
super(in);
|
||||
@ -83,8 +84,15 @@ public final class FrenchStemFilter extends TokenFilter {
|
||||
|
||||
/**
|
||||
* Builds a FrenchStemFilter that uses an exclusiontable.
|
||||
*
|
||||
* @deprecated
|
||||
*/
|
||||
public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) {
|
||||
this( in );
|
||||
exclusions = new HashSet(exclusiontable.keySet());
|
||||
}
|
||||
|
||||
public FrenchStemFilter( TokenStream in, HashSet exclusiontable ) {
|
||||
this( in );
|
||||
exclusions = exclusiontable;
|
||||
}
|
||||
@ -122,7 +130,7 @@ public final class FrenchStemFilter extends TokenFilter {
|
||||
* Set an alternative exclusion list for this filter.
|
||||
*/
|
||||
public void setExclusionTable( Hashtable exclusiontable ) {
|
||||
exclusions = exclusiontable;
|
||||
exclusions = new HashSet(exclusiontable.keySet());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -21,148 +21,137 @@ import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.*;
|
||||
import java.io.Reader;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Edwin de Jonge
|
||||
*
|
||||
* Analyzer for Dutch language. Supports an external list of stopwords (words that
|
||||
* will not be indexed at all), an external list of exclusions (word that will
|
||||
* not be stemmed, but indexed) and an external list of word-stem pairs that overrule
|
||||
* the algorithm (dictionary stemming).
|
||||
* A default set of stopwords is used unless an alternative list is specified, the
|
||||
* exclusion list is empty by default.
|
||||
* As start for the Analyzer the German Analyzer was used. The stemming algorithm
|
||||
* implemented can be found at @link
|
||||
* <p/>
|
||||
* Analyzer for Dutch language. Supports an external list of stopwords (words that
|
||||
* will not be indexed at all), an external list of exclusions (word that will
|
||||
* not be stemmed, but indexed) and an external list of word-stem pairs that overrule
|
||||
* the algorithm (dictionary stemming).
|
||||
* A default set of stopwords is used unless an alternative list is specified, the
|
||||
* exclusion list is empty by default.
|
||||
* As start for the Analyzer the German Analyzer was used. The stemming algorithm
|
||||
* implemented can be found at @link
|
||||
*/
|
||||
public class DutchAnalyzer extends Analyzer
|
||||
{
|
||||
/**
|
||||
* List of typical Dutch stopwords.
|
||||
*/
|
||||
private String[] DUTCH_STOP_WORDS =
|
||||
{
|
||||
"de","en","van","ik","te","dat","die","in","een",
|
||||
"hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
|
||||
"er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
|
||||
"door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
|
||||
"haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
|
||||
"me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
|
||||
"veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
|
||||
"alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
|
||||
"wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
|
||||
"uw","iemand","geweest","andere"
|
||||
};
|
||||
public class DutchAnalyzer extends Analyzer {
|
||||
/**
|
||||
* List of typical Dutch stopwords.
|
||||
*/
|
||||
private String[] DUTCH_STOP_WORDS =
|
||||
{
|
||||
"de", "en", "van", "ik", "te", "dat", "die", "in", "een",
|
||||
"hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
|
||||
"er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
|
||||
"door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
|
||||
"haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
|
||||
"me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
|
||||
"veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
|
||||
"alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
|
||||
"wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
|
||||
"uw", "iemand", "geweest", "andere"
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Hashtable stoptable = new Hashtable();
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private HashSet stoptable = new HashSet();
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Hashtable excltable = new Hashtable();
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private HashSet excltable = new HashSet();
|
||||
|
||||
private Hashtable _stemdict = new Hashtable();
|
||||
private HashMap _stemdict = new HashMap();
|
||||
|
||||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public DutchAnalyzer()
|
||||
{
|
||||
stoptable = StopFilter.makeStopTable( DUTCH_STOP_WORDS );
|
||||
_stemdict.put("fiets","fiets"); //otherwise fiet
|
||||
_stemdict.put("bromfiets","bromfiets"); //otherwise bromfiet
|
||||
_stemdict.put("ei","eier");
|
||||
_stemdict.put("kind","kinder");
|
||||
}
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public DutchAnalyzer() {
|
||||
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
|
||||
_stemdict.put("fiets", "fiets"); //otherwise fiet
|
||||
_stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||
_stemdict.put("ei", "eier");
|
||||
_stemdict.put("kind", "kinder");
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer( String[] stopwords )
|
||||
{
|
||||
stoptable = StopFilter.makeStopTable( stopwords );
|
||||
}
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer(String[] stopwords) {
|
||||
stoptable = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer( Hashtable stopwords )
|
||||
{
|
||||
stoptable = stopwords;
|
||||
}
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer(HashSet stopwords) {
|
||||
stoptable = stopwords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer( File stopwords )
|
||||
{
|
||||
stoptable = WordlistLoader.getWordtable( stopwords );
|
||||
}
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer(File stopwords) {
|
||||
stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
*
|
||||
* @param exclusionlist
|
||||
*/
|
||||
public void setStemExclusionTable( String[] exclusionlist )
|
||||
{
|
||||
excltable = StopFilter.makeStopTable( exclusionlist );
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
*
|
||||
* @param exclusionlist
|
||||
*/
|
||||
public void setStemExclusionTable(String[] exclusionlist) {
|
||||
excltable = StopFilter.makeStopSet(exclusionlist);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from a Hashtable.
|
||||
*/
|
||||
public void setStemExclusionTable( Hashtable exclusionlist )
|
||||
{
|
||||
excltable = exclusionlist;
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from a HashSet.
|
||||
*/
|
||||
public void setStemExclusionTable(HashSet exclusionlist) {
|
||||
excltable = exclusionlist;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
*/
|
||||
public void setStemExclusionTable(File exclusionlist)
|
||||
{
|
||||
excltable = WordlistLoader.getWordtable(exclusionlist);
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) {
|
||||
excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a stemdictionary file , that overrules the stemming algorithm
|
||||
* This is a textfile that contains per line
|
||||
* word\tstem
|
||||
* i.e. tab-separated
|
||||
*/
|
||||
public void setStemDictionary(File stemdict)
|
||||
{
|
||||
_stemdict = WordlistLoader.getStemDict(stemdict);
|
||||
}
|
||||
/**
|
||||
* Reads a stemdictionary file , that overrules the stemming algorithm
|
||||
* This is a textfile that contains per line
|
||||
* word\tstem
|
||||
* i.e. tab-separated
|
||||
*/
|
||||
public void setStemDictionary(File stemdict) {
|
||||
_stemdict = WordlistLoader.getStemDict(stemdict);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided TextReader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with StandardFilter, StopFilter, DutchStemFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||
{
|
||||
TokenStream result = new StandardTokenizer( reader );
|
||||
result = new StandardFilter( result );
|
||||
result = new StopFilter( result, stoptable );
|
||||
result = new DutchStemFilter( result, excltable, _stemdict);
|
||||
return result;
|
||||
}
|
||||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided TextReader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with StandardFilter, StopFilter, DutchStemFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new StopFilter(result, stoptable);
|
||||
result = new DutchStemFilter(result, excltable, _stemdict);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@ -19,105 +19,91 @@ package org.apache.lucene.analysis.nl;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Edwin de Jonge
|
||||
*
|
||||
* A filter that stems Dutch words. It supports a table of words that should
|
||||
* not be stemmed at all. The stemmer used can be changed at runtime after the
|
||||
* filter object is created (as long as it is a DutchStemmer).
|
||||
* <p/>
|
||||
* A filter that stems Dutch words. It supports a table of words that should
|
||||
* not be stemmed at all. The stemmer used can be changed at runtime after the
|
||||
* filter object is created (as long as it is a DutchStemmer).
|
||||
*/
|
||||
public final class DutchStemFilter extends TokenFilter
|
||||
{
|
||||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private DutchStemmer stemmer = null;
|
||||
private Hashtable exclusions = null;
|
||||
public final class DutchStemFilter extends TokenFilter {
|
||||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private DutchStemmer stemmer = null;
|
||||
private HashSet exclusions = null;
|
||||
|
||||
public DutchStemFilter( TokenStream _in )
|
||||
{
|
||||
super(_in);
|
||||
stemmer = new DutchStemmer();
|
||||
}
|
||||
public DutchStemFilter(TokenStream _in) {
|
||||
super(_in);
|
||||
stemmer = new DutchStemmer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a DutchStemFilter that uses an exclusiontable.
|
||||
*/
|
||||
public DutchStemFilter( TokenStream _in, Hashtable exclusiontable )
|
||||
{
|
||||
this(_in);
|
||||
exclusions = exclusiontable;
|
||||
}
|
||||
/**
|
||||
* Builds a DutchStemFilter that uses an exclusiontable.
|
||||
*/
|
||||
public DutchStemFilter(TokenStream _in, HashSet exclusiontable) {
|
||||
this(_in);
|
||||
exclusions = exclusiontable;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
|
||||
*/
|
||||
public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary)
|
||||
{
|
||||
this(_in, exclusiontable);
|
||||
stemmer.setStemDictionary(stemdictionary);
|
||||
}
|
||||
/**
|
||||
* @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
|
||||
*/
|
||||
public DutchStemFilter(TokenStream _in, HashSet exclusiontable, HashMap stemdictionary) {
|
||||
this(_in, exclusiontable);
|
||||
stemmer.setStemDictionary(stemdictionary);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public Token next() throws IOException
|
||||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
if ((token = input.next()) == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
{
|
||||
if ( ( token = input.next() ) == null )
|
||||
{
|
||||
return null;
|
||||
}
|
||||
// Check the exclusiontable
|
||||
else if (exclusions != null && exclusions.contains(token.termText())) {
|
||||
return token;
|
||||
} else {
|
||||
String s = stemmer.stem(token.termText());
|
||||
// If not stemmed, don't waste the time creating a new token
|
||||
if (!s.equals(token.termText())) {
|
||||
return new Token(s, token.startOffset(),
|
||||
token.endOffset(), token.type());
|
||||
}
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
// Check the exclusiontable
|
||||
else if ( exclusions != null && exclusions.contains( token.termText() ) )
|
||||
{
|
||||
return token;
|
||||
}
|
||||
else
|
||||
{
|
||||
String s = stemmer.stem( token.termText() );
|
||||
// If not stemmed, don't waste the time creating a new token
|
||||
if ( !s.equals( token.termText() ) )
|
||||
{
|
||||
return new Token( s, token.startOffset(),
|
||||
token.endOffset(), token.type() );
|
||||
}
|
||||
return token;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Set an alternative/custom DutchStemmer for this filter.
|
||||
*/
|
||||
public void setStemmer(DutchStemmer stemmer) {
|
||||
if (stemmer != null) {
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set an alternative/custom DutchStemmer for this filter.
|
||||
*/
|
||||
public void setStemmer( DutchStemmer stemmer )
|
||||
{
|
||||
if ( stemmer != null )
|
||||
{
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Set an alternative exclusion list for this filter.
|
||||
*/
|
||||
public void setExclusionTable(HashSet exclusiontable) {
|
||||
exclusions = exclusiontable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set an alternative exclusion list for this filter.
|
||||
*/
|
||||
public void setExclusionTable( Hashtable exclusiontable )
|
||||
{
|
||||
exclusions = exclusiontable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set dictionary for stemming, this dictionary overrules the algorithm,
|
||||
* so you can correct for a particular unwanted word-stem pair.
|
||||
*/
|
||||
public void setStemDictionary(Hashtable dict)
|
||||
{
|
||||
if (stemmer != null)
|
||||
stemmer.setStemDictionary(dict);
|
||||
}
|
||||
/**
|
||||
* Set dictionary for stemming, this dictionary overrules the algorithm,
|
||||
* so you can correct for a particular unwanted word-stem pair.
|
||||
*/
|
||||
public void setStemDictionary(HashMap dict) {
|
||||
if (stemmer != null)
|
||||
stemmer.setStemDictionary(dict);
|
||||
}
|
||||
}
|
@ -16,9 +16,8 @@ package org.apache.lucene.analysis.nl;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Hashtable;
|
||||
import java.util.ArrayList;
|
||||
import java.io.*;
|
||||
import java.util.HashMap;
|
||||
|
||||
/*
|
||||
* @author Edwin de Jonge (ejne@cbs.nl)
|
||||
*
|
||||
@ -26,427 +25,382 @@ import java.io.*;
|
||||
* the <see cref="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
|
||||
* algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?):
|
||||
*/
|
||||
public class DutchStemmer
|
||||
{
|
||||
/**
|
||||
* Buffer for the terms while stemming them.
|
||||
*/
|
||||
private StringBuffer sb = new StringBuffer();
|
||||
private boolean _removedE;
|
||||
private Hashtable _stemDict;
|
||||
|
||||
private int _R1;
|
||||
private int _R2;
|
||||
public class DutchStemmer {
|
||||
/**
|
||||
* Buffer for the terms while stemming them.
|
||||
*/
|
||||
private StringBuffer sb = new StringBuffer();
|
||||
private boolean _removedE;
|
||||
private HashMap _stemDict;
|
||||
|
||||
//TODO convert to internal
|
||||
/*
|
||||
* Stems the given term to a unique <tt>discriminator</tt>.
|
||||
*
|
||||
* @param term The term that should be stemmed.
|
||||
* @return Discriminator for <tt>term</tt>
|
||||
*/
|
||||
public String stem( String term )
|
||||
{
|
||||
term = term.toLowerCase();
|
||||
if ( !isStemmable( term ) )
|
||||
return term;
|
||||
if (_stemDict != null && _stemDict.contains(term))
|
||||
if (_stemDict.get(term) instanceof String)
|
||||
return (String)_stemDict.get(term);
|
||||
else return null;
|
||||
private int _R1;
|
||||
private int _R2;
|
||||
|
||||
// Reset the StringBuffer.
|
||||
sb.delete(0, sb.length());
|
||||
sb.insert(0, term);
|
||||
// Stemming starts here...
|
||||
substitute(sb);
|
||||
storeYandI(sb);
|
||||
_R1 = getRIndex(sb, 0);
|
||||
_R1 = Math.max(3,_R1);
|
||||
step1(sb);
|
||||
step2(sb);
|
||||
_R2 = getRIndex(sb, _R1);
|
||||
step3a(sb);
|
||||
step3b(sb);
|
||||
step4(sb);
|
||||
reStoreYandI(sb);
|
||||
return sb.toString();
|
||||
}
|
||||
//TODO convert to internal
|
||||
/*
|
||||
* Stems the given term to a unique <tt>discriminator</tt>.
|
||||
*
|
||||
* @param term The term that should be stemmed.
|
||||
* @return Discriminator for <tt>term</tt>
|
||||
*/
|
||||
public String stem(String term) {
|
||||
term = term.toLowerCase();
|
||||
if (!isStemmable(term))
|
||||
return term;
|
||||
if (_stemDict != null && _stemDict.containsKey(term))
|
||||
if (_stemDict.get(term) instanceof String)
|
||||
return (String) _stemDict.get(term);
|
||||
else
|
||||
return null;
|
||||
|
||||
private boolean enEnding(StringBuffer sb)
|
||||
{
|
||||
String[] enend = new String[]{"ene","en"};
|
||||
for (int i = 0; i < enend.length; i++)
|
||||
{
|
||||
String end = enend[i];
|
||||
String s = sb.toString();
|
||||
int index = s.length() - end.length();
|
||||
if ( s.endsWith(end) &&
|
||||
index >= _R1 &&
|
||||
isValidEnEnding(sb,index-1)
|
||||
)
|
||||
{
|
||||
sb.delete(index, index + end.length());
|
||||
unDouble(sb,index);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Reset the StringBuffer.
|
||||
sb.delete(0, sb.length());
|
||||
sb.insert(0, term);
|
||||
// Stemming starts here...
|
||||
substitute(sb);
|
||||
storeYandI(sb);
|
||||
_R1 = getRIndex(sb, 0);
|
||||
_R1 = Math.max(3, _R1);
|
||||
step1(sb);
|
||||
step2(sb);
|
||||
_R2 = getRIndex(sb, _R1);
|
||||
step3a(sb);
|
||||
step3b(sb);
|
||||
step4(sb);
|
||||
reStoreYandI(sb);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private boolean enEnding(StringBuffer sb) {
|
||||
String[] enend = new String[]{"ene", "en"};
|
||||
for (int i = 0; i < enend.length; i++) {
|
||||
String end = enend[i];
|
||||
String s = sb.toString();
|
||||
int index = s.length() - end.length();
|
||||
if (s.endsWith(end) &&
|
||||
index >= _R1 &&
|
||||
isValidEnEnding(sb, index - 1)
|
||||
) {
|
||||
sb.delete(index, index + end.length());
|
||||
unDouble(sb, index);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private void step1(StringBuffer sb)
|
||||
{
|
||||
if (_R1 >= sb.length())
|
||||
return;
|
||||
private void step1(StringBuffer sb) {
|
||||
if (_R1 >= sb.length())
|
||||
return;
|
||||
|
||||
String s = sb.toString();
|
||||
int lengthR1 = sb.length() - _R1;
|
||||
int index;
|
||||
String s = sb.toString();
|
||||
int lengthR1 = sb.length() - _R1;
|
||||
int index;
|
||||
|
||||
if (s.endsWith("heden"))
|
||||
{
|
||||
sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("heden")) {
|
||||
sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
|
||||
return;
|
||||
}
|
||||
|
||||
if (enEnding(sb))
|
||||
return;
|
||||
if (enEnding(sb))
|
||||
return;
|
||||
|
||||
if (s.endsWith("se") &&
|
||||
(index = s.length() - 2) >= _R1 &&
|
||||
isValidSEnding(sb, index -1)
|
||||
)
|
||||
{
|
||||
sb.delete(index, index + 2);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("s") &&
|
||||
(index = s.length() - 1) >= _R1 &&
|
||||
isValidSEnding(sb, index - 1))
|
||||
{
|
||||
sb.delete(index, index + 1);
|
||||
}
|
||||
}
|
||||
if (s.endsWith("se") &&
|
||||
(index = s.length() - 2) >= _R1 &&
|
||||
isValidSEnding(sb, index - 1)
|
||||
) {
|
||||
sb.delete(index, index + 2);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("s") &&
|
||||
(index = s.length() - 1) >= _R1 &&
|
||||
isValidSEnding(sb, index - 1)) {
|
||||
sb.delete(index, index + 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete suffix e if in R1 and
|
||||
* preceded by a non-vowel, and then undouble the ending
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step2(StringBuffer sb)
|
||||
{
|
||||
_removedE = false;
|
||||
if (_R1 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = s.length() - 1;
|
||||
if ( index >= _R1 &&
|
||||
s.endsWith("e") &&
|
||||
!isVowel(sb.charAt(index-1)))
|
||||
{
|
||||
sb.delete(index, index + 1);
|
||||
unDouble(sb);
|
||||
_removedE = true;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Delete suffix e if in R1 and
|
||||
* preceded by a non-vowel, and then undouble the ending
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step2(StringBuffer sb) {
|
||||
_removedE = false;
|
||||
if (_R1 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = s.length() - 1;
|
||||
if (index >= _R1 &&
|
||||
s.endsWith("e") &&
|
||||
!isVowel(sb.charAt(index - 1))) {
|
||||
sb.delete(index, index + 1);
|
||||
unDouble(sb);
|
||||
_removedE = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete "heid"
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step3a(StringBuffer sb)
|
||||
{
|
||||
if (_R2 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = s.length() - 4;
|
||||
if (s.endsWith("heid")&& index >= _R2 && sb.charAt(index - 1) != 'c')
|
||||
{
|
||||
sb.delete(index, index + 4); //remove heid
|
||||
enEnding(sb);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Delete "heid"
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step3a(StringBuffer sb) {
|
||||
if (_R2 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = s.length() - 4;
|
||||
if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
|
||||
sb.delete(index, index + 4); //remove heid
|
||||
enEnding(sb);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>A d-suffix, or derivational suffix, enables a new word,
|
||||
* often with a different grammatical category, or with a different
|
||||
* sense, to be built from another word. Whether a d-suffix can be
|
||||
* attached is discovered not from the rules of grammar, but by
|
||||
* referring to a dictionary. So in English, ness can be added to
|
||||
* certain adjectives to form corresponding nouns (littleness,
|
||||
* kindness, foolishness ...) but not to all adjectives
|
||||
* (not for example, to big, cruel, wise ...) d-suffixes can be
|
||||
* used to change meaning, often in rather exotic ways.</p>
|
||||
* Remove "ing", "end", "ig", "lijk", "baar" and "bar"
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step3b(StringBuffer sb)
|
||||
{
|
||||
if (_R2 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index;
|
||||
/**
|
||||
* <p>A d-suffix, or derivational suffix, enables a new word,
|
||||
* often with a different grammatical category, or with a different
|
||||
* sense, to be built from another word. Whether a d-suffix can be
|
||||
* attached is discovered not from the rules of grammar, but by
|
||||
* referring to a dictionary. So in English, ness can be added to
|
||||
* certain adjectives to form corresponding nouns (littleness,
|
||||
* kindness, foolishness ...) but not to all adjectives
|
||||
* (not for example, to big, cruel, wise ...) d-suffixes can be
|
||||
* used to change meaning, often in rather exotic ways.</p>
|
||||
* Remove "ing", "end", "ig", "lijk", "baar" and "bar"
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step3b(StringBuffer sb) {
|
||||
if (_R2 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = 0;
|
||||
|
||||
if ((s.endsWith("end") || s.endsWith("ing")) &&
|
||||
(index = s.length() - 3) >= _R2)
|
||||
{
|
||||
sb.delete(index, index + 3);
|
||||
if (sb.charAt(index - 2) == 'i' &&
|
||||
sb.charAt(index - 1) == 'g')
|
||||
{
|
||||
if (sb.charAt(index - 3) != 'e' & index-2 >= _R2)
|
||||
{
|
||||
index -= 2;
|
||||
sb.delete(index, index + 2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
unDouble(sb,index);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if ( s.endsWith("ig") &&
|
||||
(index = s.length() - 2) >= _R2
|
||||
)
|
||||
{
|
||||
if (sb.charAt(index - 1) != 'e')
|
||||
sb.delete(index, index + 2);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("lijk") &&
|
||||
(index = s.length() - 4) >= _R2
|
||||
)
|
||||
{
|
||||
sb.delete(index, index + 4);
|
||||
step2(sb);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("baar") &&
|
||||
(index = s.length() - 4) >= _R2
|
||||
)
|
||||
{
|
||||
sb.delete(index, index + 4);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("bar") &&
|
||||
(index = s.length() - 3) >= _R2
|
||||
)
|
||||
{
|
||||
if (_removedE)
|
||||
sb.delete(index, index + 3);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if ((s.endsWith("end") || s.endsWith("ing")) &&
|
||||
(index = s.length() - 3) >= _R2) {
|
||||
sb.delete(index, index + 3);
|
||||
if (sb.charAt(index - 2) == 'i' &&
|
||||
sb.charAt(index - 1) == 'g') {
|
||||
if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
|
||||
index -= 2;
|
||||
sb.delete(index, index + 2);
|
||||
}
|
||||
} else {
|
||||
unDouble(sb, index);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("ig") &&
|
||||
(index = s.length() - 2) >= _R2
|
||||
) {
|
||||
if (sb.charAt(index - 1) != 'e')
|
||||
sb.delete(index, index + 2);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("lijk") &&
|
||||
(index = s.length() - 4) >= _R2
|
||||
) {
|
||||
sb.delete(index, index + 4);
|
||||
step2(sb);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("baar") &&
|
||||
(index = s.length() - 4) >= _R2
|
||||
) {
|
||||
sb.delete(index, index + 4);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("bar") &&
|
||||
(index = s.length() - 3) >= _R2
|
||||
) {
|
||||
if (_removedE)
|
||||
sb.delete(index, index + 3);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* undouble vowel
|
||||
* If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step4(StringBuffer sb)
|
||||
{
|
||||
if (sb.length() < 4)
|
||||
return;
|
||||
String end = sb.substring(sb.length() - 4, sb.length());
|
||||
char c = end.charAt(0);
|
||||
char v1 = end.charAt(1);
|
||||
char v2 = end.charAt(2);
|
||||
char d = end.charAt(3);
|
||||
if (v1 == v2 &&
|
||||
d != 'I' &&
|
||||
v1 != 'i' &&
|
||||
isVowel(v1) &&
|
||||
!isVowel(d) &&
|
||||
!isVowel(c))
|
||||
{
|
||||
sb.delete(sb.length() - 2, sb.length() - 1);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* undouble vowel
|
||||
* If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step4(StringBuffer sb) {
|
||||
if (sb.length() < 4)
|
||||
return;
|
||||
String end = sb.substring(sb.length() - 4, sb.length());
|
||||
char c = end.charAt(0);
|
||||
char v1 = end.charAt(1);
|
||||
char v2 = end.charAt(2);
|
||||
char d = end.charAt(3);
|
||||
if (v1 == v2 &&
|
||||
d != 'I' &&
|
||||
v1 != 'i' &&
|
||||
isVowel(v1) &&
|
||||
!isVowel(d) &&
|
||||
!isVowel(c)) {
|
||||
sb.delete(sb.length() - 2, sb.length() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a term could be stemmed.
|
||||
*
|
||||
* @return true if, and only if, the given term consists in letters.
|
||||
*/
|
||||
private boolean isStemmable( String term )
|
||||
{
|
||||
for ( int c = 0; c < term.length(); c++ )
|
||||
{
|
||||
if ( !Character.isLetter(term.charAt(c))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/**
|
||||
* Checks if a term could be stemmed.
|
||||
*
|
||||
* @return true if, and only if, the given term consists in letters.
|
||||
*/
|
||||
private boolean isStemmable(String term) {
|
||||
for (int c = 0; c < term.length(); c++) {
|
||||
if (!Character.isLetter(term.charAt(c))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
|
||||
*/
|
||||
private void substitute( StringBuffer buffer )
|
||||
{
|
||||
for ( int i = 0; i < buffer.length(); i++ )
|
||||
{
|
||||
switch (buffer.charAt(i))
|
||||
{
|
||||
case 'ä':
|
||||
case 'á':
|
||||
{
|
||||
buffer.setCharAt(i, 'a');
|
||||
break;
|
||||
}
|
||||
case 'ë':
|
||||
case 'é':
|
||||
{
|
||||
buffer.setCharAt(i, 'e');
|
||||
break;
|
||||
}
|
||||
case 'ü':
|
||||
case 'ú':
|
||||
{
|
||||
buffer.setCharAt(i, 'u');
|
||||
break;
|
||||
}
|
||||
case 'ï':
|
||||
case 'i':
|
||||
{
|
||||
buffer.setCharAt(i, 'i');
|
||||
break;
|
||||
}
|
||||
case 'ö':
|
||||
case 'ó':
|
||||
{
|
||||
buffer.setCharAt(i, 'o');
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
|
||||
*/
|
||||
private void substitute(StringBuffer buffer) {
|
||||
for (int i = 0; i < buffer.length(); i++) {
|
||||
switch (buffer.charAt(i)) {
|
||||
case 'ä':
|
||||
case 'á':
|
||||
{
|
||||
buffer.setCharAt(i, 'a');
|
||||
break;
|
||||
}
|
||||
case 'ë':
|
||||
case 'é':
|
||||
{
|
||||
buffer.setCharAt(i, 'e');
|
||||
break;
|
||||
}
|
||||
case 'ü':
|
||||
case 'ú':
|
||||
{
|
||||
buffer.setCharAt(i, 'u');
|
||||
break;
|
||||
}
|
||||
case 'ï':
|
||||
case 'i':
|
||||
{
|
||||
buffer.setCharAt(i, 'i');
|
||||
break;
|
||||
}
|
||||
case 'ö':
|
||||
case 'ó':
|
||||
{
|
||||
buffer.setCharAt(i, 'o');
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isValidSEnding(StringBuffer sb)
|
||||
{
|
||||
return isValidSEnding(sb,sb.length() - 1);
|
||||
}
|
||||
private boolean isValidSEnding(StringBuffer sb) {
|
||||
return isValidSEnding(sb, sb.length() - 1);
|
||||
}
|
||||
|
||||
private boolean isValidSEnding(StringBuffer sb, int index)
|
||||
{
|
||||
char c = sb.charAt(index);
|
||||
if (isVowel(c) || c == 'j')
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
private boolean isValidSEnding(StringBuffer sb, int index) {
|
||||
char c = sb.charAt(index);
|
||||
if (isVowel(c) || c == 'j')
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean isValidEnEnding(StringBuffer sb)
|
||||
{
|
||||
return isValidEnEnding(sb,sb.length() - 1);
|
||||
}
|
||||
private boolean isValidEnEnding(StringBuffer sb) {
|
||||
return isValidEnEnding(sb, sb.length() - 1);
|
||||
}
|
||||
|
||||
private boolean isValidEnEnding(StringBuffer sb, int index)
|
||||
{
|
||||
char c = sb.charAt(index);
|
||||
if (isVowel(c))
|
||||
return false;
|
||||
if (c < 3)
|
||||
return false;
|
||||
// ends with "gem"?
|
||||
if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index-1) == 'e')
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
private boolean isValidEnEnding(StringBuffer sb, int index) {
|
||||
char c = sb.charAt(index);
|
||||
if (isVowel(c))
|
||||
return false;
|
||||
if (c < 3)
|
||||
return false;
|
||||
// ends with "gem"?
|
||||
if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private void unDouble(StringBuffer sb)
|
||||
{
|
||||
unDouble(sb, sb.length());
|
||||
}
|
||||
private void unDouble(StringBuffer sb) {
|
||||
unDouble(sb, sb.length());
|
||||
}
|
||||
|
||||
private void unDouble(StringBuffer sb, int endIndex)
|
||||
{
|
||||
String s = sb.substring(0, endIndex);
|
||||
if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn")|| s.endsWith("mm")|| s.endsWith("ff"))
|
||||
{
|
||||
sb.delete(endIndex-1, endIndex);
|
||||
}
|
||||
}
|
||||
private void unDouble(StringBuffer sb, int endIndex) {
|
||||
String s = sb.substring(0, endIndex);
|
||||
if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
|
||||
sb.delete(endIndex - 1, endIndex);
|
||||
}
|
||||
}
|
||||
|
||||
private int getRIndex(StringBuffer sb, int start)
|
||||
{
|
||||
if (start == 0)
|
||||
start = 1;
|
||||
int i = start;
|
||||
for (; i < sb.length(); i++)
|
||||
{
|
||||
//first non-vowel preceded by a vowel
|
||||
if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i-1)))
|
||||
{
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
return i + 1;
|
||||
}
|
||||
private int getRIndex(StringBuffer sb, int start) {
|
||||
if (start == 0)
|
||||
start = 1;
|
||||
int i = start;
|
||||
for (; i < sb.length(); i++) {
|
||||
//first non-vowel preceded by a vowel
|
||||
if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
private void storeYandI(StringBuffer sb)
|
||||
{
|
||||
if (sb.charAt(0) == 'y')
|
||||
sb.setCharAt(0, 'Y');
|
||||
private void storeYandI(StringBuffer sb) {
|
||||
if (sb.charAt(0) == 'y')
|
||||
sb.setCharAt(0, 'Y');
|
||||
|
||||
char c;
|
||||
int last = sb.length() - 1;
|
||||
char c;
|
||||
int last = sb.length() - 1;
|
||||
|
||||
for (int i = 1; i < last; i++)
|
||||
{
|
||||
switch (sb.charAt(i))
|
||||
{
|
||||
case 'i':
|
||||
{
|
||||
if (isVowel(sb.charAt(i-1)) &&
|
||||
isVowel(sb.charAt(i+1))
|
||||
)
|
||||
sb.setCharAt(i, 'I');
|
||||
break;
|
||||
}
|
||||
case 'y':
|
||||
{
|
||||
if (isVowel(sb.charAt(i-1)))
|
||||
sb.setCharAt(i, 'Y');
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (last > 0 && sb.charAt(last)=='y' && isVowel(sb.charAt(last-1)))
|
||||
sb.setCharAt(last, 'Y');
|
||||
}
|
||||
for (int i = 1; i < last; i++) {
|
||||
switch (sb.charAt(i)) {
|
||||
case 'i':
|
||||
{
|
||||
if (isVowel(sb.charAt(i - 1)) &&
|
||||
isVowel(sb.charAt(i + 1))
|
||||
)
|
||||
sb.setCharAt(i, 'I');
|
||||
break;
|
||||
}
|
||||
case 'y':
|
||||
{
|
||||
if (isVowel(sb.charAt(i - 1)))
|
||||
sb.setCharAt(i, 'Y');
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
|
||||
sb.setCharAt(last, 'Y');
|
||||
}
|
||||
|
||||
private void reStoreYandI(StringBuffer sb)
|
||||
{
|
||||
String tmp = sb.toString();
|
||||
sb.delete(0, sb.length());
|
||||
sb.insert(0, tmp.replaceAll("I","i").replaceAll("Y","y"));
|
||||
}
|
||||
private void reStoreYandI(StringBuffer sb) {
|
||||
String tmp = sb.toString();
|
||||
sb.delete(0, sb.length());
|
||||
sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
|
||||
}
|
||||
|
||||
private boolean isVowel(char c)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
case 'e':
|
||||
case 'a':
|
||||
case 'o':
|
||||
case 'i':
|
||||
case 'u':
|
||||
case 'y':
|
||||
case 'è':
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
private boolean isVowel(char c) {
|
||||
switch (c) {
|
||||
case 'e':
|
||||
case 'a':
|
||||
case 'o':
|
||||
case 'i':
|
||||
case 'u':
|
||||
case 'y':
|
||||
case 'è':
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void setStemDictionary(Hashtable dict)
|
||||
{
|
||||
_stemDict = dict;
|
||||
}
|
||||
void setStemDictionary(HashMap dict) {
|
||||
_stemDict = dict;
|
||||
}
|
||||
|
||||
}
|
@ -20,123 +20,104 @@ import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.util.Hashtable;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Loads a text file and adds every line as an entry to a HashMap. Every line
 * should contain only one word. If the file is not found or on any error, an
 * empty table is returned.
 * <p>
 * Files are read with the platform default character encoding.
 *
 * @author Gerhard Schwarz
 */
public class WordlistLoader {
  /**
   * Loads a word list given as directory plus file name.
   *
   * @param path Path to the wordlist
   * @param wordfile Name of the wordlist
   * @return table mapping every word to itself; empty if either argument is
   *         null or the file cannot be read
   */
  public static HashMap getWordtable(String path, String wordfile) {
    if (path == null || wordfile == null) {
      return new HashMap();
    }
    return getWordtable(new File(path, wordfile));
  }

  /**
   * Loads a word list given as a complete path.
   *
   * @param wordfile Complete path to the wordlist
   * @return table mapping every word to itself; empty if the argument is
   *         null or the file cannot be read
   */
  public static HashMap getWordtable(String wordfile) {
    if (wordfile == null) {
      return new HashMap();
    }
    return getWordtable(new File(wordfile));
  }

  /**
   * Reads a stem dictionary. Each line contains:
   * word \t stem
   * (i.e. tab separated). Lines without a tab are skipped.
   *
   * @param wordstemfile file containing the dictionary
   * @return Stem dictionary that overrules the stemming algorithm; empty if
   *         the argument is null. On a read error the entries read so far are
   *         returned.
   */
  public static HashMap getStemDict(File wordstemfile) {
    HashMap result = new HashMap();
    if (wordstemfile == null) {
      return result;
    }
    LineNumberReader lnr = null;
    try {
      lnr = new LineNumberReader(new FileReader(wordstemfile));
      String line;
      while ((line = lnr.readLine()) != null) {
        String[] wordstem = line.split("\t", 2);
        // A line without a tab has no stem part; indexing [1] would throw.
        if (wordstem.length == 2) {
          result.put(wordstem[0], wordstem[1]);
        }
      }
    } catch (IOException ignored) {
      // Best effort: keep whatever was read before the error.
    } finally {
      closeQuietly(lnr);
    }
    return result;
  }

  /**
   * Loads a word list, one word per line.
   *
   * @param wordfile File containing the wordlist
   * @return table mapping every line of the file to itself; empty if the
   *         argument is null or any I/O error occurs
   */
  public static HashMap getWordtable(File wordfile) {
    if (wordfile == null) {
      return new HashMap();
    }
    HashMap result = new HashMap();
    LineNumberReader lnr = null;
    try {
      lnr = new LineNumberReader(new FileReader(wordfile));
      String word;
      while ((word = lnr.readLine()) != null) {
        result.put(word, word);
      }
    } catch (IOException e) {
      // On error, use an empty table (partial content is discarded).
      result = new HashMap();
    } finally {
      closeQuietly(lnr);
    }
    return result;
  }

  /**
   * Closes the reader if it was opened, ignoring a failure to close.
   *
   * @param reader reader to close, may be null
   */
  private static void closeQuietly(LineNumberReader reader) {
    if (reader == null) {
      return;
    }
    try {
      reader.close();
    } catch (IOException ignored) {
      // Nothing useful can be done; the content has already been consumed.
    }
  }
}
|
Loading…
x
Reference in New Issue
Block a user