deprecate the analysis.nl.WordlistLoader class because it's not robust (fails silently) and use analysis.WordlistLoader instead

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413180 13f79535-47bb-0310-9956-ffa450edef68
Daniel Naber 2006-06-09 22:15:47 +00:00
parent 8fa9ff435d
commit 2b9effb894
4 changed files with 73 additions and 14 deletions
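The deprecated loader in org.apache.lucene.analysis.nl swallows I/O problems and returns an empty table, while the core loader in org.apache.lucene.analysis reports them as an IOException. A minimal caller-side sketch of the migration (the class name and stopword path are hypothetical):

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

// Hypothetical caller illustrating the migration; "dutch_stopwords.txt" is a made-up path.
public class WordlistMigration {
  public static void main(String[] args) throws IOException {
    File stopwordFile = new File("dutch_stopwords.txt");

    // Deprecated loader: if the file is missing or unreadable it returns an
    // empty table, so stopword filtering is silently disabled.
    Set oldStopSet = new HashSet(
        org.apache.lucene.analysis.nl.WordlistLoader.getWordtable(stopwordFile).keySet());

    // Core loader: the same problem surfaces as an IOException instead.
    Set newStopSet = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwordFile);

    System.out.println(oldStopSet.size() + " / " + newStopSet.size());
  }
}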

View File

@ -14,6 +14,10 @@ API Changes
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
changing of termText via setTermText(). (Yonik Seeley)
2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated
and should be replaced by the WordlistLoader class in
package org.apache.lucene.analysis. (Daniel Naber)
Bug fixes
1. Fixed the web application demo (built with "ant war-demo") which

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
@ -68,18 +69,20 @@ public class DutchAnalyzer extends Analyzer {
*/
private Set excltable = new HashSet();
private Map _stemdict = new HashMap();
private Map stemdict = new HashMap();
/**
* Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}).
* Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
* and a few default entries for the stem override dictionary.
*
*/
public DutchAnalyzer() {
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
_stemdict.put("fiets", "fiets"); //otherwise fiet
_stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
_stemdict.put("ei", "eier");
_stemdict.put("kind", "kinder");
stemdict.put("fiets", "fiets"); //otherwise fiet
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
stemdict.put("ei", "eier");
stemdict.put("kind", "kinder");
}
/**
@ -106,7 +109,12 @@ public class DutchAnalyzer extends Analyzer {
* @param stopwords
*/
public DutchAnalyzer(File stopwords) {
stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
try {
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
}
/**
@ -129,17 +137,26 @@ public class DutchAnalyzer extends Analyzer {
* Builds an exclusion list from the words contained in the given file.
*/
public void setStemExclusionTable(File exclusionlist) {
excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
try {
excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
}
/**
* Reads a stem dictionary file that overrules the stemming algorithm.
* This is a text file that contains one entry per line:
* word\tstem
* i.e: tabseperated
* <tt>word<b>\t</b>stem</tt>, i.e. two tab-separated words
*/
public void setStemDictionary(File stemdict) {
_stemdict = WordlistLoader.getStemDict(stemdict);
public void setStemDictionary(File stemdictFile) {
try {
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
}
/**
@ -152,7 +169,7 @@ public class DutchAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new StopFilter(result, stoptable);
result = new DutchStemFilter(result, excltable, _stemdict);
result = new DutchStemFilter(result, excltable, stemdict);
return result;
}
}
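With these changes a missing or unreadable wordlist is no longer absorbed into an empty set; until the constructor and setters can declare IOException (see the TODOs above), the underlying exception is wrapped in a RuntimeException. A small sketch of the observable difference, using a hypothetical file name that deliberately does not exist:

import java.io.File;
import org.apache.lucene.analysis.nl.DutchAnalyzer;

// Hypothetical setup code; "no_such_stopwords.txt" does not exist on disk.
public class DutchAnalyzerSetup {
  public static void main(String[] args) {
    File missing = new File("no_such_stopwords.txt");
    try {
      // Before this commit the analyzer silently ended up with an empty stop
      // set; now the wrapped IOException makes the misconfiguration visible.
      DutchAnalyzer analyzer = new DutchAnalyzer(missing);
      System.out.println("analyzer created: " + analyzer);
    } catch (RuntimeException e) {
      System.err.println("stopword file could not be read: " + e.getCause());
    }
  }
}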

View File

@ -23,16 +23,19 @@ import java.io.LineNumberReader;
import java.util.HashMap;
/**
* @author Gerhard Schwarz
* <p/>
* Loads a text file and adds every line as an entry to a Hashtable. Every line
* should contain only one word. If the file is not found or on any error, an
* empty table is returned.
*
* @author Gerhard Schwarz
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader} instead
*/
public class WordlistLoader {
/**
* @param path Path to the wordlist
* @param wordfile Name of the wordlist
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/
public static HashMap getWordtable(String path, String wordfile) {
if (path == null || wordfile == null) {
@ -43,6 +46,7 @@ public class WordlistLoader {
/**
* @param wordfile Complete path to the wordlist
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/
public static HashMap getWordtable(String wordfile) {
if (wordfile == null) {
@ -57,6 +61,7 @@ public class WordlistLoader {
* i.e. tab separated)
*
* @return Stem dictionary that overrules the stemming algorithm
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getStemDict(File)} instead
*/
public static HashMap getStemDict(File wordstemfile) {
if (wordstemfile == null) {
@ -78,6 +83,7 @@ public class WordlistLoader {
/**
* @param wordfile File containing the wordlist
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/
public static HashMap getWordtable(File wordfile) {
if (wordfile == null) {

View File

@ -21,6 +21,7 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
/**
@ -84,4 +85,35 @@ public class WordlistLoader {
return result;
}
/**
* Reads a stem dictionary. Each line contains:
* <pre>word<b>\t</b>stem</pre>
* (i.e. two tab-separated words)
*
* @return stem dictionary that overrules the stemming algorithm
* @throws IOException
*/
public static HashMap getStemDict(File wordstemfile) throws IOException {
if (wordstemfile == null)
throw new NullPointerException("wordstemfile may not be null");
HashMap result = new HashMap();
BufferedReader br = null;
FileReader fr = null;
try {
fr = new FileReader(wordstemfile);
br = new BufferedReader(fr);
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
if (fr != null)
fr.close();
if (br != null)
br.close();
}
return result;
}
}
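A usage sketch for the new getStemDict; the class name, temp file, and dictionary entries below are illustrative:

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import org.apache.lucene.analysis.WordlistLoader;

// Hypothetical example; the temp file and its entries are made up for illustration.
public class StemDictExample {
  public static void main(String[] args) throws IOException {
    File dict = File.createTempFile("stemdict", ".txt");
    FileWriter writer = new FileWriter(dict);
    // One "word<TAB>stem" entry per line, as described in the javadoc above.
    writer.write("fiets\tfiets\n");
    writer.write("ei\teier\n");
    writer.close();

    HashMap stemOverrides = WordlistLoader.getStemDict(dict);
    System.out.println(stemOverrides.get("ei"));  // prints "eier"
  }
}

DutchAnalyzer.setStemDictionary(File) now delegates to this method, so the same tab-separated file format applies there.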