deprecate the analysis.nl.WordlistLoader class because it's not robust (fails silently) and use analysis.WordlistLoader instead

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413180 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Daniel Naber 2006-06-09 22:15:47 +00:00
parent 8fa9ff435d
commit 2b9effb894
4 changed files with 73 additions and 14 deletions

View File

@ -14,6 +14,10 @@ API Changes
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
changing of termText via setTermText(). (Yonik Seeley) changing of termText via setTermText(). (Yonik Seeley)
2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated
and is supposed to be replaced with the WordlistLoader class in
package org.apache.lucene.analysis (Daniel Naber)
Bug fixes Bug fixes
1. Fixed the web application demo (built with "ant war-demo") which 1. Fixed the web application demo (built with "ant war-demo") which

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File; import java.io.File;
import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -68,18 +69,20 @@ public class DutchAnalyzer extends Analyzer {
*/ */
private Set excltable = new HashSet(); private Set excltable = new HashSet();
private Map _stemdict = new HashMap(); private Map stemdict = new HashMap();
/** /**
* Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}). * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
* and a few default entries for the stem exclusion table.
*
*/ */
public DutchAnalyzer() { public DutchAnalyzer() {
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS); stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
_stemdict.put("fiets", "fiets"); //otherwise fiet stemdict.put("fiets", "fiets"); //otherwise fiet
_stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
_stemdict.put("ei", "eier"); stemdict.put("ei", "eier");
_stemdict.put("kind", "kinder"); stemdict.put("kind", "kinder");
} }
/** /**
@ -106,7 +109,12 @@ public class DutchAnalyzer extends Analyzer {
* @param stopwords * @param stopwords
*/ */
public DutchAnalyzer(File stopwords) { public DutchAnalyzer(File stopwords) {
stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet()); try {
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
} }
/** /**
@ -129,17 +137,26 @@ public class DutchAnalyzer extends Analyzer {
* Builds an exclusionlist from the words contained in the given file. * Builds an exclusionlist from the words contained in the given file.
*/ */
public void setStemExclusionTable(File exclusionlist) { public void setStemExclusionTable(File exclusionlist) {
excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet()); try {
excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
} }
/** /**
* Reads a stemdictionary file , that overrules the stemming algorithm * Reads a stemdictionary file , that overrules the stemming algorithm
* This is a textfile that contains per line * This is a textfile that contains per line
* word\tstem * <tt>word<b>\t</b>stem</tt>, i.e. two tab separated words
* i.e: tabseperated
*/ */
public void setStemDictionary(File stemdict) { public void setStemDictionary(File stemdictFile) {
_stemdict = WordlistLoader.getStemDict(stemdict); try {
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
} catch (IOException e) {
// TODO: throw IOException
throw new RuntimeException(e);
}
} }
/** /**
@ -152,7 +169,7 @@ public class DutchAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer(reader); TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result); result = new StandardFilter(result);
result = new StopFilter(result, stoptable); result = new StopFilter(result, stoptable);
result = new DutchStemFilter(result, excltable, _stemdict); result = new DutchStemFilter(result, excltable, stemdict);
return result; return result;
} }
} }

View File

@ -23,16 +23,19 @@ import java.io.LineNumberReader;
import java.util.HashMap; import java.util.HashMap;
/** /**
* @author Gerhard Schwarz
* <p/> * <p/>
* Loads a text file and adds every line as an entry to a Hashtable. Every line * Loads a text file and adds every line as an entry to a Hashtable. Every line
* should contain only one word. If the file is not found or on any error, an * should contain only one word. If the file is not found or on any error, an
* empty table is returned. * empty table is returned.
*
* @author Gerhard Schwarz
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader} instead
*/ */
public class WordlistLoader { public class WordlistLoader {
/** /**
* @param path Path to the wordlist * @param path Path to the wordlist
* @param wordfile Name of the wordlist * @param wordfile Name of the wordlist
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/ */
public static HashMap getWordtable(String path, String wordfile) { public static HashMap getWordtable(String path, String wordfile) {
if (path == null || wordfile == null) { if (path == null || wordfile == null) {
@ -43,6 +46,7 @@ public class WordlistLoader {
/** /**
* @param wordfile Complete path to the wordlist * @param wordfile Complete path to the wordlist
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/ */
public static HashMap getWordtable(String wordfile) { public static HashMap getWordtable(String wordfile) {
if (wordfile == null) { if (wordfile == null) {
@ -57,6 +61,7 @@ public class WordlistLoader {
* i.e. tab separated) * i.e. tab separated)
* *
* @return Stem dictionary that overrules, the stemming algorithm * @return Stem dictionary that overrules, the stemming algorithm
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getStemDict(File)} instead
*/ */
public static HashMap getStemDict(File wordstemfile) { public static HashMap getStemDict(File wordstemfile) {
if (wordstemfile == null) { if (wordstemfile == null) {
@ -78,6 +83,7 @@ public class WordlistLoader {
/** /**
* @param wordfile File containing the wordlist * @param wordfile File containing the wordlist
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
*/ */
public static HashMap getWordtable(File wordfile) { public static HashMap getWordtable(File wordfile) {
if (wordfile == null) { if (wordfile == null) {

View File

@ -21,6 +21,7 @@ import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
/** /**
@ -84,4 +85,35 @@ public class WordlistLoader {
return result; return result;
} }
/**
 * Reads a stem dictionary. Each line contains:
 * <pre>word<b>\t</b>stem</pre>
 * (i.e. two tab separated words)
 *
 * @param wordstemfile the dictionary file; each line must hold a word,
 *        a tab, and the stem that overrules the stemming algorithm
 * @return stem dictionary (word -&gt; stem) that overrules the stemming algorithm
 * @throws IOException if the file cannot be opened or read
 * @throws NullPointerException if <code>wordstemfile</code> is <code>null</code>
 */
public static HashMap getStemDict(File wordstemfile) throws IOException {
  if (wordstemfile == null)
    throw new NullPointerException("wordstemfile may not be null");
  HashMap result = new HashMap();
  BufferedReader br = null;
  FileReader fr = null;
  try {
    fr = new FileReader(wordstemfile);
    br = new BufferedReader(fr);
    String line;
    while ((line = br.readLine()) != null) {
      // limit=2 keeps any further tabs inside the stem part
      String[] wordstem = line.split("\t", 2);
      result.put(wordstem[0], wordstem[1]);
    }
  } finally {
    // Close the outermost wrapper first: BufferedReader.close() also
    // closes the underlying FileReader. Closing fr before br (as the
    // original did) leaks br if fr.close() throws. Only close fr
    // directly when the BufferedReader was never constructed.
    if (br != null)
      br.close();
    else if (fr != null)
      fr.close();
  }
  return result;
}
} }