mirror of https://github.com/apache/lucene.git
deprecate the analysis.nl.WordlistLoader class because it's not robust (fails silently) and use analysis.WordlistLoader instead
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413180 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8fa9ff435d
commit
2b9effb894
|
@ -14,6 +14,10 @@ API Changes
|
|||
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
|
||||
changing of termText via setTermText(). (Yonik Seeley)
|
||||
|
||||
2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated
|
||||
and is supposed to be replaced with the WordlistLoader class in
|
||||
package org.apache.lucene.analysis (Daniel Naber)
|
||||
|
||||
Bug fixes
|
||||
|
||||
1. Fixed the web application demo (built with "ant war-demo") which
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
|
|||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -68,18 +69,20 @@ public class DutchAnalyzer extends Analyzer {
|
|||
*/
|
||||
private Set excltable = new HashSet();
|
||||
|
||||
private Map _stemdict = new HashMap();
|
||||
private Map stemdict = new HashMap();
|
||||
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}).
|
||||
* Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
|
||||
* and a few default entries for the stem exclusion table.
|
||||
*
|
||||
*/
|
||||
public DutchAnalyzer() {
|
||||
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
|
||||
_stemdict.put("fiets", "fiets"); //otherwise fiet
|
||||
_stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||
_stemdict.put("ei", "eier");
|
||||
_stemdict.put("kind", "kinder");
|
||||
stemdict.put("fiets", "fiets"); //otherwise fiet
|
||||
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||
stemdict.put("ei", "eier");
|
||||
stemdict.put("kind", "kinder");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -106,7 +109,12 @@ public class DutchAnalyzer extends Analyzer {
|
|||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer(File stopwords) {
|
||||
stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
|
||||
try {
|
||||
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -129,17 +137,26 @@ public class DutchAnalyzer extends Analyzer {
|
|||
* Builds an exclusionlist from the words contained in the given file.
|
||||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) {
|
||||
excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
|
||||
try {
|
||||
excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a stemdictionary file , that overrules the stemming algorithm
|
||||
* This is a textfile that contains per line
|
||||
* word\tstem
|
||||
* i.e: tabseperated
|
||||
* <tt>word<b>\t</b>stem</tt>, i.e: two tab seperated words
|
||||
*/
|
||||
public void setStemDictionary(File stemdict) {
|
||||
_stemdict = WordlistLoader.getStemDict(stemdict);
|
||||
public void setStemDictionary(File stemdictFile) {
|
||||
try {
|
||||
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -152,7 +169,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
TokenStream result = new StandardTokenizer(reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new StopFilter(result, stoptable);
|
||||
result = new DutchStemFilter(result, excltable, _stemdict);
|
||||
result = new DutchStemFilter(result, excltable, stemdict);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,16 +23,19 @@ import java.io.LineNumberReader;
|
|||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* @author Gerhard Schwarz
|
||||
* <p/>
|
||||
* Loads a text file and adds every line as an entry to a Hashtable. Every line
|
||||
* should contain only one word. If the file is not found or on any error, an
|
||||
* empty table is returned.
|
||||
*
|
||||
* @author Gerhard Schwarz
|
||||
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader} instead
|
||||
*/
|
||||
public class WordlistLoader {
|
||||
/**
|
||||
* @param path Path to the wordlist
|
||||
* @param wordfile Name of the wordlist
|
||||
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
|
||||
*/
|
||||
public static HashMap getWordtable(String path, String wordfile) {
|
||||
if (path == null || wordfile == null) {
|
||||
|
@ -43,6 +46,7 @@ public class WordlistLoader {
|
|||
|
||||
/**
|
||||
* @param wordfile Complete path to the wordlist
|
||||
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
|
||||
*/
|
||||
public static HashMap getWordtable(String wordfile) {
|
||||
if (wordfile == null) {
|
||||
|
@ -57,6 +61,7 @@ public class WordlistLoader {
|
|||
* i.e. tab seperated)
|
||||
*
|
||||
* @return Stem dictionary that overrules, the stemming algorithm
|
||||
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getStemDict(File)} instead
|
||||
*/
|
||||
public static HashMap getStemDict(File wordstemfile) {
|
||||
if (wordstemfile == null) {
|
||||
|
@ -78,6 +83,7 @@ public class WordlistLoader {
|
|||
|
||||
/**
|
||||
* @param wordfile File containing the wordlist
|
||||
* @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
|
||||
*/
|
||||
public static HashMap getWordtable(File wordfile) {
|
||||
if (wordfile == null) {
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.File;
|
|||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
|
@ -84,4 +85,35 @@ public class WordlistLoader {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a stem dictionary. Each line contains:
|
||||
* <pre>word<b>\t</b>stem</pre>
|
||||
* (i.e. two tab seperated words)
|
||||
*
|
||||
* @return stem dictionary that overrules the stemming algorithm
|
||||
* @throws IOException
|
||||
*/
|
||||
public static HashMap getStemDict(File wordstemfile) throws IOException {
|
||||
if (wordstemfile == null)
|
||||
throw new NullPointerException("wordstemfile may not be null");
|
||||
HashMap result = new HashMap();
|
||||
BufferedReader br = null;
|
||||
FileReader fr = null;
|
||||
try {
|
||||
fr = new FileReader(wordstemfile);
|
||||
br = new BufferedReader(fr);
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
String[] wordstem = line.split("\t", 2);
|
||||
result.put(wordstem[0], wordstem[1]);
|
||||
}
|
||||
} finally {
|
||||
if (fr != null)
|
||||
fr.close();
|
||||
if (br != null)
|
||||
br.close();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue