mirror of https://github.com/apache/lucene.git
deprecate the analysis.nl.WordlistLoader class because it is not robust (it fails silently, returning an empty table on any error) and use analysis.WordlistLoader instead
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413180 13f79535-47bb-0310-9956-ffa450edef68
parent 8fa9ff435d
commit 2b9effb894
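The old loader's failure mode is the motivation for this change: on any I/O error it returned an empty table, so a misconfigured path produced an analyzer that silently filtered nothing. Below is a minimal caller-side sketch of the difference; the class name LoaderComparison and the file path are illustrative, and the APIs used are the ones shown in the diffs that follow.

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

public class LoaderComparison {
  public static void main(String[] args) {
    File stopwords = new File("dutch_stopwords.txt"); // illustrative path

    // Deprecated loader: an unreadable file yields an empty table,
    // indistinguishable from a legitimately empty word list.
    HashSet oldStyle = new HashSet(
        org.apache.lucene.analysis.nl.WordlistLoader.getWordtable(stopwords).keySet());
    System.out.println("old loader gave " + oldStyle.size() + " words (0 on error)");

    // Replacement loader: the error surfaces as an IOException.
    try {
      Set newStyle = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
      System.out.println("new loader gave " + newStyle.size() + " words");
    } catch (IOException e) {
      System.err.println("could not read stopword file: " + e.getMessage());
    }
  }
}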
CHANGES.txt

@@ -14,6 +14,10 @@ API Changes
 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
    changing of termText via setTermText().  (Yonik Seeley)
 
+2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated
+   and is supposed to be replaced with the WordlistLoader class in
+   package org.apache.lucene.analysis (Daniel Naber)
+
 Bug fixes
 
 1. Fixed the web application demo (built with "ant war-demo") which
org/apache/lucene/analysis/nl/DutchAnalyzer.java

@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.Reader;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -68,18 +69,20 @@ public class DutchAnalyzer extends Analyzer {
    */
   private Set excltable = new HashSet();
 
-  private Map _stemdict = new HashMap();
+  private Map stemdict = new HashMap();
 
 
   /**
-   * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}).
+   * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
+   * and a few default entries for the stem exclusion table.
+   *
    */
   public DutchAnalyzer() {
     stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
-    _stemdict.put("fiets", "fiets"); //otherwise fiet
-    _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
-    _stemdict.put("ei", "eier");
-    _stemdict.put("kind", "kinder");
+    stemdict.put("fiets", "fiets"); //otherwise fiet
+    stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
+    stemdict.put("ei", "eier");
+    stemdict.put("kind", "kinder");
   }
 
   /**
@@ -106,7 +109,12 @@ public class DutchAnalyzer extends Analyzer {
    * @param stopwords
    */
   public DutchAnalyzer(File stopwords) {
-    stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
+    try {
+      stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
@@ -129,17 +137,26 @@ public class DutchAnalyzer extends Analyzer {
    * Builds an exclusionlist from the words contained in the given file.
    */
   public void setStemExclusionTable(File exclusionlist) {
-    excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
+    try {
+      excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
    * Reads a stemdictionary file , that overrules the stemming algorithm
    * This is a textfile that contains per line
-   * word\tstem
-   * i.e: tabseperated
+   * <tt>word<b>\t</b>stem</tt>, i.e: two tab seperated words
    */
-  public void setStemDictionary(File stemdict) {
-    _stemdict = WordlistLoader.getStemDict(stemdict);
+  public void setStemDictionary(File stemdictFile) {
+    try {
+      stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
@@ -152,7 +169,7 @@ public class DutchAnalyzer extends Analyzer {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new StopFilter(result, stoptable);
-    result = new DutchStemFilter(result, excltable, _stemdict);
+    result = new DutchStemFilter(result, excltable, stemdict);
     return result;
   }
 }
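A note on the pattern above: the analyzer's constructor and setters cannot grow a "throws IOException" clause without breaking the public API, so the commit wraps the checked exception in a RuntimeException and leaves TODO markers to change the signatures later. A small sketch of what a caller now sees; the class name FailFastDemo and the file path are illustrative.

import java.io.File;

import org.apache.lucene.analysis.nl.DutchAnalyzer;

public class FailFastDemo {
  public static void main(String[] args) {
    try {
      // Before this commit, a bad path here produced an analyzer with an
      // empty stopword set; now construction fails fast.
      DutchAnalyzer analyzer = new DutchAnalyzer(new File("no_such_file.txt"));
    } catch (RuntimeException e) {
      // The underlying IOException rides along as the cause.
      System.err.println("stopword file could not be loaded: " + e.getCause());
    }
  }
}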
org/apache/lucene/analysis/nl/WordlistLoader.java

@@ -23,16 +23,19 @@ import java.io.LineNumberReader;
 import java.util.HashMap;
 
 /**
- * @author Gerhard Schwarz
  * <p/>
  * Loads a text file and adds every line as an entry to a Hashtable. Every line
  * should contain only one word. If the file is not found or on any error, an
  * empty table is returned.
+ *
+ * @author Gerhard Schwarz
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader} instead
  */
 public class WordlistLoader {
   /**
    * @param path Path to the wordlist
    * @param wordfile Name of the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(String path, String wordfile) {
     if (path == null || wordfile == null) {
@@ -43,6 +46,7 @@ public class WordlistLoader {
 
   /**
    * @param wordfile Complete path to the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(String wordfile) {
     if (wordfile == null) {
@@ -57,6 +61,7 @@ public class WordlistLoader {
    * i.e. tab seperated)
    *
    * @return Stem dictionary that overrules, the stemming algorithm
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getStemDict(File)} instead
    */
   public static HashMap getStemDict(File wordstemfile) {
     if (wordstemfile == null) {
@@ -78,6 +83,7 @@ public class WordlistLoader {
 
   /**
    * @param wordfile File containing the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(File wordfile) {
     if (wordfile == null) {
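For contrast, this is the failure mode that earned the deprecation, distilled into plain Java. This is an illustration of the swallow-and-return-empty pattern described in the class Javadoc above, not the actual Lucene source; the class name SilentLoader is mine.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;

public class SilentLoader {
  // Loads one word per line into a table, as the deprecated class does.
  public static HashMap getWordtable(File wordfile) {
    HashMap result = new HashMap();
    try {
      BufferedReader br = new BufferedReader(new FileReader(wordfile));
      String word;
      while ((word = br.readLine()) != null) {
        result.put(word, word);
      }
      br.close();
    } catch (IOException e) {
      // Swallowed: the caller cannot tell "empty word list" from "read failed".
    }
    return result;
  }
}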
org/apache/lucene/analysis/WordlistLoader.java

@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.HashMap;
 import java.util.HashSet;
 
 /**
@@ -84,4 +85,35 @@ public class WordlistLoader {
     return result;
   }
 
+  /**
+   * Reads a stem dictionary. Each line contains:
+   * <pre>word<b>\t</b>stem</pre>
+   * (i.e. two tab seperated words)
+   *
+   * @return stem dictionary that overrules the stemming algorithm
+   * @throws IOException
+   */
+  public static HashMap getStemDict(File wordstemfile) throws IOException {
+    if (wordstemfile == null)
+      throw new NullPointerException("wordstemfile may not be null");
+    HashMap result = new HashMap();
+    BufferedReader br = null;
+    FileReader fr = null;
+    try {
+      fr = new FileReader(wordstemfile);
+      br = new BufferedReader(fr);
+      String line;
+      while ((line = br.readLine()) != null) {
+        String[] wordstem = line.split("\t", 2);
+        result.put(wordstem[0], wordstem[1]);
+      }
+    } finally {
+      if (fr != null)
+        fr.close();
+      if (br != null)
+        br.close();
+    }
+    return result;
+  }
+
 }
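The new getStemDict rounds out the migration: errors propagate as IOException, and a well-formed input file is one tab-separated word/stem pair per line. A small usage sketch follows; the class name StemDictDemo and the file path are illustrative, and the two entries are taken from the defaults DutchAnalyzer installs.

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;

import org.apache.lucene.analysis.WordlistLoader;

public class StemDictDemo {
  public static void main(String[] args) throws IOException {
    // Write a two-entry stem dictionary: one "word<TAB>stem" pair per line.
    File dictFile = new File("stemdict.txt"); // illustrative path
    FileWriter out = new FileWriter(dictFile);
    out.write("fiets\tfiets\n");   // keep "fiets" as-is (otherwise stemmed to "fiet")
    out.write("kind\tkinder\n");   // map "kind" to the stem "kinder"
    out.close();

    // Unlike the deprecated loader, a read failure here throws
    // IOException instead of returning an empty map.
    HashMap stemdict = WordlistLoader.getStemDict(dictFile);
    System.out.println(stemdict.get("fiets")); // prints "fiets"
    System.out.println(stemdict.get("kind"));  // prints "kinder"
  }
}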