From 2b9effb894dba18b05972a0bca6e94614d691d63 Mon Sep 17 00:00:00 2001
From: Daniel Naber
Date: Fri, 9 Jun 2006 22:15:47 +0000
Subject: [PATCH] deprecate the analysis.nl.WordlistLoader class because it's
 not robust (fails silently) and use analysis.WordlistLoader instead

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413180 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                |  4 ++
 .../lucene/analysis/nl/DutchAnalyzer.java  | 43 +++++++++++++------
 .../lucene/analysis/nl/WordlistLoader.java |  8 +++-
 .../lucene/analysis/WordlistLoader.java    | 32 ++++++++++++++
 4 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4d32f77d5c2..16775619299 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -14,6 +14,10 @@ API Changes
  1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
     changing of termText via setTermText().  (Yonik Seeley)
 
+ 2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated
+    and should be replaced with the WordlistLoader class in
+    package org.apache.lucene.analysis (Daniel Naber)
+
 Bug fixes
 
  1. Fixed the web application demo (built with "ant war-demo") which
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index f1194f30f28..a035f1c6974 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.Reader;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -68,18 +69,20 @@ public class DutchAnalyzer extends Analyzer {
    */
   private Set excltable = new HashSet();
 
-  private Map _stemdict = new HashMap();
+  private Map stemdict = new HashMap();
 
   /**
-   * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS}).
+   * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
+   * and a few default entries for the stem dictionary.
+   *
    */
   public DutchAnalyzer() {
     stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
-    _stemdict.put("fiets", "fiets"); //otherwise fiet
-    _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
-    _stemdict.put("ei", "eier");
-    _stemdict.put("kind", "kinder");
+    stemdict.put("fiets", "fiets"); //otherwise fiet
+    stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
+    stemdict.put("ei", "eier");
+    stemdict.put("kind", "kinder");
   }
 
   /**
@@ -106,7 +109,12 @@ public class DutchAnalyzer extends Analyzer {
    * @param stopwords
    */
   public DutchAnalyzer(File stopwords) {
-    stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
+    try {
+      stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
@@ -129,17 +137,26 @@ public class DutchAnalyzer extends Analyzer {
    * Builds an exclusionlist from the words contained in the given file.
    */
   public void setStemExclusionTable(File exclusionlist) {
-    excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
+    try {
+      excltable = org.apache.lucene.analysis.WordlistLoader.getWordSet(exclusionlist);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
    * Reads a stemdictionary file , that overrules the stemming algorithm
    * This is a textfile that contains per line
-   * word\tstem
-   * i.e: tabseperated
+   * word\tstem, i.e. two tab separated words
    */
-  public void setStemDictionary(File stemdict) {
-    _stemdict = WordlistLoader.getStemDict(stemdict);
+  public void setStemDictionary(File stemdictFile) {
+    try {
+      stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
+    } catch (IOException e) {
+      // TODO: throw IOException
+      throw new RuntimeException(e);
+    }
   }
 
   /**
@@ -152,7 +169,7 @@ public class DutchAnalyzer extends Analyzer {
     TokenStream result = new StandardTokenizer(reader);
     result = new StandardFilter(result);
     result = new StopFilter(result, stoptable);
-    result = new DutchStemFilter(result, excltable, _stemdict);
+    result = new DutchStemFilter(result, excltable, stemdict);
     return result;
   }
 }
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
index 7fc3d26ba46..285defbd537 100644
--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
@@ -23,16 +23,19 @@ import java.io.LineNumberReader;
 import java.util.HashMap;
 
 /**
- * @author Gerhard Schwarz
  *
  * Loads a text file and adds every line as an entry to a Hashtable. Every line
  * should contain only one word. If the file is not found or on any error, an
  * empty table is returned.
+ *
+ * @author Gerhard Schwarz
+ * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader} instead
  */
 public class WordlistLoader {
 
   /**
    * @param path Path to the wordlist
    * @param wordfile Name of the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(String path, String wordfile) {
     if (path == null || wordfile == null) {
@@ -43,6 +46,7 @@ public class WordlistLoader {
 
   /**
    * @param wordfile Complete path to the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(String wordfile) {
     if (wordfile == null) {
@@ -57,6 +61,7 @@ public class WordlistLoader {
    * i.e. tab seperated)
    *
    * @return Stem dictionary that overrules, the stemming algorithm
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getStemDict(File)} instead
    */
   public static HashMap getStemDict(File wordstemfile) {
     if (wordstemfile == null) {
@@ -78,6 +83,7 @@ public class WordlistLoader {
 
   /**
    * @param wordfile File containing the wordlist
+   * @deprecated use {@link org.apache.lucene.analysis.WordlistLoader#getWordSet(File)} instead
    */
   public static HashMap getWordtable(File wordfile) {
     if (wordfile == null) {
diff --git a/src/java/org/apache/lucene/analysis/WordlistLoader.java b/src/java/org/apache/lucene/analysis/WordlistLoader.java
index 02ddc02ea44..6d20eaa61b6 100644
--- a/src/java/org/apache/lucene/analysis/WordlistLoader.java
+++ b/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.HashMap;
 import java.util.HashSet;
 
 /**
@@ -84,4 +85,35 @@ public class WordlistLoader {
     return result;
   }
 
+  /**
+   * Reads a stem dictionary. Each line contains:
+   * word\tstem
+   * (i.e. two tab separated words)
+   *
+   * @return stem dictionary that overrules the stemming algorithm
+   * @throws IOException
+   */
+  public static HashMap getStemDict(File wordstemfile) throws IOException {
+    if (wordstemfile == null)
+      throw new NullPointerException("wordstemfile may not be null");
+    HashMap result = new HashMap();
+    BufferedReader br = null;
+    FileReader fr = null;
+    try {
+      fr = new FileReader(wordstemfile);
+      br = new BufferedReader(fr);
+      String line;
+      while ((line = br.readLine()) != null) {
+        String[] wordstem = line.split("\t", 2);
+        result.put(wordstem[0], wordstem[1]);
+      }
+    } finally {
+      if (fr != null)
+        fr.close();
+      if (br != null)
+        br.close();
+    }
+    return result;
+  }
+
 }
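
For callers migrating their own code along the same lines as the DutchAnalyzer changes in this patch, a minimal, illustrative sketch follows (not part of the patch; the WordlistMigrationExample class and the file names are hypothetical). It contrasts the deprecated loader, which silently returns an empty table on any error, with the replacement in org.apache.lucene.analysis, which reports problems as an IOException:

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

public class WordlistMigrationExample {
  public static void main(String[] args) throws IOException {
    File stopwordFile = new File("dutch-stopwords.txt");  // hypothetical word list, one word per line
    File stemDictFile = new File("dutch-stemdict.txt");   // hypothetical stem overrides, one word\tstem pair per line

    // Old, deprecated loader: any I/O problem yields an empty table.
    // HashMap table = org.apache.lucene.analysis.nl.WordlistLoader.getWordtable(stopwordFile);
    // Set stopSet = new HashSet(table.keySet());

    // New loader: I/O problems surface as an IOException instead of being swallowed.
    Set stopSet = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwordFile);
    HashMap stemOverrides = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemDictFile);

    System.out.println(stopSet.size() + " stop words, " + stemOverrides.size() + " stem overrides loaded");
  }
}

Within DutchAnalyzer itself the patch wraps the IOException in a RuntimeException for now (see the TODO comments above), so the existing constructor and setter signatures remain unchanged.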
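
The DutchAnalyzer changes also alter its failure mode: a missing or unreadable stop word, exclusion, or stem dictionary file now results in a RuntimeException wrapping the IOException rather than a silently empty table. A small usage sketch under that assumption (the file name is hypothetical, and the token iteration assumes the TokenStream.next()/Token.termText() API of this Lucene era):

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.nl.DutchAnalyzer;

public class DutchAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    DutchAnalyzer analyzer = new DutchAnalyzer();

    // Optional: override the stemmer for individual words from a word\tstem file.
    // The path is hypothetical; an unreadable file now triggers a RuntimeException.
    analyzer.setStemDictionary(new File("dutch-stemdict.txt"));

    TokenStream stream = analyzer.tokenStream("content", new StringReader("fietsen en bromfietsen"));
    Token token;
    while ((token = stream.next()) != null) {
      System.out.println(token.termText());
    }
  }
}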