From dc6b4b653396dc77a1e52639bf575078d11ada2f Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 10 Nov 2011 01:21:25 +0000 Subject: [PATCH] LUCENE-2564: Cut over WordListLoader to CharArrayMap/Set and use CharSetDecoder to detect encoding problems early git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200080 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 5 + .../java/org/apache/lucene/util/IOUtils.java | 101 ++++++- .../lucene/analysis/br/BrazilianAnalyzer.java | 6 +- .../lucene/analysis/core/StopAnalyzer.java | 7 +- .../lucene/analysis/cz/CzechAnalyzer.java | 6 +- .../lucene/analysis/da/DanishAnalyzer.java | 5 +- .../lucene/analysis/de/GermanAnalyzer.java | 5 +- .../lucene/analysis/es/SpanishAnalyzer.java | 5 +- .../lucene/analysis/fi/FinnishAnalyzer.java | 5 +- .../lucene/analysis/fr/FrenchAnalyzer.java | 5 +- .../lucene/analysis/gl/GalicianAnalyzer.java | 7 +- .../lucene/analysis/hu/HungarianAnalyzer.java | 5 +- .../lucene/analysis/it/ItalianAnalyzer.java | 5 +- .../lucene/analysis/lv/LatvianAnalyzer.java | 6 +- .../lucene/analysis/nl/DutchAnalyzer.java | 5 +- .../lucene/analysis/no/NorwegianAnalyzer.java | 5 +- .../analysis/pt/PortugueseAnalyzer.java | 5 +- .../lucene/analysis/ru/RussianAnalyzer.java | 7 +- .../analysis/standard/ClassicAnalyzer.java | 5 +- .../analysis/standard/StandardAnalyzer.java | 5 +- .../lucene/analysis/sv/SwedishAnalyzer.java | 5 +- .../analysis/util/StopwordAnalyzerBase.java | 63 ++++- .../lucene/analysis/util/WordlistLoader.java | 261 ++++++------------ .../analysis/util/TestCharArraySet.java | 2 +- .../analysis/util/TestWordlistLoader.java | 12 +- .../cn/smart/SmartChineseAnalyzer.java | 10 +- .../lucene/analysis/pl/PolishAnalyzer.java | 5 +- 27 files changed, 327 insertions(+), 236 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index d0caa19bf15..9d7d9654013 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -99,6 +99,11 @@ Changes in backwards compatibility policy * LUCENE-3558: Moved NRTManager & NRTManagerReopenThread into lucene core o.a.l.search. (Simon Willnauer) + + * LUCENE-2564: WordListLoader is now flaged as @lucene.internal. All methods in + WordListLoader now return CharArraySet/Map and expect Reader instances for + efficiency. Utilities to open Readers from Files, InputStreams or Java + resources were added to IOUtils. (Simon Willnauer, Robert Muir) New Features diff --git a/lucene/src/java/org/apache/lucene/util/IOUtils.java b/lucene/src/java/org/apache/lucene/util/IOUtils.java index 73d9dc6e571..8508c1803bc 100644 --- a/lucene/src/java/org/apache/lucene/util/IOUtils.java +++ b/lucene/src/java/org/apache/lucene/util/IOUtils.java @@ -17,15 +17,35 @@ package org.apache.lucene.util; * limitations under the License. */ +import java.io.BufferedReader; import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.lang.reflect.Method; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; /** This class emulates the new Java 7 "Try-With-Resources" statement. * Remove once Lucene is on Java 7. 
* @lucene.internal */ public final class IOUtils { - + + /** + * UTF-8 charset string + * @see Charset#forName(String) + */ + public static final String UTF_8 = "UTF-8"; + + /** + * UTF-8 {@link Charset} instance to prevent repeated + * {@link Charset#forName(String)} lookups + */ + public static final Charset CHARSET_UTF_8 = Charset.forName("UTF-8"); private IOUtils() {} // no instance /** @@ -220,5 +240,84 @@ public final class IOUtils { } } } + + /** + * Wraps the given {@link InputStream} in a reader using a {@link CharsetDecoder}. + * Unlike Java's defaults, this reader will throw an exception if it detects + * that the input does not match the expected {@link Charset}. + * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * and to detect character set problems early. However, it is not recommended to use them as + * general-purpose readers. + * + * @param stream the stream to wrap in a reader + * @param charSet the expected charset + * @return a wrapping reader + */ + public static Reader getDecodingReader(InputStream stream, Charset charSet) { + final CharsetDecoder charSetDecoder = charSet.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + return new BufferedReader(new InputStreamReader(stream, charSetDecoder)); + } + + /** + * Opens a Reader for the given {@link File} using a {@link CharsetDecoder}. + * Unlike Java's defaults, this reader will throw an exception if it detects + * that the input does not match the expected {@link Charset}. + * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * and to detect character set problems early. However, it is not recommended to use them as + * general-purpose readers. + * @param file the file to open a reader on + * @param charSet the expected charset + * @return a reader to read the given file + */ + public static Reader getDecodingReader(File file, Charset charSet) throws IOException { + FileInputStream stream = null; + boolean success = false; + try { + stream = new FileInputStream(file); + final Reader reader = getDecodingReader(stream, charSet); + success = true; + return reader; + + } finally { + if (!success) { + IOUtils.close(stream); + } + } + } + + /** + * Opens a Reader for the given resource using a {@link CharsetDecoder}. + * Unlike Java's defaults, this reader will throw an exception if it detects + * that the input does not match the expected {@link Charset}. + * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * to detect character set problems. However, its not recommended to use as a common purpose + * reader. + * @param clazz the class used to locate the resource + * @param resource the resource name to load + * @param charSet the expected charset + * @return a reader to read the given file + * + */ + public static Reader getDecodingReader(Class clazz, String resource, Charset charSet) throws IOException { + InputStream stream = null; + boolean success = false; + try { + stream = clazz + .getResourceAsStream(resource); + final Reader reader = getDecodingReader(stream, charSet); + success = true; + return reader; + } finally { + if (!success) { + IOUtils.close(stream); + } + } + } + } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 2ba53153998..23ed34b04f0 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -64,9 +65,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class, - DEFAULT_STOPWORD_FILE, "#"), false)); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java index 75fb8c4c3aa..f83f3a7c38b 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java @@ -57,8 +57,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { "they", "this", "to", "was", "will", "with" ); final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, - stopWords.size(), false); - stopSet.addAll(stopWords); + stopWords, false); ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } @@ -82,7 +81,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { * @param matchVersion See above * @param stopwordsFile File to load stop words from */ public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwordsFile)); + this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. 
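For orientation, the loading pattern the analyzers are cut over to in this patch (IOUtils.getDecodingReader plus the Version-aware WordlistLoader methods) boils down to the sketch below. It is not part of the patch; the class name MyAnalyzerStopwords and the stopwords.txt resource are placeholders.

    import java.io.IOException;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.WordlistLoader;
    import org.apache.lucene.util.IOUtils;
    import org.apache.lucene.util.Version;

    // Illustrative only -- mirrors the analyzer static initializers above.
    public final class MyAnalyzerStopwords {
      static CharArraySet loadDefaultStopSet() throws IOException {
        // getDecodingReader configures the CharsetDecoder to REPORT malformed or
        // unmappable bytes, so a mis-encoded stopword file fails fast here instead
        // of being silently decoded with replacement characters.
        return WordlistLoader.getWordSet(
            IOUtils.getDecodingReader(MyAnalyzerStopwords.class, "stopwords.txt", IOUtils.CHARSET_UTF_8),
            "#", Version.LUCENE_CURRENT); // lines starting with "#" are comments
      }
    }
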
@@ -90,7 +89,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { * @param matchVersion See above * @param stopwords Reader to load stop words from */ public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java index 0df03a1ed93..ba845ff1609 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.*; @@ -70,9 +71,8 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class, - DEFAULT_STOPWORD_FILE, "#"), false)); + DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java index 65505dca4e9..c94676a5196 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.DanishStemmer; @@ -62,8 +63,8 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 2c69900daad..9abde8c249c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import 
org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.German2Stemmer; @@ -100,8 +101,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase { private static final Set DEFAULT_SET; static { try { - DEFAULT_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java index 025415d9422..7be2b705582 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.SpanishStemmer; @@ -62,8 +63,8 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java index 85a0e595146..caf59278a3f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.FinnishStemmer; @@ -62,8 +63,8 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java index 087f6a104e8..8d0c4a15d43 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java +++ 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.IOException; @@ -118,8 +119,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { static final Set DEFAULT_STOP_SET; static { try { - DEFAULT_STOP_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java index 60dc7c3a6d2..7ce43f1bf15 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java @@ -32,6 +32,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -60,12 +61,12 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) - throw new RuntimeException("Unable to load default stopword set"); + throw new RuntimeException("Unable to load default stopword set", ex); } } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java index be3a8794782..a9270097d17 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.HungarianStemmer; @@ -62,8 +63,8 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default 
set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java index 22790bb3e19..4e9011624fc 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java @@ -35,6 +35,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.ItalianStemmer; @@ -79,8 +80,8 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java index d0ff1e10323..370e706bd5a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java @@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -60,8 +62,8 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java index 3931fa107c2..312242f196a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.WordlistLoader; +import 
org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -83,8 +84,8 @@ public final class DutchAnalyzer extends Analyzer { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java index ecb66f6c8b2..00403f1f720 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.NorwegianStemmer; @@ -62,8 +63,8 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java index 3d2893313ba..853f423d795 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.PortugueseStemmer; @@ -62,8 +63,8 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java index 6ddf665a578..247bdf636e9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java @@ -34,6 +34,7 @@ 
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -84,12 +85,12 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase static { try { - DEFAULT_STOP_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) - throw new RuntimeException("Unable to load default stopword set"); + throw new RuntimeException("Unable to load default stopword set", ex); } } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java index 9c9821d7792..dc3f0a676c9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -85,7 +86,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords File to read stop words from */ public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. 
@@ -94,7 +95,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords Reader to read stop words from */ public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index cf0011d6db2..96b7e8c6e28 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -86,7 +87,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords File to read stop words from */ public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. @@ -95,7 +96,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords Reader to read stop words from */ public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java index 7dd1702cde5..b1f9442b642 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.SwedishStemmer; @@ -62,8 +63,8 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java index c99dc54e092..ba85a499740 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java +++ 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java @@ -17,10 +17,13 @@ package org.apache.lucene.analysis.util; +import java.io.File; import java.io.IOException; +import java.io.Reader; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -93,11 +96,59 @@ public abstract class StopwordAnalyzerBase extends Analyzer { protected static CharArraySet loadStopwordSet(final boolean ignoreCase, final Class aClass, final String resource, final String comment) throws IOException { - final Set wordSet = WordlistLoader.getWordSet(aClass, resource, - comment); - final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase); - set.addAll(wordSet); - return set; + Reader reader = null; + try { + reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8); + return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase)); + } finally { + IOUtils.close(reader); + } + + } + + /** + * Creates a CharArraySet from a file. + * + * @param stopwords + * the stopwords file to load + * + * @param matchVersion + * the Lucene version for cross version compatibility + * @return a CharArraySet containing the distinct stopwords from the given + * file + * @throws IOException + * if loading the stopwords throws an {@link IOException} + */ + protected static CharArraySet loadStopwordSet(File stopwords, + Version matchVersion) throws IOException { + Reader reader = null; + try { + reader = IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8); + return WordlistLoader.getWordSet(reader, matchVersion); + } finally { + IOUtils.close(reader); + } + } + + /** + * Creates a CharArraySet from a file. + * + * @param stopwords + * the stopwords reader to load + * + * @param matchVersion + * the Lucene version for cross version compatibility + * @return a CharArraySet containing the distinct stopwords from the given + * reader + * @throws IOException + * if loading the stopwords throws an {@link IOException} + */ + protected static CharArraySet loadStopwordSet(Reader stopwords, + Version matchVersion) throws IOException { + try { + return WordlistLoader.getWordSet(stopwords, matchVersion); + } finally { + IOUtils.close(stopwords); + } } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java index 78aa03d6c4f..e62b6af06ee 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java @@ -18,165 +18,91 @@ package org.apache.lucene.analysis.util; */ import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; import java.io.IOException; -import java.io.InputStreamReader; import java.io.Reader; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; + +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.Version; /** * Loader for text files that represent a list of stopwords. 
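As a caller-side illustration of the CharArraySet-filling overload used by loadStopwordSet above (not part of the patch; the class name and sample input are made up):

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.WordlistLoader;
    import org.apache.lucene.util.Version;

    // Illustrative only: pass a pre-built CharArraySet to control case sensitivity.
    public final class CaseInsensitiveStopwords {
      static CharArraySet load(Reader reader) throws IOException {
        // getWordSet fills and returns the supplied set, and closes the reader when done.
        return WordlistLoader.getWordSet(reader, "#",
            new CharArraySet(Version.LUCENE_CURRENT, 32, true /* ignoreCase */));
      }

      public static void main(String[] args) throws IOException {
        CharArraySet set = load(new StringReader("ONE\n two \n#comment"));
        System.out.println(set.size());          // 2 -- the comment line is skipped
        System.out.println(set.contains("one")); // true -- the set ignores case
      }
    }
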
+ * + * @see IOUtils to obtain {@link Reader} instances + * @lucene.internal */ public class WordlistLoader { - - /** - * Loads a text file associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds every line as an entry - * to a {@link Set} (omitting leading and trailing whitespace). Every line of - * the file should contain only one word. The words need to be in lower-case if - * you make use of an Analyzer which uses LowerCaseFilter (like - * StandardAnalyzer). - * - * @param aClass - * a class that is associated with the given stopwordResource - * @param stopwordResource - * name of the resource file associated with the given class - * @return a {@link Set} with the file's words - */ - public static Set getWordSet(Class aClass, String stopwordResource) - throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); - try { - return getWordSet(reader); - } finally { - reader.close(); - } - } + + private static final int INITITAL_CAPACITY = 16; /** - * Loads a text file associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds every line as an entry - * to a {@link Set} (omitting leading and trailing whitespace). Every line of - * the file should contain only one word. The words need to be in lower-case if - * you make use of an Analyzer which uses LowerCaseFilter (like - * StandardAnalyzer). - * - * @param aClass - * a class that is associated with the given stopwordResource - * @param stopwordResource - * name of the resource file associated with the given class - * @param comment - * the comment string to ignore - * @return a {@link Set} with the file's words - */ - public static Set getWordSet(Class aClass, - String stopwordResource, String comment) throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); - try { - return getWordSet(reader, comment); - } finally { - reader.close(); - } - } - - /** - * Loads a text file and adds every line as an entry to a HashSet (omitting - * leading and trailing whitespace). Every line of the file should contain only - * one word. The words need to be in lowercase if you make use of an - * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). - * - * @param wordfile File containing the wordlist - * @return A HashSet with the file's words - */ - public static HashSet getWordSet(File wordfile) throws IOException { - FileReader reader = null; - try { - reader = new FileReader(wordfile); - return getWordSet(reader); - } - finally { - if (reader != null) - reader.close(); - } - } - - /** - * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting - * leading and trailing whitespace). Every line of the file should contain only - * one word. The words need to be in lowercase if you make use of an - * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). 
- * - * @param wordfile File containing the wordlist - * @param comment The comment string to ignore - * @return A HashSet with the file's words - */ - public static HashSet getWordSet(File wordfile, String comment) throws IOException { - FileReader reader = null; - try { - reader = new FileReader(wordfile); - return getWordSet(reader, comment); - } - finally { - if (reader != null) - reader.close(); - } - } - - - /** - * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting + * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist - * @return A HashSet with the reader's words + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words */ - public static HashSet getWordSet(Reader reader) throws IOException { - final HashSet result = new HashSet(); + public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException { BufferedReader br = null; try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } + br = getBufferedReader(reader); String word = null; while ((word = br.readLine()) != null) { result.add(word.trim()); } } finally { - if (br != null) - br.close(); + IOUtils.close(br); } return result; } + + /** + * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only + * one word. The words need to be in lowercase if you make use of an + * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * + * @param reader Reader containing the wordlist + * @param matchVersion the Lucene {@link Version} + * @return A {@link CharArraySet} with the reader's words + */ + public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException { + return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); + } /** - * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting + * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist * @param comment The string representing a comment. - * @return A HashSet with the reader's words + * @param matchVersion the Lucene {@link Version} + * @return A CharArraySet with the reader's words */ - public static HashSet getWordSet(Reader reader, String comment) throws IOException { - final HashSet result = new HashSet(); + public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException { + return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); + } + + /** + * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only + * one word. 
The words need to be in lowercase if you make use of an + * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * + * @param reader Reader containing the wordlist + * @param comment The string representing a comment. + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words + */ + public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException { BufferedReader br = null; try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } + br = getBufferedReader(reader); String word = null; while ((word = br.readLine()) != null) { if (word.startsWith(comment) == false){ @@ -185,33 +111,44 @@ public class WordlistLoader { } } finally { - if (br != null) - br.close(); + IOUtils.close(br); } return result; } + /** - * Loads a text file in Snowball format associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds all words as entries to - * a {@link Set}. The words need to be in lower-case if you make use of an - * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * Reads stopwords from a stopword list in Snowball format. + *

+ * The snowball format is the following: + * <ul> + * <li>Lines may contain multiple words separated by whitespace. + * <li>The comment character is the vertical line (|). + * <li>Lines may contain trailing comments. + * </ul>
* - * @param aClass a class that is associated with the given stopwordResource - * @param stopwordResource name of the resource file associated with the given - * class - * @return a {@link Set} with the file's words - * @see #getSnowballWordSet(Reader) + * @param reader Reader containing a Snowball stopword list + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words */ - public static Set getSnowballWordSet(Class aClass, - String stopwordResource) throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); + public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result) + throws IOException { + BufferedReader br = null; try { - return getSnowballWordSet(reader); + br = getBufferedReader(reader); + String line = null; + while ((line = br.readLine()) != null) { + int comment = line.indexOf('|'); + if (comment >= 0) line = line.substring(0, comment); + String words[] = line.split("\\s+"); + for (int i = 0; i < words.length; i++) + if (words[i].length() > 0) result.add(words[i]); + } } finally { - reader.close(); + IOUtils.close(br); } + return result; } /** @@ -226,30 +163,12 @@ public class WordlistLoader { *

* * @param reader Reader containing a Snowball stopword list - * @return A Set with the reader's words + * @param result the {@link CharArraySet} to fill with the readers words + * @param matchVersion the Lucene {@link Version} + * @return A {@link CharArraySet} with the reader's words */ - public static Set getSnowballWordSet(Reader reader) - throws IOException { - final Set result = new HashSet(); - BufferedReader br = null; - try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } - String line = null; - while ((line = br.readLine()) != null) { - int comment = line.indexOf('|'); - if (comment >= 0) line = line.substring(0, comment); - String words[] = line.split("\\s+"); - for (int i = 0; i < words.length; i++) - if (words[i].length() > 0) result.add(words[i]); - } - } finally { - if (br != null) br.close(); - } - return result; + public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException { + return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); } @@ -261,24 +180,24 @@ public class WordlistLoader { * @return stem dictionary that overrules the stemming algorithm * @throws IOException */ - public static HashMap getStemDict(File wordstemfile) throws IOException { - if (wordstemfile == null) - throw new NullPointerException("wordstemfile may not be null"); - final HashMap result = new HashMap(); + public static CharArrayMap getStemDict(Reader reader, CharArrayMap result) throws IOException { BufferedReader br = null; - try { - br = new BufferedReader(new FileReader(wordstemfile)); + br = getBufferedReader(reader); String line; while ((line = br.readLine()) != null) { String[] wordstem = line.split("\t", 2); result.put(wordstem[0], wordstem[1]); } } finally { - if(br != null) - br.close(); + IOUtils.close(br); } return result; } - + + private static BufferedReader getBufferedReader(Reader reader) { + return (reader instanceof BufferedReader) ? 
(BufferedReader) reader + : new BufferedReader(reader); + } + } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java index 8983ead9bf3..9cb07577635 100755 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java @@ -46,7 +46,7 @@ public class TestCharArraySet extends LuceneTestCase { public void testNonZeroOffset() { String[] words={"Hello","World","this","is","a","test"}; char[] findme="xthisy".toCharArray(); - CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true); + CharArraySet set= new CharArraySet(TEST_VERSION_CURRENT, 10, true); set.addAll(Arrays.asList(words)); assertTrue(set.contains(findme, 1, 4)); assertTrue(set.contains(new String(findme,1,4))); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java index 74356c42828..a9634f6d59d 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java @@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util; import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.util.LuceneTestCase; @@ -31,22 +29,22 @@ public class TestWordlistLoader extends LuceneTestCase { public void testWordlistLoading() throws IOException { String s = "ONE\n two \nthree"; - HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s)); + CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT); checkSet(wordSet1); - HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s))); + CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)), TEST_VERSION_CURRENT); checkSet(wordSet2); } public void testComments() throws Exception { String s = "ONE\n two \nthree\n#comment"; - HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#"); + CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#", TEST_VERSION_CURRENT); checkSet(wordSet1); assertFalse(wordSet1.contains("#comment")); assertFalse(wordSet1.contains("comment")); } - private void checkSet(HashSet wordset) { + private void checkSet(CharArraySet wordset) { assertEquals(3, wordset.size()); assertTrue(wordset.contains("ONE")); // case is not modified assertTrue(wordset.contains("two")); // surrounding whitespace is removed @@ -68,7 +66,7 @@ public class TestWordlistLoader extends LuceneTestCase { " two \n" + // stopword with leading/trailing space " three four five \n" + // multiple stopwords "six seven | comment\n"; //multiple stopwords + comment - Set wordset = WordlistLoader.getSnowballWordSet(new StringReader(s)); + CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT); assertEquals(7, wordset.size()); assertTrue(wordset.contains("ONE")); assertTrue(wordset.contains("two")); diff --git a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java index 
f078b6ab1d7..8de8b34d567 100644 --- a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java +++ b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java @@ -26,6 +26,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -66,7 +67,7 @@ public final class SmartChineseAnalyzer extends Analyzer { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -75,7 +76,7 @@ public final class SmartChineseAnalyzer extends Analyzer { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -87,13 +88,14 @@ public final class SmartChineseAnalyzer extends Analyzer { } } - static Set loadDefaultStopWordSet() throws IOException { + static CharArraySet loadDefaultStopWordSet() throws IOException { InputStream stream = SmartChineseAnalyzer.class .getResourceAsStream(DEFAULT_STOPWORD_FILE); try { InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); // make sure it is unmodifiable as we expose it in the outer class - return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT)); + return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader, + STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT)); } finally { stream.close(); } diff --git a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java index 8dc589a6936..59c8fd9889a 100644 --- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java +++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.stempel.StempelFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.egothor.stemmer.Trie; @@ -68,8 +69,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(PolishAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR)
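
Appended for orientation (not part of the patch): a short sketch of the Snowball and stem-dictionary entry points after this cut-over. The demo class and the inline sample strings are placeholders; the CharArrayMap(Version, int, boolean) constructor is assumed to be the existing one from the same analysis util package.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.util.CharArrayMap;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.WordlistLoader;
    import org.apache.lucene.util.Version;

    // Illustrative only; input strings are made up.
    public final class WordlistLoaderDemo {
      public static void main(String[] args) throws IOException {
        // Snowball format: '|' starts a comment, whitespace separates multiple words per line.
        CharArraySet stops = WordlistLoader.getSnowballWordSet(
            new StringReader("one two | comment\nthree\n| comment only\n"), Version.LUCENE_CURRENT);
        System.out.println(stops.size()); // 3

        // Stem dictionary: one "word<TAB>stem" pair per line, filled into the supplied CharArrayMap.
        CharArrayMap<String> stems = WordlistLoader.getStemDict(
            new StringReader("mice\tmouse\nfeet\tfoot\n"),
            new CharArrayMap<String>(Version.LUCENE_CURRENT, 16, false));
        System.out.println(stems.get("mice")); // mouse
      }
    }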