From dc6b4b653396dc77a1e52639bf575078d11ada2f Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 10 Nov 2011 01:21:25 +0000 Subject: [PATCH] LUCENE-2564: Cut over WordListLoader to CharArrayMap/Set and use CharSetDecoder to detect encoding problems early git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200080 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 5 + .../java/org/apache/lucene/util/IOUtils.java | 101 ++++++- .../lucene/analysis/br/BrazilianAnalyzer.java | 6 +- .../lucene/analysis/core/StopAnalyzer.java | 7 +- .../lucene/analysis/cz/CzechAnalyzer.java | 6 +- .../lucene/analysis/da/DanishAnalyzer.java | 5 +- .../lucene/analysis/de/GermanAnalyzer.java | 5 +- .../lucene/analysis/es/SpanishAnalyzer.java | 5 +- .../lucene/analysis/fi/FinnishAnalyzer.java | 5 +- .../lucene/analysis/fr/FrenchAnalyzer.java | 5 +- .../lucene/analysis/gl/GalicianAnalyzer.java | 7 +- .../lucene/analysis/hu/HungarianAnalyzer.java | 5 +- .../lucene/analysis/it/ItalianAnalyzer.java | 5 +- .../lucene/analysis/lv/LatvianAnalyzer.java | 6 +- .../lucene/analysis/nl/DutchAnalyzer.java | 5 +- .../lucene/analysis/no/NorwegianAnalyzer.java | 5 +- .../analysis/pt/PortugueseAnalyzer.java | 5 +- .../lucene/analysis/ru/RussianAnalyzer.java | 7 +- .../analysis/standard/ClassicAnalyzer.java | 5 +- .../analysis/standard/StandardAnalyzer.java | 5 +- .../lucene/analysis/sv/SwedishAnalyzer.java | 5 +- .../analysis/util/StopwordAnalyzerBase.java | 63 ++++- .../lucene/analysis/util/WordlistLoader.java | 261 ++++++------------ .../analysis/util/TestCharArraySet.java | 2 +- .../analysis/util/TestWordlistLoader.java | 12 +- .../cn/smart/SmartChineseAnalyzer.java | 10 +- .../lucene/analysis/pl/PolishAnalyzer.java | 5 +- 27 files changed, 327 insertions(+), 236 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index d0caa19bf15..9d7d9654013 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -99,6 +99,11 @@ Changes in backwards compatibility policy * LUCENE-3558: Moved NRTManager & NRTManagerReopenThread into lucene core o.a.l.search. (Simon Willnauer) + + * LUCENE-2564: WordListLoader is now flaged as @lucene.internal. All methods in + WordListLoader now return CharArraySet/Map and expect Reader instances for + efficiency. Utilities to open Readers from Files, InputStreams or Java + resources were added to IOUtils. (Simon Willnauer, Robert Muir) New Features diff --git a/lucene/src/java/org/apache/lucene/util/IOUtils.java b/lucene/src/java/org/apache/lucene/util/IOUtils.java index 73d9dc6e571..8508c1803bc 100644 --- a/lucene/src/java/org/apache/lucene/util/IOUtils.java +++ b/lucene/src/java/org/apache/lucene/util/IOUtils.java @@ -17,15 +17,35 @@ package org.apache.lucene.util; * limitations under the License. */ +import java.io.BufferedReader; import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.lang.reflect.Method; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; /** This class emulates the new Java 7 "Try-With-Resources" statement. * Remove once Lucene is on Java 7. 
* @lucene.internal */ public final class IOUtils { - + + /** + * UTF-8 charset string + * @see Charset#forName(String) + */ + public static final String UTF_8 = "UTF-8"; + + /** + * UTF-8 {@link Charset} instance to prevent repeated + * {@link Charset#forName(String)} lookups + */ + public static final Charset CHARSET_UTF_8 = Charset.forName("UTF-8"); private IOUtils() {} // no instance /** @@ -220,5 +240,84 @@ public final class IOUtils { } } } + + /** + * Wraps the given {@link InputStream} in a reader using a {@link CharsetDecoder}. + * Unlike Java's defaults, this reader will throw an exception if it detects + * that the input does not match the expected {@link Charset}. + * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * and to detect character set problems early. However, it is not recommended to use them as + * general-purpose readers. + * + * @param stream the stream to wrap in a reader + * @param charSet the expected charset + * @return a wrapping reader + */ + public static Reader getDecodingReader(InputStream stream, Charset charSet) { + final CharsetDecoder charSetDecoder = charSet.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + return new BufferedReader(new InputStreamReader(stream, charSetDecoder)); + } + + /** + * Opens a Reader for the given {@link File} using a {@link CharsetDecoder}. + * Unlike Java's defaults, this reader will throw an exception if it detects + * that the input does not match the expected {@link Charset}. + * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * and to detect character set problems early. However, it is not recommended to use them as + * general-purpose readers. + * @param file the file to open a reader on + * @param charSet the expected charset + * @return a reader to read the given file + */ + public static Reader getDecodingReader(File file, Charset charSet) throws IOException { + FileInputStream stream = null; + boolean success = false; + try { + stream = new FileInputStream(file); + final Reader reader = getDecodingReader(stream, charSet); + success = true; + return reader; + + } finally { + if (!success) { + IOUtils.close(stream); + } + } + } + + /** + * Opens a Reader for the given resource using a {@link CharsetDecoder}. + * Unlike Java's defaults, this reader will throw an exception if it detects + * that the input does not match the expected {@link Charset}. + * <p>
+ * Decoding readers are useful to load configuration files, stopword lists or synonym files + * to detect character set problems. However, its not recommended to use as a common purpose + * reader. + * @param clazz the class used to locate the resource + * @param resource the resource name to load + * @param charSet the expected charset + * @return a reader to read the given file + * + */ + public static Reader getDecodingReader(Class clazz, String resource, Charset charSet) throws IOException { + InputStream stream = null; + boolean success = false; + try { + stream = clazz + .getResourceAsStream(resource); + final Reader reader = getDecodingReader(stream, charSet); + success = true; + return reader; + } finally { + if (!success) { + IOUtils.close(stream); + } + } + } + } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 2ba53153998..23ed34b04f0 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -64,9 +65,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class, - DEFAULT_STOPWORD_FILE, "#"), false)); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java index 75fb8c4c3aa..f83f3a7c38b 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java @@ -57,8 +57,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { "they", "this", "to", "was", "will", "with" ); final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, - stopWords.size(), false); - stopSet.addAll(stopWords); + stopWords, false); ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } @@ -82,7 +81,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { * @param matchVersion See above * @param stopwordsFile File to load stop words from */ public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwordsFile)); + this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. 
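For orientation, the loading pattern the analyzers are cut over to in this patch (IOUtils.getDecodingReader plus the Version-aware WordlistLoader methods) boils down to the sketch below. It is not part of the patch; the class name MyAnalyzerStopwords and the stopwords.txt resource are placeholders.

    import java.io.IOException;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.WordlistLoader;
    import org.apache.lucene.util.IOUtils;
    import org.apache.lucene.util.Version;

    // Illustrative only -- mirrors the analyzer static initializers above.
    public final class MyAnalyzerStopwords {
      static CharArraySet loadDefaultStopSet() throws IOException {
        // getDecodingReader configures the CharsetDecoder to REPORT malformed or
        // unmappable bytes, so a mis-encoded stopword file fails fast here instead
        // of being silently decoded with replacement characters.
        return WordlistLoader.getWordSet(
            IOUtils.getDecodingReader(MyAnalyzerStopwords.class, "stopwords.txt", IOUtils.CHARSET_UTF_8),
            "#", Version.LUCENE_CURRENT); // lines starting with "#" are comments
      }
    }
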
@@ -90,7 +89,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase { * @param matchVersion See above * @param stopwords Reader to load stop words from */ public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java index 0df03a1ed93..ba845ff1609 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.*; @@ -70,9 +71,8 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class, - DEFAULT_STOPWORD_FILE, "#"), false)); + DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java index 65505dca4e9..c94676a5196 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.DanishStemmer; @@ -62,8 +63,8 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 2c69900daad..9abde8c249c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import 
org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.German2Stemmer; @@ -100,8 +101,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase { private static final Set DEFAULT_SET; static { try { - DEFAULT_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java index 025415d9422..7be2b705582 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.SpanishStemmer; @@ -62,8 +63,8 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java index 85a0e595146..caf59278a3f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.FinnishStemmer; @@ -62,8 +63,8 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java index 087f6a104e8..8d0c4a15d43 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java +++ 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.IOException; @@ -118,8 +119,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase { static final Set DEFAULT_STOP_SET; static { try { - DEFAULT_STOP_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java index 60dc7c3a6d2..7ce43f1bf15 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java @@ -32,6 +32,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -60,12 +61,12 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) - throw new RuntimeException("Unable to load default stopword set"); + throw new RuntimeException("Unable to load default stopword set", ex); } } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java index be3a8794782..a9270097d17 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.HungarianStemmer; @@ -62,8 +63,8 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default 
set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java index 22790bb3e19..4e9011624fc 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java @@ -35,6 +35,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.ItalianStemmer; @@ -79,8 +80,8 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java index d0ff1e10323..370e706bd5a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java @@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -60,8 +62,8 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java index 3931fa107c2..312242f196a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.WordlistLoader; +import 
org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -83,8 +84,8 @@ public final class DutchAnalyzer extends Analyzer { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java index ecb66f6c8b2..00403f1f720 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.NorwegianStemmer; @@ -62,8 +63,8 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java index 3d2893313ba..853f423d795 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.PortugueseStemmer; @@ -62,8 +63,8 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java index 6ddf665a578..247bdf636e9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java @@ -34,6 +34,7 @@ 
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -84,12 +85,12 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase static { try { - DEFAULT_STOP_SET = - WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) - throw new RuntimeException("Unable to load default stopword set"); + throw new RuntimeException("Unable to load default stopword set", ex); } } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java index 9c9821d7792..dc3f0a676c9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -85,7 +86,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords File to read stop words from */ public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. 
@@ -94,7 +95,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords Reader to read stop words from */ public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index cf0011d6db2..96b7e8c6e28 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import java.io.File; @@ -86,7 +87,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords File to read stop words from */ public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** Builds an analyzer with the stop words from the given reader. @@ -95,7 +96,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase { * above} * @param stopwords Reader to read stop words from */ public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - this(matchVersion, WordlistLoader.getWordSet(stopwords)); + this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java index 7dd1702cde5..b1f9442b642 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.tartarus.snowball.ext.SwedishStemmer; @@ -62,8 +63,8 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java index c99dc54e092..ba85a499740 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java +++ 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StopwordAnalyzerBase.java @@ -17,10 +17,13 @@ package org.apache.lucene.analysis.util; +import java.io.File; import java.io.IOException; +import java.io.Reader; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; /** @@ -93,11 +96,59 @@ public abstract class StopwordAnalyzerBase extends Analyzer { protected static CharArraySet loadStopwordSet(final boolean ignoreCase, final Class aClass, final String resource, final String comment) throws IOException { - final Set wordSet = WordlistLoader.getWordSet(aClass, resource, - comment); - final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase); - set.addAll(wordSet); - return set; + Reader reader = null; + try { + reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8); + return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase)); + } finally { + IOUtils.close(reader); + } + + } + + /** + * Creates a CharArraySet from a file. + * + * @param stopwords + * the stopwords file to load + * + * @param matchVersion + * the Lucene version for cross version compatibility + * @return a CharArraySet containing the distinct stopwords from the given + * file + * @throws IOException + * if loading the stopwords throws an {@link IOException} + */ + protected static CharArraySet loadStopwordSet(File stopwords, + Version matchVersion) throws IOException { + Reader reader = null; + try { + reader = IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8); + return WordlistLoader.getWordSet(reader, matchVersion); + } finally { + IOUtils.close(reader); + } + } + + /** + * Creates a CharArraySet from a file. + * + * @param stopwords + * the stopwords reader to load + * + * @param matchVersion + * the Lucene version for cross version compatibility + * @return a CharArraySet containing the distinct stopwords from the given + * reader + * @throws IOException + * if loading the stopwords throws an {@link IOException} + */ + protected static CharArraySet loadStopwordSet(Reader stopwords, + Version matchVersion) throws IOException { + try { + return WordlistLoader.getWordSet(stopwords, matchVersion); + } finally { + IOUtils.close(stopwords); + } } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java index 78aa03d6c4f..e62b6af06ee 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/WordlistLoader.java @@ -18,165 +18,91 @@ package org.apache.lucene.analysis.util; */ import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; import java.io.IOException; -import java.io.InputStreamReader; import java.io.Reader; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; + +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.Version; /** * Loader for text files that represent a list of stopwords. 
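As a caller-side illustration of the CharArraySet-filling overload used by loadStopwordSet above (not part of the patch; the class name and sample input are made up):

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.WordlistLoader;
    import org.apache.lucene.util.Version;

    // Illustrative only: pass a pre-built CharArraySet to control case sensitivity.
    public final class CaseInsensitiveStopwords {
      static CharArraySet load(Reader reader) throws IOException {
        // getWordSet fills and returns the supplied set, and closes the reader when done.
        return WordlistLoader.getWordSet(reader, "#",
            new CharArraySet(Version.LUCENE_CURRENT, 32, true /* ignoreCase */));
      }

      public static void main(String[] args) throws IOException {
        CharArraySet set = load(new StringReader("ONE\n two \n#comment"));
        System.out.println(set.size());          // 2 -- the comment line is skipped
        System.out.println(set.contains("one")); // true -- the set ignores case
      }
    }
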
+ * + * @see IOUtils to obtain {@link Reader} instances + * @lucene.internal */ public class WordlistLoader { - - /** - * Loads a text file associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds every line as an entry - * to a {@link Set} (omitting leading and trailing whitespace). Every line of - * the file should contain only one word. The words need to be in lower-case if - * you make use of an Analyzer which uses LowerCaseFilter (like - * StandardAnalyzer). - * - * @param aClass - * a class that is associated with the given stopwordResource - * @param stopwordResource - * name of the resource file associated with the given class - * @return a {@link Set} with the file's words - */ - public static Set getWordSet(Class aClass, String stopwordResource) - throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); - try { - return getWordSet(reader); - } finally { - reader.close(); - } - } + + private static final int INITITAL_CAPACITY = 16; /** - * Loads a text file associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds every line as an entry - * to a {@link Set} (omitting leading and trailing whitespace). Every line of - * the file should contain only one word. The words need to be in lower-case if - * you make use of an Analyzer which uses LowerCaseFilter (like - * StandardAnalyzer). - * - * @param aClass - * a class that is associated with the given stopwordResource - * @param stopwordResource - * name of the resource file associated with the given class - * @param comment - * the comment string to ignore - * @return a {@link Set} with the file's words - */ - public static Set getWordSet(Class aClass, - String stopwordResource, String comment) throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); - try { - return getWordSet(reader, comment); - } finally { - reader.close(); - } - } - - /** - * Loads a text file and adds every line as an entry to a HashSet (omitting - * leading and trailing whitespace). Every line of the file should contain only - * one word. The words need to be in lowercase if you make use of an - * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). - * - * @param wordfile File containing the wordlist - * @return A HashSet with the file's words - */ - public static HashSet getWordSet(File wordfile) throws IOException { - FileReader reader = null; - try { - reader = new FileReader(wordfile); - return getWordSet(reader); - } - finally { - if (reader != null) - reader.close(); - } - } - - /** - * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting - * leading and trailing whitespace). Every line of the file should contain only - * one word. The words need to be in lowercase if you make use of an - * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). 
- * - * @param wordfile File containing the wordlist - * @param comment The comment string to ignore - * @return A HashSet with the file's words - */ - public static HashSet getWordSet(File wordfile, String comment) throws IOException { - FileReader reader = null; - try { - reader = new FileReader(wordfile); - return getWordSet(reader, comment); - } - finally { - if (reader != null) - reader.close(); - } - } - - - /** - * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting + * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist - * @return A HashSet with the reader's words + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words */ - public static HashSet getWordSet(Reader reader) throws IOException { - final HashSet result = new HashSet(); + public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException { BufferedReader br = null; try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } + br = getBufferedReader(reader); String word = null; while ((word = br.readLine()) != null) { result.add(word.trim()); } } finally { - if (br != null) - br.close(); + IOUtils.close(br); } return result; } + + /** + * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only + * one word. The words need to be in lowercase if you make use of an + * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * + * @param reader Reader containing the wordlist + * @param matchVersion the Lucene {@link Version} + * @return A {@link CharArraySet} with the reader's words + */ + public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException { + return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); + } /** - * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting + * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist * @param comment The string representing a comment. - * @return A HashSet with the reader's words + * @param matchVersion the Lucene {@link Version} + * @return A CharArraySet with the reader's words */ - public static HashSet getWordSet(Reader reader, String comment) throws IOException { - final HashSet result = new HashSet(); + public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException { + return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); + } + + /** + * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only + * one word. 
The words need to be in lowercase if you make use of an + * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * + * @param reader Reader containing the wordlist + * @param comment The string representing a comment. + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words + */ + public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException { BufferedReader br = null; try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } + br = getBufferedReader(reader); String word = null; while ((word = br.readLine()) != null) { if (word.startsWith(comment) == false){ @@ -185,33 +111,44 @@ public class WordlistLoader { } } finally { - if (br != null) - br.close(); + IOUtils.close(br); } return result; } + /** - * Loads a text file in Snowball format associated with a given class (See - * {@link Class#getResourceAsStream(String)}) and adds all words as entries to - * a {@link Set}. The words need to be in lower-case if you make use of an - * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). + * Reads stopwords from a stopword list in Snowball format. + *

+ * The snowball format is the following: + * <ul> + * <li>Lines may contain multiple words separated by whitespace. + * <li>The comment character is the vertical line (|). + * <li>Lines may contain trailing comments. + * </ul>
* - * @param aClass a class that is associated with the given stopwordResource - * @param stopwordResource name of the resource file associated with the given - * class - * @return a {@link Set} with the file's words - * @see #getSnowballWordSet(Reader) + * @param reader Reader containing a Snowball stopword list + * @param result the {@link CharArraySet} to fill with the readers words + * @return the given {@link CharArraySet} with the reader's words */ - public static Set getSnowballWordSet(Class aClass, - String stopwordResource) throws IOException { - final Reader reader = new BufferedReader(new InputStreamReader(aClass - .getResourceAsStream(stopwordResource), "UTF-8")); + public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result) + throws IOException { + BufferedReader br = null; try { - return getSnowballWordSet(reader); + br = getBufferedReader(reader); + String line = null; + while ((line = br.readLine()) != null) { + int comment = line.indexOf('|'); + if (comment >= 0) line = line.substring(0, comment); + String words[] = line.split("\\s+"); + for (int i = 0; i < words.length; i++) + if (words[i].length() > 0) result.add(words[i]); + } } finally { - reader.close(); + IOUtils.close(br); } + return result; } /** @@ -226,30 +163,12 @@ public class WordlistLoader { *

* * @param reader Reader containing a Snowball stopword list - * @return A Set with the reader's words + * @param result the {@link CharArraySet} to fill with the readers words + * @param matchVersion the Lucene {@link Version} + * @return A {@link CharArraySet} with the reader's words */ - public static Set getSnowballWordSet(Reader reader) - throws IOException { - final Set result = new HashSet(); - BufferedReader br = null; - try { - if (reader instanceof BufferedReader) { - br = (BufferedReader) reader; - } else { - br = new BufferedReader(reader); - } - String line = null; - while ((line = br.readLine()) != null) { - int comment = line.indexOf('|'); - if (comment >= 0) line = line.substring(0, comment); - String words[] = line.split("\\s+"); - for (int i = 0; i < words.length; i++) - if (words[i].length() > 0) result.add(words[i]); - } - } finally { - if (br != null) br.close(); - } - return result; + public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException { + return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false)); } @@ -261,24 +180,24 @@ public class WordlistLoader { * @return stem dictionary that overrules the stemming algorithm * @throws IOException */ - public static HashMap getStemDict(File wordstemfile) throws IOException { - if (wordstemfile == null) - throw new NullPointerException("wordstemfile may not be null"); - final HashMap result = new HashMap(); + public static CharArrayMap getStemDict(Reader reader, CharArrayMap result) throws IOException { BufferedReader br = null; - try { - br = new BufferedReader(new FileReader(wordstemfile)); + br = getBufferedReader(reader); String line; while ((line = br.readLine()) != null) { String[] wordstem = line.split("\t", 2); result.put(wordstem[0], wordstem[1]); } } finally { - if(br != null) - br.close(); + IOUtils.close(br); } return result; } - + + private static BufferedReader getBufferedReader(Reader reader) { + return (reader instanceof BufferedReader) ? 
(BufferedReader) reader + : new BufferedReader(reader); + } + } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java index 8983ead9bf3..9cb07577635 100755 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArraySet.java @@ -46,7 +46,7 @@ public class TestCharArraySet extends LuceneTestCase { public void testNonZeroOffset() { String[] words={"Hello","World","this","is","a","test"}; char[] findme="xthisy".toCharArray(); - CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true); + CharArraySet set= new CharArraySet(TEST_VERSION_CURRENT, 10, true); set.addAll(Arrays.asList(words)); assertTrue(set.contains(findme, 1, 4)); assertTrue(set.contains(new String(findme,1,4))); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java index 74356c42828..a9634f6d59d 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestWordlistLoader.java @@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util; import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; -import java.util.HashSet; -import java.util.Set; import org.apache.lucene.util.LuceneTestCase; @@ -31,22 +29,22 @@ public class TestWordlistLoader extends LuceneTestCase { public void testWordlistLoading() throws IOException { String s = "ONE\n two \nthree"; - HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s)); + CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT); checkSet(wordSet1); - HashSet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s))); + CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)), TEST_VERSION_CURRENT); checkSet(wordSet2); } public void testComments() throws Exception { String s = "ONE\n two \nthree\n#comment"; - HashSet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#"); + CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#", TEST_VERSION_CURRENT); checkSet(wordSet1); assertFalse(wordSet1.contains("#comment")); assertFalse(wordSet1.contains("comment")); } - private void checkSet(HashSet wordset) { + private void checkSet(CharArraySet wordset) { assertEquals(3, wordset.size()); assertTrue(wordset.contains("ONE")); // case is not modified assertTrue(wordset.contains("two")); // surrounding whitespace is removed @@ -68,7 +66,7 @@ public class TestWordlistLoader extends LuceneTestCase { " two \n" + // stopword with leading/trailing space " three four five \n" + // multiple stopwords "six seven | comment\n"; //multiple stopwords + comment - Set wordset = WordlistLoader.getSnowballWordSet(new StringReader(s)); + CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT); assertEquals(7, wordset.size()); assertTrue(wordset.contains("ONE")); assertTrue(wordset.contains("two")); diff --git a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java index 
f078b6ab1d7..8de8b34d567 100644 --- a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java +++ b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java @@ -26,6 +26,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -66,7 +67,7 @@ public final class SmartChineseAnalyzer extends Analyzer { * Returns an unmodifiable instance of the default stop-words set. * @return an unmodifiable instance of the default stop-words set. */ - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -75,7 +76,7 @@ public final class SmartChineseAnalyzer extends Analyzer { * accesses the static final set the first time.; */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static { try { @@ -87,13 +88,14 @@ public final class SmartChineseAnalyzer extends Analyzer { } } - static Set loadDefaultStopWordSet() throws IOException { + static CharArraySet loadDefaultStopWordSet() throws IOException { InputStream stream = SmartChineseAnalyzer.class .getResourceAsStream(DEFAULT_STOPWORD_FILE); try { InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); // make sure it is unmodifiable as we expose it in the outer class - return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT)); + return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader, + STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT)); } finally { stream.close(); } diff --git a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java index 8dc589a6936..59c8fd9889a 100644 --- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java +++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java @@ -34,6 +34,7 @@ import org.apache.lucene.analysis.stempel.StempelFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.Version; import org.egothor.stemmer.Trie; @@ -68,8 +69,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { static { try { - DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class, - DEFAULT_STOPWORD_FILE); + DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(PolishAnalyzer.class, + DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR)
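
Appended for orientation (not part of the patch): a short sketch of the Snowball and stem-dictionary entry points after this cut-over. The demo class and the inline sample strings are placeholders; the CharArrayMap(Version, int, boolean) constructor is assumed to be the existing one from the same analysis util package.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.util.CharArrayMap;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.WordlistLoader;
    import org.apache.lucene.util.Version;

    // Illustrative only; input strings are made up.
    public final class WordlistLoaderDemo {
      public static void main(String[] args) throws IOException {
        // Snowball format: '|' starts a comment, whitespace separates multiple words per line.
        CharArraySet stops = WordlistLoader.getSnowballWordSet(
            new StringReader("one two | comment\nthree\n| comment only\n"), Version.LUCENE_CURRENT);
        System.out.println(stops.size()); // 3

        // Stem dictionary: one "word<TAB>stem" pair per line, filled into the supplied CharArrayMap.
        CharArrayMap<String> stems = WordlistLoader.getStemDict(
            new StringReader("mice\tmouse\nfeet\tfoot\n"),
            new CharArrayMap<String>(Version.LUCENE_CURRENT, 16, false));
        System.out.println(stems.get("mice")); // mouse
      }
    }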