LUCENE-1688: Deprecate static final String stop word array in and StopAnalzyer and replace it with an immutable implementation of CharArraySet.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@794078 13f79535-47bb-0310-9956-ffa450edef68
2009-07-14 21:39:22 +00:00 · 2009-07-14 21:39:22 +00:00 · ea7e4ad344
parent 6bf4d35ce8
commit ea7e4ad344
15 changed files with 340 additions and 97 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -309,6 +309,11 @@ API Changes
    all synchronization in TermInfosReader, which previously could
    cause threads to pile up in certain cases. (Dan Rosher via Mike
    McCandless)
+    
+30. LUCENE-1688: Deprecate static final String stop word array in and 
+    StopAnalzyer and replace it with an immutable implementation of 
+    CharArraySet.  (Simon Willnauer via Mark Miller)
+

 Bug fixes

@ -604,6 +609,11 @@ Optimizations
 9. LUCENE-1653: Avoid creating a Calendar in every call to 
    DateTools#dateToString, DateTools#timeToString and
    DateTools#round.  (Shai Erera via Mark Miller)
+    
+10. LUCENE-1688: Deprecate static final String stop word array and 
+    replace it with an immutable implementation of CharArraySet.
+    Removes conversions between Set and array.
+    (Simon Willnauer via Mark Miller)

 Documentation

--- a/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
@ -33,7 +33,7 @@ public class ThaiAnalyzer extends Analyzer {
 	  TokenStream ts = new StandardTokenizer(reader);
    ts = new StandardFilter(ts);
    ts = new ThaiWordFilter(ts);
-    ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS);
+    ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return ts;
  }
 }
--- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@ -23,9 +23,11 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.StringTokenizer;

 import javax.xml.parsers.DocumentBuilder;
@ -982,7 +984,8 @@ public class HighlighterTest extends TestCase implements Formatter {

      public void run() throws Exception {
        String goodWord = "goodtoken";
-        String stopWords[] = { "stoppedtoken" };
+        Set stopWords = new HashSet(1);
+        stopWords.add("stoppedtoken");

        TermQuery query = new TermQuery(new Term("data", goodWord));

@ -991,7 +994,8 @@ public class HighlighterTest extends TestCase implements Formatter {
        sb.append(goodWord);
        for (int i = 0; i < 10000; i++) {
          sb.append(" ");
-          sb.append(stopWords[0]);
+          // only one stopword
+          sb.append(stopWords.iterator().next());
        }
        SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
        Highlighter hg = getHighlighter(query, "data", new StandardAnalyzer(stopWords).tokenStream(
@ -1024,7 +1028,9 @@ public class HighlighterTest extends TestCase implements Formatter {
  public void testMaxSizeEndHighlight() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {
      public void run() throws Exception {
-        String stopWords[] = { "in", "it" };
+        Set stopWords = new HashSet();
+        stopWords.add("in");
+        stopWords.add("it");
        TermQuery query = new TermQuery(new Term("text", "searchterm"));

        String text = "this is a text with searchterm in it";
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
@ -70,55 +70,60 @@ public class PatternAnalyzer extends Analyzer {
  /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  
-  private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
-    "a", "about", "above", "across", "adj", "after", "afterwards",
-    "again", "against", "albeit", "all", "almost", "alone", "along",
-    "already", "also", "although", "always", "among", "amongst", "an",
-    "and", "another", "any", "anyhow", "anyone", "anything",
-    "anywhere", "are", "around", "as", "at", "be", "became", "because",
-    "become", "becomes", "becoming", "been", "before", "beforehand",
-    "behind", "being", "below", "beside", "besides", "between",
-    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
-    "down", "during", "each", "eg", "either", "else", "elsewhere",
-    "enough", "etc", "even", "ever", "every", "everyone", "everything",
-    "everywhere", "except", "few", "first", "for", "former",
-    "formerly", "from", "further", "had", "has", "have", "he", "hence",
-    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
-    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
-    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
-    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
-    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
-    "must", "my", "myself", "namely", "neither", "never",
-    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
-    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
-    "once one", "only", "onto", "or", "other", "others", "otherwise",
-    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
-    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
-    "several", "she", "should", "since", "so", "some", "somehow",
-    "someone", "something", "sometime", "sometimes", "somewhere",
-    "still", "such", "t", "than", "that", "the", "their", "them",
-    "themselves", "then", "thence", "there", "thereafter", "thereby",
-    "therefor", "therein", "thereupon", "these", "they", "this",
-    "those", "though", "through", "throughout", "thru", "thus", "to",
-    "together", "too", "toward", "towards", "under", "until", "up",
-    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
-    "whatever", "whatsoever", "when", "whence", "whenever",
-    "whensoever", "where", "whereafter", "whereas", "whereat",
-    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
-    "whereon", "whereto", "whereunto", "whereupon", "wherever",
-    "wherewith", "whether", "which", "whichever", "whichsoever",
-    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
-    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
-    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
-    "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
-    "yourselves"});
+  private static final Set EXTENDED_ENGLISH_STOP_WORDS;
+  static {
+    EXTENDED_ENGLISH_STOP_WORDS = new HashSet();
+  
+    EXTENDED_ENGLISH_STOP_WORDS.addAll(Arrays.asList(new String[] {
+      "a", "about", "above", "across", "adj", "after", "afterwards",
+      "again", "against", "albeit", "all", "almost", "alone", "along",
+      "already", "also", "although", "always", "among", "amongst", "an",
+      "and", "another", "any", "anyhow", "anyone", "anything",
+      "anywhere", "are", "around", "as", "at", "be", "became", "because",
+      "become", "becomes", "becoming", "been", "before", "beforehand",
+      "behind", "being", "below", "beside", "besides", "between",
+      "beyond", "both", "but", "by", "can", "cannot", "co", "could",
+      "down", "during", "each", "eg", "either", "else", "elsewhere",
+      "enough", "etc", "even", "ever", "every", "everyone", "everything",
+      "everywhere", "except", "few", "first", "for", "former",
+      "formerly", "from", "further", "had", "has", "have", "he", "hence",
+      "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+      "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
+      "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
+      "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
+      "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
+      "must", "my", "myself", "namely", "neither", "never",
+      "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
+      "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+      "once one", "only", "onto", "or", "other", "others", "otherwise",
+      "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
+      "rather", "s", "same", "seem", "seemed", "seeming", "seems",
+      "several", "she", "should", "since", "so", "some", "somehow",
+      "someone", "something", "sometime", "sometimes", "somewhere",
+      "still", "such", "t", "than", "that", "the", "their", "them",
+      "themselves", "then", "thence", "there", "thereafter", "thereby",
+      "therefor", "therein", "thereupon", "these", "they", "this",
+      "those", "though", "through", "throughout", "thru", "thus", "to",
+      "together", "too", "toward", "towards", "under", "until", "up",
+      "upon", "us", "very", "via", "was", "we", "well", "were", "what",
+      "whatever", "whatsoever", "when", "whence", "whenever",
+      "whensoever", "where", "whereafter", "whereas", "whereat",
+      "whereby", "wherefrom", "wherein", "whereinto", "whereof",
+      "whereon", "whereto", "whereunto", "whereupon", "wherever",
+      "wherewith", "whether", "which", "whichever", "whichsoever",
+      "while", "whilst", "whither", "who", "whoever", "whole", "whom",
+      "whomever", "whomsoever", "whose", "whosoever", "why", "will",
+      "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
+      "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
+      "yourselves"}));
+  }
    
  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
-    NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
+    NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    
  /**
   * A lower-casing word analyzer with <b>extended </b> English stop words
@ -191,7 +196,7 @@ public class PatternAnalyzer extends Analyzer {
    }
    else {
      stream = new PatternTokenizer(text, pattern, toLowerCase);
-      if (stopWords != null) stream = new StopFilter(stream, stopWords);
+      if (stopWords != null) stream = new StopFilter(false, stream, stopWords);
    }
    
    return stream;
@ -304,9 +309,9 @@ public class PatternAnalyzer extends Analyzer {
  }
    
  /** somewhat oversized to minimize hash collisions */
-  private static Set makeStopSet(String[] stopWords) {
-    Set stops = new HashSet(stopWords.length * 2, 0.3f); 
-    stops.addAll(Arrays.asList(stopWords));
+  private static Set makeStopSet(Set stopWords) {
+    Set stops = new HashSet(stopWords.size() * 2, 0.3f); 
+    stops.addAll(stopWords);
    return stops;
 //    return Collections.unmodifiableSet(stops);
  }
--- a/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
+++ b/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
@ -271,7 +271,7 @@ public class MemoryIndexTest extends TestCase {
    boolean toLowerCase = true;
 //    boolean toLowerCase = false;
 //    Set stopWords = null;
-    Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+    Set stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    
    Analyzer[] analyzers = new Analyzer[] { 
        new SimpleAnalyzer(),
--- a/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java
+++ b/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java
@ -135,7 +135,7 @@ public class PatternAnalyzerTest extends TestCase {
          
          for (int stops=0; stops < maxStops; stops++) {
            Set stopWords = null;
-            if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+            if (stops != 0) stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
                
            for (int toLower=0; toLower < maxToLower; toLower++) {
              boolean toLowerCase = toLower != 0;
--- a/src/java/org/apache/lucene/analysis/CharArraySet.java
+++ b/src/java/org/apache/lucene/analysis/CharArraySet.java
@ -2,6 +2,7 @@ package org.apache.lucene.analysis;

 import java.util.AbstractSet;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Iterator;

 /**
@ -53,6 +54,12 @@ public class CharArraySet extends AbstractSet {
    this(c.size(), ignoreCase);
    addAll(c);
  }
+  /** Create set from entries */
+  private CharArraySet(char[][] entries, boolean ignoreCase, int count){
+    this.entries = entries;
+    this.ignoreCase = ignoreCase;
+    this.count = count;
+  }

  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
   * are in the set */
@ -100,7 +107,7 @@ public class CharArraySet extends AbstractSet {
  public boolean add(CharSequence text) {
    return add(text.toString()); // could be more efficient
  }
-
+  
  /** Add this String into the set */
  public boolean add(String text) {
    return add(text.toCharArray());
@ -228,6 +235,26 @@ public class CharArraySet extends AbstractSet {
    }
    return add(o.toString());
  }
+  
+  /**
+   * Returns an unmodifiable {@link CharArraySet}. This allows to provide
+   * unmodifiable views of internal sets for "read-only" use.
+   * 
+   * @param set
+   *          a set for which the unmodifiable set is returned.
+   * @return an new unmodifiable {@link CharArraySet}.
+   * @throws NullPointerException
+   *           if the given set is <code>null</code>.
+   */
+  public static CharArraySet unmodifiableSet(CharArraySet set) {
+    if (set == null)
+      throw new NullPointerException("Given set is null");
+    /*
+     * Instead of delegating calls to the given set copy the low-level values to
+     * the unmodifiable Subclass
+     */
+    return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
+  }

  /** The Iterator<String> for this set.  Strings are constructed on the fly, so
   * use <code>nextCharArray</code> for more efficient access. */
@ -270,5 +297,40 @@ public class CharArraySet extends AbstractSet {
  public Iterator iterator() {
    return new CharArraySetIterator();
  }
+  
+  /**
+   * Efficient unmodifiable {@link CharArraySet}. This implementation does not
+   * delegate calls to a give {@link CharArraySet} like
+   * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead is passes
+   * the internal representation of a {@link CharArraySet} to a super
+   * constructor and overrides all mutators. 
+   */
+  private static final class UnmodifiableCharArraySet extends CharArraySet {
+
+    private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
+        int count) {
+      super(entries, ignoreCase, count);
+    }
+
+    public boolean add(Object o){
+      throw new UnsupportedOperationException();
+    }
+    
+    public boolean addAll(Collection coll) {
+      throw new UnsupportedOperationException();
+    }
+    
+    public boolean add(char[] text) {
+      throw new UnsupportedOperationException();
+    }
+
+    public boolean add(CharSequence text) {
+      throw new UnsupportedOperationException();
+    }
+
+    public boolean add(String text) {
+      throw new UnsupportedOperationException();
+    }
+  }

 }
--- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
@ -20,18 +20,20 @@ package org.apache.lucene.analysis;
 import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Arrays;
 import java.util.Set;

 /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */

 public final class StopAnalyzer extends Analyzer {
-  private Set stopWords;
+  private final Set/*<String>*/ stopWords;
  // @deprecated
-  private boolean useDefaultStopPositionIncrement;
-  private boolean enablePositionIncrements;
+  private final boolean useDefaultStopPositionIncrement;
+  private final boolean enablePositionIncrements;

  /** An array containing some common English words that are not usually useful
-    for searching. */
+    for searching. 
+    @deprecated Use {@link #ENGLISH_STOP_WORDS_SET} instead */
  public static final String[] ENGLISH_STOP_WORDS = {
    "a", "an", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
@ -39,13 +41,31 @@ public final class StopAnalyzer extends Analyzer {
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };
-
+  
+  /** An unmodifiable set containing some common English words that are not usually useful
+  for searching.*/
+  public static final Set/*<String>*/ ENGLISH_STOP_WORDS_SET;
+  
+  static {
+	  final String[] stopWords = new String[]{
+  	    "a", "an", "and", "are", "as", "at", "be", "but", "by",
+	    "for", "if", "in", "into", "is", "it",
+	    "no", "not", "of", "on", "or", "such",
+	    "that", "the", "their", "then", "there", "these",
+	    "they", "this", "to", "was", "will", "with"
+	  };
+	  final CharArraySet stopSet = new CharArraySet(stopWords.length, false);
+    stopSet.addAll(Arrays.asList(stopWords));  
+	  ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); 
+  }
+  
  /** Builds an analyzer which removes words in
   * ENGLISH_STOP_WORDS.
   * @deprecated Use {@link #StopAnalyzer(boolean)} instead */
  public StopAnalyzer() {
-    stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
+    stopWords = ENGLISH_STOP_WORDS_SET;
    useDefaultStopPositionIncrement = true;
+    enablePositionIncrements = false;
  }

  /** Builds an analyzer which removes words in
@ -53,8 +73,9 @@ public final class StopAnalyzer extends Analyzer {
   * @param enablePositionIncrements See {@link
   * StopFilter#setEnablePositionIncrements} */
  public StopAnalyzer(boolean enablePositionIncrements) {
-    stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
+    stopWords = ENGLISH_STOP_WORDS_SET;
    this.enablePositionIncrements = enablePositionIncrements;
+    useDefaultStopPositionIncrement = false;
  }

  /** Builds an analyzer with the stop words from the given set.
@ -62,6 +83,7 @@ public final class StopAnalyzer extends Analyzer {
  public StopAnalyzer(Set stopWords) {
    this.stopWords = stopWords;
    useDefaultStopPositionIncrement = true;
+    enablePositionIncrements = false;
  }

  /** Builds an analyzer with the stop words from the given set.
@ -71,22 +93,26 @@ public final class StopAnalyzer extends Analyzer {
  public StopAnalyzer(Set stopWords, boolean enablePositionIncrements) {
    this.stopWords = stopWords;
    this.enablePositionIncrements = enablePositionIncrements;
+    useDefaultStopPositionIncrement = false;
  }

  /** Builds an analyzer which removes words in the provided array.
-   * @deprecated Use {@link #StopAnalyzer(String[], boolean)} instead */
+   * @deprecated Use {@link #StopAnalyzer(Set, boolean)} instead */
  public StopAnalyzer(String[] stopWords) {
    this.stopWords = StopFilter.makeStopSet(stopWords);
    useDefaultStopPositionIncrement = true;
+    enablePositionIncrements = false;
  }
  
  /** Builds an analyzer which removes words in the provided array.
   * @param stopWords Array of stop words
   * @param enablePositionIncrements See {@link
-   * StopFilter#setEnablePositionIncrements} */
+   * StopFilter#setEnablePositionIncrements} 
+   * @deprecated Use {@link #StopAnalyzer(Set, boolean) instead*/
  public StopAnalyzer(String[] stopWords, boolean enablePositionIncrements) {
    this.stopWords = StopFilter.makeStopSet(stopWords);
    this.enablePositionIncrements = enablePositionIncrements;
+    useDefaultStopPositionIncrement = false;
  }
  
  /** Builds an analyzer with the stop words from the given file.
@ -95,6 +121,7 @@ public final class StopAnalyzer extends Analyzer {
  public StopAnalyzer(File stopwordsFile) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwordsFile);
    useDefaultStopPositionIncrement = true;
+    enablePositionIncrements = false;
  }

  /** Builds an analyzer with the stop words from the given file.
@ -105,6 +132,7 @@ public final class StopAnalyzer extends Analyzer {
  public StopAnalyzer(File stopwordsFile, boolean enablePositionIncrements) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwordsFile);
    this.enablePositionIncrements = enablePositionIncrements;
+    useDefaultStopPositionIncrement = false;
  }

  /** Builds an analyzer with the stop words from the given reader.
@ -114,6 +142,7 @@ public final class StopAnalyzer extends Analyzer {
  public StopAnalyzer(Reader stopwords) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwords);
    useDefaultStopPositionIncrement = true;
+    enablePositionIncrements = false;
  }

  /** Builds an analyzer with the stop words from the given reader.
@ -124,6 +153,7 @@ public final class StopAnalyzer extends Analyzer {
  public StopAnalyzer(Reader stopwords, boolean enablePositionIncrements) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwords);
    this.enablePositionIncrements = enablePositionIncrements;
+    useDefaultStopPositionIncrement = false;
  }

  /** Filters LowerCaseTokenizer with StopFilter. */
--- a/src/java/org/apache/lucene/analysis/StopFilter.java
+++ b/src/java/org/apache/lucene/analysis/StopFilter.java
@ -55,6 +55,7 @@ public final class StopFilter extends TokenFilter {
   * @param enablePositionIncrements true if token positions should record the removed stop words
   * @param input input TokenStream
   * @param stopWords array of stop words
+   * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set)} instead.
   */
  public StopFilter(boolean enablePositionIncrements, TokenStream input, String [] stopWords)
  {
@ -77,6 +78,7 @@ public final class StopFilter extends TokenFilter {
   * @param in input TokenStream
   * @param stopWords array of stop words
   * @param ignoreCase true if case is ignored
+   * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set, boolean)} instead.
   */
  public StopFilter(boolean enablePositionIncrements, TokenStream in, String[] stopWords, boolean ignoreCase) {
    super(in);
--- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@ -101,15 +101,19 @@ public class StandardAnalyzer extends Analyzer {


  /** An array containing some common English words that are usually not
-  useful for searching. */
+  useful for searching. 
+  @deprecated Use {@link #STOP_WORDS_SET} instead */
  public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
+  
+  /** An unmodifiable set containing some common English words that are usually not
+  useful for searching. */
+  public static final Set/*<String>*/ STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; 

  /** Builds an analyzer with the default stop words ({@link
-   * #STOP_WORDS}).
-   * @deprecated Use {@link #StandardAnalyzer(Version)},
-   * instead. */
+   * #STOP_WORDS_SET}).
+   * @deprecated Use {@link #StandardAnalyzer(Version)} instead. */
  public StandardAnalyzer() {
-    this(Version.LUCENE_24, STOP_WORDS);
+    this(Version.LUCENE_24, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the default stop words ({@link
@ -118,7 +122,7 @@ public class StandardAnalyzer extends Analyzer {
   * <a href="#version">above</a>}
   */
  public StandardAnalyzer(Version matchVersion) {
-    this(matchVersion, STOP_WORDS);
+    this(matchVersion, STOP_WORDS_SET);
  }

  /** Builds an analyzer with the given stop words.
@ -138,22 +142,9 @@ public class StandardAnalyzer extends Analyzer {
  }

  /** Builds an analyzer with the given stop words.
-   * @deprecated Use {@link #StandardAnalyzer(Version,
-   * String[])} instead */
+   * @deprecated Use {@link #StandardAnalyzer(Version, Set)} instead */
  public StandardAnalyzer(String[] stopWords) {
-    this(Version.LUCENE_24, stopWords);
-  }
-
-  /** Builds an analyzer with the given stop words.
-   * @param matchVersion Lucene version to match See {@link
-   * <a href="#version">above</a>}
-   * @param stopWords Array of stop words */
-  public StandardAnalyzer(Version matchVersion, String[] stopWords) {
-    if (stopWords == null) {
-      stopWords = STOP_WORDS;
-    }
-    stopSet = StopFilter.makeStopSet(stopWords);
-    init(matchVersion);
+    this(Version.LUCENE_24, StopFilter.makeStopSet(stopWords));
  }

  /** Builds an analyzer with the stop words from the given file.
@ -203,8 +194,9 @@ public class StandardAnalyzer extends Analyzer {
   * @deprecated Remove in 3.X and make true the only valid value
   */
  public StandardAnalyzer(boolean replaceInvalidAcronym) {
-    this(Version.LUCENE_24, STOP_WORDS);
+    this(Version.LUCENE_24, STOP_WORDS_SET);
    this.replaceInvalidAcronym = replaceInvalidAcronym;
+    useDefaultStopPositionIncrements = true;
  }

  /**
@ -243,7 +235,7 @@ public class StandardAnalyzer extends Analyzer {
   * @deprecated Remove in 3.X and make true the only valid value
   */
  public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
-    this(Version.LUCENE_24, stopwords);
+    this(Version.LUCENE_24, StopFilter.makeStopSet(stopwords));
    this.replaceInvalidAcronym = replaceInvalidAcronym;
  }

--- a/src/test/org/apache/lucene/analysis/TestCharArraySet.java
+++ b/src/test/org/apache/lucene/analysis/TestCharArraySet.java
@ -23,13 +23,22 @@ import org.apache.lucene.util.LuceneTestCase;

 public class TestCharArraySet extends LuceneTestCase {
  
+  static final String[] TEST_STOP_WORDS = {
+    "a", "an", "and", "are", "as", "at", "be", "but", "by",
+    "for", "if", "in", "into", "is", "it",
+    "no", "not", "of", "on", "or", "such",
+    "that", "the", "their", "then", "there", "these",
+    "they", "this", "to", "was", "will", "with"
+  };
+  
+  
  public void testRehash() throws Exception {
    CharArraySet cas = new CharArraySet(0, true);
-    for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
-      cas.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
-    assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS.length, cas.size());
-    for(int i=0;i<StopAnalyzer.ENGLISH_STOP_WORDS.length;i++)
-      assertTrue(cas.contains(StopAnalyzer.ENGLISH_STOP_WORDS[i]));
+    for(int i=0;i<TEST_STOP_WORDS.length;i++)
+      cas.add(TEST_STOP_WORDS[i]);
+    assertEquals(TEST_STOP_WORDS.length, cas.size());
+    for(int i=0;i<TEST_STOP_WORDS.length;i++)
+      assertTrue(cas.contains(TEST_STOP_WORDS[i]));
  }

  public void testNonZeroOffset() {
@ -39,6 +48,11 @@ public class TestCharArraySet extends LuceneTestCase {
    set.addAll(Arrays.asList(words));
    assertTrue(set.contains(findme, 1, 4));
    assertTrue(set.contains(new String(findme,1,4)));
+    
+    // test unmodifiable
+    set = CharArraySet.unmodifiableSet(set);
+    assertTrue(set.contains(findme, 1, 4));
+    assertTrue(set.contains(new String(findme,1,4)));
  }
  
  public void testObjectContains() {
@ -47,5 +61,118 @@ public class TestCharArraySet extends LuceneTestCase {
    set.add(val);
    assertTrue(set.contains(val));
    assertTrue(set.contains(new Integer(1)));
+    // test unmodifiable
+    set = CharArraySet.unmodifiableSet(set);
+    assertTrue(set.contains(val));
+    assertTrue(set.contains(new Integer(1)));
+  }
+  
+  public void testClear(){
+    CharArraySet set=new CharArraySet(10,true);
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+    assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+    try{
+      set.clear();
+      fail("remove is not supported");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
+    }
+  }
+  
+  public void testModifyOnUnmodifiable(){
+    CharArraySet set=new CharArraySet(10,true);
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+    final int size = set.size();
+    set = CharArraySet.unmodifiableSet(set);
+    assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+    String NOT_IN_SET = "SirGallahad";
+    assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
+    
+    try{
+      set.add(NOT_IN_SET.toCharArray());  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.add(NOT_IN_SET);  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.add(new StringBuffer(NOT_IN_SET));  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.clear();  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    try{
+      set.add((Object) NOT_IN_SET);  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    try{
+      set.removeAll(Arrays.asList(TEST_STOP_WORDS));  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.retainAll(Arrays.asList(new String[]{NOT_IN_SET}));  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertEquals("Size of unmodifiable set has changed", size, set.size());
+    }
+    
+    try{
+      set.addAll(Arrays.asList(new String[]{NOT_IN_SET}));  
+      fail("Modified unmodifiable set");
+    }catch (UnsupportedOperationException e) {
+      // expected
+      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
+    }
+    
+    for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
+      assertTrue(set.contains(TEST_STOP_WORDS[i]));  
+    }
+  }
+  
+  public void testUnmodifiableSet(){
+    CharArraySet set=new CharArraySet(10,true);
+    set.addAll(Arrays.asList(TEST_STOP_WORDS));
+    final int size = set.size();
+    set = CharArraySet.unmodifiableSet(set);
+    assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
+    
+    try{
+      CharArraySet.unmodifiableSet(null);
+      fail("can not make null unmodifiable");
+    }catch (NullPointerException e) {
+      // expected
+    }
  }
 }
--- a/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
+++ b/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
@ -23,12 +23,13 @@ import org.apache.lucene.util.LuceneTestCase;

 import java.io.StringReader;
 import java.io.IOException;
+import java.util.Iterator;
 import java.util.Set;
 import java.util.HashSet;

 public class TestStopAnalyzer extends LuceneTestCase {
  
-  private StopAnalyzer stop = new StopAnalyzer();
+  private StopAnalyzer stop = new StopAnalyzer(false);
  private Set inValidTokens = new HashSet();
  
  public TestStopAnalyzer(String s) {
@ -37,8 +38,10 @@ public class TestStopAnalyzer extends LuceneTestCase {

  protected void setUp() throws Exception {
    super.setUp();
-    for (int i = 0; i < StopAnalyzer.ENGLISH_STOP_WORDS.length; i++) {
-      inValidTokens.add(StopAnalyzer.ENGLISH_STOP_WORDS[i]);
+    
+    Iterator it = StopAnalyzer.ENGLISH_STOP_WORDS_SET.iterator();
+    while(it.hasNext()) {
+      inValidTokens.add(it.next());
    }
  }

--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@ -23,7 +23,9 @@ import java.text.Collator;
 import java.text.DateFormat;
 import java.util.Calendar;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.Locale;
+import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.KeywordAnalyzer;
@ -768,7 +770,9 @@ public class TestQueryParser extends LuceneTestCase {

  public void testBoost()
    throws Exception {
-    StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(new String[]{"on"});
+    Set stopWords = new HashSet(1);
+    stopWords.add("on");
+    StandardAnalyzer oneStopAnalyzer = new StandardAnalyzer(stopWords);
    QueryParser qp = new QueryParser("field", oneStopAnalyzer);
    Query q = qp.parse("on^1.0");
    assertNotNull(q);
--- a/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
+++ b/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java
@ -31,6 +31,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.util.LuceneTestCase;

 import java.io.IOException;
+import java.util.HashSet;
 import java.util.LinkedList;

 /**
@ -169,7 +170,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase
    
  public void testPhrasePrefixWithBooleanQuery() throws IOException {
    RAMDirectory indexStore = new RAMDirectory();
-    IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new String[]{}), true, IndexWriter.MaxFieldLength.LIMITED);
+    IndexWriter writer = new IndexWriter(indexStore, new StandardAnalyzer(new HashSet(0)), true, IndexWriter.MaxFieldLength.LIMITED);
    add("This is a test", "object", writer);
    add("a note", "note", writer);
    writer.close();
--- a/src/test/org/apache/lucene/search/spans/TestSpans.java
+++ b/src/test/org/apache/lucene/search/spans/TestSpans.java
@ -39,6 +39,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.util.LuceneTestCase;

 import java.io.IOException;
+import java.util.HashSet;

 public class TestSpans extends LuceneTestCase {
  private IndexSearcher searcher;
@ -448,7 +449,7 @@ public class TestSpans extends LuceneTestCase {
  // LUCENE-1404
  public void testNPESpanQuery() throws Throwable {
    final Directory dir = new MockRAMDirectory();
-    final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new String[0]), IndexWriter.MaxFieldLength.LIMITED);
+    final IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(new HashSet(0)), IndexWriter.MaxFieldLength.LIMITED);

    // Add documents
    addDoc(writer, "1", "the big dogs went running to the market");