mirror of https://github.com/apache/lucene.git
LUCENE-1936: Remove deprecated charset support from Greek and Russian analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@820756 13f79535-47bb-0310-9956-ffa450edef68
parent c1f5e753d7
commit dd9c1b0101
@@ -6,6 +6,10 @@ Changes in runtime behavior

 API Changes

+* LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms
+  text exactly the same as LowerCaseFilter. Please use LowerCaseFilter
+  instead, which has the same functionality. (Robert Muir)
+
 Bug fixes

 * LUCENE-1781: Fixed various issues with the lat/lng bounding box
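The entry above amounts to a one-line swap in any analysis chain. A minimal before/after sketch, assuming the Lucene 2.9-era contrib-analyzers API that this commit targets; the sample text and class name are illustrative only.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class LowerCaseMigrationSketch {
        public static void main(String[] args) throws IOException {
            // Before: stream = new RussianLowerCaseFilter(tokenizer);
            // After:  the core LowerCaseFilter produces the same tokens.
            TokenStream stream = new LowerCaseFilter(
                new RussianLetterTokenizer(new StringReader("Съешь ЕЩЁ этих мягких булок")));
            TermAttribute term = stream.addAttribute(TermAttribute.class);
            while (stream.incrementToken()) {
                System.out.println(term.term()); // съешь ещё этих мягких булок
            }
        }
    }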
@@ -39,111 +39,19 @@ import java.util.Set;
  */
 public final class GreekAnalyzer extends Analyzer
 {
-    // the letters are indexes to the charset array (see GreekCharsets.java)
-    private static char A = 6;
-    private static char B = 7;
-    private static char G = 8;
-    private static char D = 9;
-    private static char E = 10;
-    private static char Z = 11;
-    private static char H = 12;
-    private static char TH = 13;
-    private static char I = 14;
-    private static char K = 15;
-    private static char L = 16;
-    private static char M = 17;
-    private static char N = 18;
-    private static char KS = 19;
-    private static char O = 20;
-    private static char P = 21;
-    private static char R = 22;
-    private static char S = 24; // skip final sigma
-    private static char T = 25;
-    private static char Y = 26;
-    private static char F = 27;
-    private static char X = 28;
-    private static char PS = 29;
-    private static char W = 30;
-
     /**
      * List of typical Greek stopwords.
      */
-    private static char[][] GREEK_STOP_WORDS = {
-        {O}, {H}, {T, O}, {O, I}, {T, A}, {T, O, Y}, {T, H, S}, {T, W, N}, {T, O, N},
-        {T, H, N}, {K, A, I}, {K, I}, {K}, {E, I, M, A, I}, {E, I, S, A, I}, {E, I, N, A, I},
-        {E, I, M, A, S, T, E}, {E, I, S, T, E}, {S, T, O}, {S, T, O, N}, {S, T, H}, {S, T, H, N},
-        {M, A}, {A, L, L, A}, {A, P, O}, {G, I, A}, {P, R, O, S}, {M, E}, {S, E}, {W, S},
-        {P, A, R, A}, {A, N, T, I}, {K, A, T, A}, {M, E, T, A}, {TH, A}, {N, A}, {D, E},
-        {D, E, N}, {M, H}, {M, H, N}, {E, P, I}, {E, N, W}, {E, A, N}, {A, N}, {T, O, T, E},
-        {P, O, Y}, {P, W, S}, {P, O, I, O, S}, {P, O, I, A}, {P, O, I, O}, {P, O, I, O, I},
-        {P, O, I, E, S}, {P, O, I, W, N}, {P, O, I, O, Y, S}, {A, Y, T, O, S}, {A, Y, T, H},
-        {A, Y, T, O}, {A, Y, T, O, I}, {A, Y, T, W, N}, {A, Y, T, O, Y, S}, {A, Y, T, E, S},
-        {A, Y, T, A}, {E, K, E, I, N, O, S}, {E, K, E, I, N, H}, {E, K, E, I, N, O},
-        {E, K, E, I, N, O, I}, {E, K, E, I, N, E, S}, {E, K, E, I, N, A}, {E, K, E, I, N, W, N},
-        {E, K, E, I, N, O, Y, S}, {O, P, W, S}, {O, M, W, S}, {I, S, W, S}, {O, S, O}, {O, T, I}
+    private static final String[] GREEK_STOP_WORDS = {
+        "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και",
+        "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
+        "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
+        "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
+        "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
+        "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
+        "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
+        "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
+        "ισωσ", "οσο", "οτι"
     };

     /**
@@ -151,28 +59,8 @@ public final class GreekAnalyzer extends Analyzer
      */
     private Set stopSet = new HashSet();

-    /**
-     * Charset for Greek letters.
-     * Represents encoding for 24 lowercase Greek letters.
-     * Predefined charsets can be taken from {@link GreekCharsets} class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     public GreekAnalyzer() {
-        charset = GreekCharsets.UnicodeGreek;
-        stopSet = StopFilter.makeStopSet(
-                    makeStopWords(GreekCharsets.UnicodeGreek));
-    }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #GreekAnalyzer()} instead.
-     */
-    public GreekAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
+        this(GREEK_STOP_WORDS);
     }

     /**
@@ -181,58 +69,16 @@ public final class GreekAnalyzer extends Analyzer
      */
     public GreekAnalyzer(String [] stopwords)
     {
-        charset = GreekCharsets.UnicodeGreek;
+        super();
         stopSet = StopFilter.makeStopSet(stopwords);
     }

-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(String[])} instead.
-     */
-    public GreekAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
-    }
-
-    /**
-     * Takes greek stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[GREEK_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = GREEK_STOP_WORDS[i];
-            // translate the word,using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(Map)} instead.
-     */
-    public GreekAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
-
     /**
      * Builds an analyzer with the given stop words.
      */
     public GreekAnalyzer(Map stopwords)
     {
-        charset = GreekCharsets.UnicodeGreek;
+        super();
         stopSet = new HashSet(stopwords.keySet());
     }

@@ -245,7 +91,7 @@ public final class GreekAnalyzer extends Analyzer
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
         TokenStream result = new StandardTokenizer(reader);
-        result = new GreekLowerCaseFilter(result, charset);
+        result = new GreekLowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
         return result;
     }
@@ -268,7 +114,7 @@ public final class GreekAnalyzer extends Analyzer
         if (streams == null) {
             streams = new SavedStreams();
             streams.source = new StandardTokenizer(reader);
-            streams.result = new GreekLowerCaseFilter(streams.source, charset);
+            streams.result = new GreekLowerCaseFilter(streams.source);
             streams.result = new StopFilter(streams.result, stopSet);
             setPreviousTokenStream(streams);
         } else {
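A short usage sketch of the reworked GreekAnalyzer, assuming the 2.9-era API visible in this diff (tokenStream(String, Reader) plus TermAttribute); the field name, sample text, and class name are illustrative only.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.el.GreekAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class GreekAnalyzerSketch {
        public static void main(String[] args) throws IOException {
            // The no-argument constructor now delegates to the Unicode GREEK_STOP_WORDS;
            // no charset table is involved anywhere in the chain.
            GreekAnalyzer analyzer = new GreekAnalyzer();
            TokenStream stream = analyzer.tokenStream("body",
                new StringReader("Αυτό είναι ένα μικρό κείμενο"));
            TermAttribute term = stream.addAttribute(TermAttribute.class);
            while (stream.incrementToken()) {
                System.out.println(term.term()); // lowercased terms, stop words removed
            }
        }
    }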
@@ -1,482 +0,0 @@
-package org.apache.lucene.analysis.el;
-
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
- * for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
- * <p>
- * Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68) characters,
- * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
- * the definition of a new charset as well as the required logic in the toLowerCase() method.
- * </p>
- * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
- */
-public class GreekCharsets
-{
-    // Unicode Greek charset
-    public static char[] UnicodeGreek = {
-        // lower case
-        '\u0390', '\u03AC', '\u03AD', '\u03AE', '\u03AF', '\u03B0', '\u03B1', '\u03B2', '\u03B3',
-        '\u03B4', '\u03B5', '\u03B6', '\u03B7', '\u03B8', '\u03B9', '\u03BA', '\u03BB', '\u03BC',
-        '\u03BD', '\u03BE', '\u03BF', '\u03C0', '\u03C1', '\u03C2', '\u03C3', '\u03C4', '\u03C5',
-        '\u03C6', '\u03C7', '\u03C8', '\u03C9', '\u03CA', '\u03CB', '\u03CC', '\u03CD', '\u03CE',
-        // upper case
-        '\u0386', '\u0388', '\u0389', '\u038A', '\u038C', '\u038E', '\u038F', '\u0391', '\u0392',
-        '\u0393', '\u0394', '\u0395', '\u0396', '\u0397', '\u0398', '\u0399', '\u039A', '\u039B',
-        '\u039C', '\u039D', '\u039E', '\u039F', '\u03A0', '\u03A1', '\u03A3', '\u03A4', '\u03A5',
-        '\u03A6', '\u03A7', '\u03A8', '\u03A9', '\u03AA', '\u03AB'
-    };
-
-    // ISO-8859-7 charset (ELOT-928)
-    public static char[] ISO = {
-        // lower case
-        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
-        0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
-        0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
-        // upper case
-        0xb6, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
-        0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
-        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
-    };
-
-    // CP1253 charset
-    public static char[] CP1253 = {
-        // lower case
-        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
-        0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
-        0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
-        // upper case
-        0xa2, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
-        0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
-        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
-    };
-
-    public static char toLowerCase(char letter, char[] charset)
-    {
-        if (charset == UnicodeGreek) {
-            // First deal with lower case, not accented letters
-            if (letter >= '\u03B1' && letter <= '\u03C9') {
-                // Special case 'small final sigma', where we return 'small sigma'
-                if (letter == '\u03C2') return '\u03C3'; else return letter;
-            }
-            // Then deal with lower case, accented letters
-            if (letter == '\u03AC') return '\u03B1';    // alpha with acute
-            if (letter == '\u03AD') return '\u03B5';    // epsilon with acute
-            if (letter == '\u03AE') return '\u03B7';    // eta with acute
-            if (letter == '\u03AF' || letter == '\u03CA' || letter == '\u0390') return '\u03B9';    // iota with acute, diaeresis, or both
-            if (letter == '\u03CD' || letter == '\u03CB' || letter == '\u03B0') return '\u03C5';    // upsilon with acute, diaeresis, or both
-            if (letter == '\u03CC') return '\u03BF';    // omicron with acute
-            if (letter == '\u03CE') return '\u03C9';    // omega with acute
-            // After that, deal with upper case, not accented letters
-            if (letter >= '\u0391' && letter <= '\u03A9') return (char) (letter + 32);
-            // Finally deal with upper case, accented letters
-            if (letter == '\u0386') return '\u03B1';    // alpha with acute
-            if (letter == '\u0388') return '\u03B5';    // epsilon with acute
-            if (letter == '\u0389') return '\u03B7';    // eta with acute
-            if (letter == '\u038A' || letter == '\u03AA') return '\u03B9';    // iota with acute or diaeresis
-            if (letter == '\u038E' || letter == '\u03AB') return '\u03C5';    // upsilon with acute or diaeresis
-            if (letter == '\u038C') return '\u03BF';    // omicron with acute
-            if (letter == '\u038F') return '\u03C9';    // omega with acute
-        } else if (charset == ISO) {
-            // First deal with lower case, not accented letters
-            if (letter >= 0xe1 && letter <= 0xf9) {
-                // Special case 'small final sigma', where we return 'small sigma'
-                if (letter == 0xf2) return 0xf3; else return letter;
-            }
-            // Then deal with lower case, accented letters
-            if (letter == 0xdc) return 0xe1;    // alpha with acute
-            if (letter == 0xdd) return 0xe5;    // epsilon with acute
-            if (letter == 0xde) return 0xe7;    // eta with acute
-            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) return '\u03B9';    // iota with acute, diaeresis, or both
-            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) return 0xf5;    // upsilon with acute, diaeresis, or both
-            if (letter == 0xfc) return 0xef;    // omicron with acute
-            if (letter == 0xfe) return 0xf9;    // omega with acute
-            // After that, deal with upper case, not accented letters
-            if (letter >= 0xc1 && letter <= 0xd9) return (char) (letter + 32);
-            // Finally deal with upper case, accented letters
-            if (letter == 0xb6) return 0xe1;    // alpha with acute
-            if (letter == 0xb8) return 0xe5;    // epsilon with acute
-            if (letter == 0xb9) return 0xe7;    // eta with acute
-            if (letter == 0xba || letter == 0xda) return 0xe9;    // iota with acute or diaeresis
-            if (letter == 0xbe || letter == 0xdb) return 0xf5;    // upsilon with acute or diaeresis
-            if (letter == 0xbc) return 0xef;    // omicron with acute
-            if (letter == 0xbf) return 0xf9;    // omega with acute
-        } else if (charset == CP1253) {
-            // First deal with lower case, not accented letters
-            if (letter >= 0xe1 && letter <= 0xf9) {
-                // Special case 'small final sigma', where we return 'small sigma'
-                if (letter == 0xf2) return 0xf3; else return letter;
-            }
-            // Then deal with lower case, accented letters
-            if (letter == 0xdc) return 0xe1;    // alpha with acute
-            if (letter == 0xdd) return 0xe5;    // epsilon with acute
-            if (letter == 0xde) return 0xe7;    // eta with acute
-            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) return '\u03B9';    // iota with acute, diaeresis, or both
-            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) return 0xf5;    // upsilon with acute, diaeresis, or both
-            if (letter == 0xfc) return 0xef;    // omicron with acute
-            if (letter == 0xfe) return 0xf9;    // omega with acute
-            // After that, deal with upper case, not accented letters
-            if (letter >= 0xc1 && letter <= 0xd9) return (char) (letter + 32);
-            // Finally deal with upper case, accented letters
-            if (letter == 0xa2) return 0xe1;    // alpha with acute
-            if (letter == 0xb8) return 0xe5;    // epsilon with acute
-            if (letter == 0xb9) return 0xe7;    // eta with acute
-            if (letter == 0xba || letter == 0xda) return 0xe9;    // iota with acute or diaeresis
-            if (letter == 0xbe || letter == 0xdb) return 0xf5;    // upsilon with acute or diaeresis
-            if (letter == 0xbc) return 0xef;    // omicron with acute
-            if (letter == 0xbf) return 0xf9;    // omega with acute
-        }
-        return Character.toLowerCase(letter);
-    }
-}
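With the table above gone, callers that used to pass GreekCharsets.ISO or GreekCharsets.CP1253 are expected to decode their bytes into Unicode before analysis. A hedged migration sketch using only standard JDK decoding; the file name is a placeholder, and the charset name assumes the running JRE ships ISO-8859-7 support (the Sun/Oracle JRE does).

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.Reader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.el.GreekAnalyzer;

    public class GreekDecodingSketch {
        public static void main(String[] args) throws IOException {
            // Decode the legacy byte encoding at the I/O boundary; the analyzer
            // itself now only ever sees Unicode characters.
            Reader decoded = new InputStreamReader(
                new FileInputStream("greek-doc.txt"), "ISO-8859-7");
            TokenStream stream = new GreekAnalyzer().tokenStream("body", decoded);
            // ... consume the stream as usual, then release the resources ...
            stream.close();
            decoded.close();
        }
    }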
@@ -23,44 +23,93 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Normalizes token text to lower case, analyzing given ("greek") charset.
+ * Normalizes token text to lower case, removes some Greek diacritics,
+ * and standardizes final sigma to sigma.
  *
  */
 public final class GreekLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
-     */
-    public GreekLowerCaseFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public GreekLowerCaseFilter(TokenStream in)
     {
-        this(in, GreekCharsets.UnicodeGreek);
+        super(in);
+        termAtt = addAttribute(TermAttribute.class);
     }

     public boolean incrementToken() throws IOException {
         if (input.incrementToken()) {
             char[] chArray = termAtt.termBuffer();
             int chLen = termAtt.termLength();
+            // TODO: iterate codepoints to support supp. characters
             for (int i = 0; i < chLen; i++)
             {
-                chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
+                chArray[i] = (char) lowerCase(chArray[i]);
             }
             return true;
         } else {
             return false;
         }
     }

+    private int lowerCase(int codepoint) {
+        switch(codepoint) {
+            /* There are two lowercase forms of sigma:
+             *   U+03C2: small final sigma (end of word)
+             *   U+03C3: small sigma (otherwise)
+             *
+             * Standardize both to U+03C3
+             */
+            case '\u03C2': /* small final sigma */
+                return '\u03C3'; /* small sigma */
+
+            /* Some greek characters contain diacritics.
+             * This filter removes these, converting to the lowercase base form.
+             */
+
+            case '\u0386': /* capital alpha with tonos */
+            case '\u03AC': /* small alpha with tonos */
+                return '\u03B1'; /* small alpha */
+
+            case '\u0388': /* capital epsilon with tonos */
+            case '\u03AD': /* small epsilon with tonos */
+                return '\u03B5'; /* small epsilon */
+
+            case '\u0389': /* capital eta with tonos */
+            case '\u03AE': /* small eta with tonos */
+                return '\u03B7'; /* small eta */
+
+            case '\u038A': /* capital iota with tonos */
+            case '\u03AA': /* capital iota with dialytika */
+            case '\u03AF': /* small iota with tonos */
+            case '\u03CA': /* small iota with dialytika */
+            case '\u0390': /* small iota with dialytika and tonos */
+                return '\u03B9'; /* small iota */
+
+            case '\u038E': /* capital upsilon with tonos */
+            case '\u03AB': /* capital upsilon with dialytika */
+            case '\u03CD': /* small upsilon with tonos */
+            case '\u03CB': /* small upsilon with dialytika */
+            case '\u03B0': /* small upsilon with dialytika and tonos */
+                return '\u03C5'; /* small upsilon */
+
+            case '\u038C': /* capital omicron with tonos */
+            case '\u03CC': /* small omicron with tonos */
+                return '\u03BF'; /* small omicron */
+
+            case '\u038F': /* capital omega with tonos */
+            case '\u03CE': /* small omega with tonos */
+                return '\u03C9'; /* small omega */
+
+            /* The previous implementation did the conversion below.
+             * Only implemented for backwards compatibility with old indexes.
+             */
+            case '\u03A2': /* reserved */
+                return '\u03C2'; /* small final sigma */
+
+            default:
+                return Character.toLowerCase(codepoint);
+        }
+    }
 }
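A small sketch of the new filter in isolation, assuming the 2.9-era WhitespaceTokenizer and TermAttribute API; the sample words and class name are illustrative only.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class GreekLowerCaseSketch {
        public static void main(String[] args) throws IOException {
            // "ΠΡΟΣ" and "προς" both come out as "προσ": capital sigma and final
            // sigma are standardized to the plain small sigma, so the terms match.
            TokenStream stream = new GreekLowerCaseFilter(
                new WhitespaceTokenizer(new StringReader("ΠΡΟΣ προς")));
            TermAttribute term = stream.addAttribute(TermAttribute.class);
            while (stream.incrementToken()) {
                System.out.println(term.term()); // προσ, προσ
            }
        }
    }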
@@ -24,6 +24,7 @@ import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -40,145 +41,20 @@ import org.apache.lucene.analysis.Tokenizer;
  */
 public final class RussianAnalyzer extends Analyzer
 {
-    // letters (currently unused letters are commented out)
-    private final static char A = 0;
-    private final static char B = 1;
-    private final static char V = 2;
-    private final static char G = 3;
-    private final static char D = 4;
-    private final static char E = 5;
-    private final static char ZH = 6;
-    private final static char Z = 7;
-    private final static char I = 8;
-    private final static char I_ = 9;
-    private final static char K = 10;
-    private final static char L = 11;
-    private final static char M = 12;
-    private final static char N = 13;
-    private final static char O = 14;
-    private final static char P = 15;
-    private final static char R = 16;
-    private final static char S = 17;
-    private final static char T = 18;
-    private final static char U = 19;
-    //private final static char F = 20;
-    private final static char X = 21;
-    //private final static char TS = 22;
-    private final static char CH = 23;
-    private final static char SH = 24;
-    private final static char SHCH = 25;
-    //private final static char HARD = 26;
-    private final static char Y = 27;
-    private final static char SOFT = 28;
-    private final static char AE = 29;
-    private final static char IU = 30;
-    private final static char IA = 31;
-
     /**
      * List of typical Russian stopwords.
      */
-    private static char[][] RUSSIAN_STOP_WORDS = {
-        {A}, {B, E, Z}, {B, O, L, E, E}, {B, Y}, {B, Y, L}, {B, Y, L, A}, {B, Y, L, I},
-        {B, Y, L, O}, {B, Y, T, SOFT}, {V}, {V, A, M}, {V, A, S}, {V, E, S, SOFT}, {V, O},
-        {V, O, T}, {V, S, E}, {V, S, E, G, O}, {V, S, E, X}, {V, Y}, {G, D, E}, {D, A},
-        {D, A, ZH, E}, {D, L, IA}, {D, O}, {E, G, O}, {E, E}, {E, I_,}, {E, IU}, {E, S, L, I},
-        {E, S, T, SOFT}, {E, SHCH, E}, {ZH, E}, {Z, A}, {Z, D, E, S, SOFT}, {I}, {I, Z},
-        {I, L, I}, {I, M}, {I, X}, {K}, {K, A, K}, {K, O}, {K, O, G, D, A}, {K, T, O},
-        {L, I}, {L, I, B, O}, {M, N, E}, {M, O, ZH, E, T}, {M, Y}, {N, A}, {N, A, D, O},
-        {N, A, SH}, {N, E}, {N, E, G, O}, {N, E, E}, {N, E, T}, {N, I}, {N, I, X}, {N, O},
-        {N, U}, {O}, {O, B}, {O, D, N, A, K, O}, {O, N}, {O, N, A}, {O, N, I}, {O, N, O},
-        {O, T}, {O, CH, E, N, SOFT}, {P, O}, {P, O, D}, {P, R, I}, {S}, {S, O}, {T, A, K},
-        {T, A, K, ZH, E}, {T, A, K, O, I_}, {T, A, M}, {T, E}, {T, E, M}, {T, O}, {T, O, G, O},
-        {T, O, ZH, E}, {T, O, I_}, {T, O, L, SOFT, K, O}, {T, O, M}, {T, Y}, {U}, {U, ZH, E},
-        {X, O, T, IA}, {CH, E, G, O}, {CH, E, I_}, {CH, E, M}, {CH, T, O}, {CH, T, O, B, Y},
-        {CH, SOFT, E}, {CH, SOFT, IA}, {AE, T, A}, {AE, T, I}, {AE, T, O}, {IA}
+    private static final String[] RUSSIAN_STOP_WORDS = {
+        "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
+        "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
+        "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
+        "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
+        "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо",
+        "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об",
+        "однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при",
+        "с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того",
+        "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
+        "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
     };

     /**
@@ -186,39 +62,8 @@ public final class RussianAnalyzer extends Analyzer
      */
     private Set stopSet = new HashSet();

-    /**
-     * Charset for Russian letters.
-     * Represents encoding for 32 lowercase Russian letters.
-     * Predefined charsets can be taken from RussianCharSets class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     public RussianAnalyzer() {
-        charset = RussianCharsets.UnicodeRussian;
-        stopSet = StopFilter.makeStopSet(
-                    makeStopWords(RussianCharsets.UnicodeRussian));
-    }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #RussianAnalyzer()} instead.
-     */
-    public RussianAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #RussianAnalyzer(String[])} instead.
-     */
-    public RussianAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
+        this(RUSSIAN_STOP_WORDS);
     }

     /**
@@ -226,49 +71,17 @@ public final class RussianAnalyzer extends Analyzer
      */
     public RussianAnalyzer(String[] stopwords)
     {
-        this.charset = RussianCharsets.UnicodeRussian;
+        super();
         stopSet = StopFilter.makeStopSet(stopwords);
     }

-    /** Takes russian stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[RUSSIAN_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
-            // translate the word, using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * TODO: create a Set version of this ctor
-     * @deprecated Use {@link #RussianAnalyzer(Map)} instead.
-     */
-    public RussianAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
-
     /**
      * Builds an analyzer with the given stop words.
      * TODO: create a Set version of this ctor
      */
     public RussianAnalyzer(Map stopwords)
     {
-        charset = RussianCharsets.UnicodeRussian;
+        super();
         stopSet = new HashSet(stopwords.keySet());
     }

@@ -283,10 +96,10 @@ public final class RussianAnalyzer extends Analyzer
     */
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
-        TokenStream result = new RussianLetterTokenizer(reader, charset);
-        result = new RussianLowerCaseFilter(result, charset);
+        TokenStream result = new RussianLetterTokenizer(reader);
+        result = new LowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
-        result = new RussianStemFilter(result, charset);
+        result = new RussianStemFilter(result);
         return result;
     }

@@ -309,10 +122,10 @@ public final class RussianAnalyzer extends Analyzer
         SavedStreams streams = (SavedStreams) getPreviousTokenStream();
         if (streams == null) {
             streams = new SavedStreams();
-            streams.source = new RussianLetterTokenizer(reader, charset);
-            streams.result = new RussianLowerCaseFilter(streams.source, charset);
+            streams.source = new RussianLetterTokenizer(reader);
+            streams.result = new LowerCaseFilter(streams.source);
             streams.result = new StopFilter(streams.result, stopSet);
-            streams.result = new RussianStemFilter(streams.result, charset);
+            streams.result = new RussianStemFilter(streams.result);
             setPreviousTokenStream(streams);
         } else {
             streams.source.reset(reader);
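A short usage sketch of the reworked RussianAnalyzer, whose default chain is now RussianLetterTokenizer, LowerCaseFilter, StopFilter and RussianStemFilter as wired above; same 2.9-era API assumptions, with an illustrative field name, text, and class name.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ru.RussianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class RussianAnalyzerSketch {
        public static void main(String[] args) throws IOException {
            // The no-argument constructor now uses the Unicode RUSSIAN_STOP_WORDS.
            RussianAnalyzer analyzer = new RussianAnalyzer();
            TokenStream stream = analyzer.tokenStream("body",
                new StringReader("Все счастливые семьи похожи друг на друга"));
            TermAttribute term = stream.addAttribute(TermAttribute.class);
            while (stream.incrementToken()) {
                System.out.println(term.term()); // stemmed, lowercased, stop words removed
            }
        }
    }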
@@ -1,314 +0,0 @@
-package org.apache.lucene.analysis.ru;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
- * for russian characters in Unicode, KOI8 and CP1252.
- * <p>
- * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
- * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
- * and adding logic to toLowerCase() method for that charset.
- * </p>
- * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
- * @version $Id$
- */
-public class RussianCharsets
-{
-    // Unicode Russian charset (lowercase letters only)
-    public static char[] UnicodeRussian = {
-        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
-        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
-        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
-        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
-        // upper case
-        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
-        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
-        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
-        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F',
-        // numbers
-        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
-    };
-
-    // KOI8 charset
-    public static char[] KOI8 = {
-        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, 0xc9, 0xca, 0xcb, 0xcc,
-        0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
-        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
-        // upper case
-        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec,
-        0xed, 0xee, 0xef, 0xf0, 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
-        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1,
-        // numbers
-        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
-    };
-
-    // CP1251 eharset
-    public static char[] CP1251 = {
-        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,
-        0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
-        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
-        // upper case
-        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
-        0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
-        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
-        // numbers
-        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
-    };
-
-    public static char toLowerCase(char letter, char[] charset)
-    {
-        if (charset == UnicodeRussian)
-        {
-            if (letter >= '\u0430' && letter <= '\u044F')
-            {
-                return letter;
-            }
-            if (letter >= '\u0410' && letter <= '\u042F')
-            {
-                return (char) (letter + 32);
-            }
-        }
-
-        if (charset == KOI8)
-        {
-            if (letter >= 0xe0 && letter <= 0xff)
-            {
-                return (char) (letter - 32);
-            }
-            if (letter >= 0xc0 && letter <= 0xdf)
-            {
-                return letter;
-            }
-        }
-
-        if (charset == CP1251)
-        {
-            if (letter >= 0xC0 && letter <= 0xDF)
-            {
-                return (char) (letter + 32);
-            }
-            if (letter >= 0xE0 && letter <= 0xFF)
-            {
-                return letter;
-            }
-        }
-
-        return Character.toLowerCase(letter);
-    }
-}
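The Unicode branch of the deleted toLowerCase() above only shifted the capital range А..Я down by 32, which is exactly what Character.toLowerCase() does for that block; a tiny standalone check of the equivalence (plain JDK, no Lucene types, class name illustrative):

    public class CyrillicLowerCaseCheck {
        public static void main(String[] args) {
            // For the basic Cyrillic capitals U+0410..U+042F the removed "+ 32"
            // table mapping and Character.toLowerCase() agree on every character.
            for (char c = '\u0410'; c <= '\u042F'; c++) {
                if ((char) (c + 32) != Character.toLowerCase(c)) {
                    throw new AssertionError("mismatch at U+" + Integer.toHexString(c));
                }
            }
            System.out.println("Character.toLowerCase matches the removed mapping");
        }
    }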
@ -25,49 +25,26 @@ import org.apache.lucene.util.AttributeSource;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
|
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
|
||||||
* by additionally looking up letters in a given "russian charset".
|
* by also allowing the basic latin digits 0-9.
|
||||||
* <p>
|
|
||||||
* The problem with
|
|
||||||
* {@link LetterTokenizer} is that it uses {@link Character#isLetter(char)} method,
|
|
||||||
* which doesn't know how to detect letters in encodings like CP1252 and KOI8
|
|
||||||
* (well-known problems with 0xD7 and 0xF7 chars)
|
|
||||||
* </p>
|
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class RussianLetterTokenizer extends CharTokenizer
|
public class RussianLetterTokenizer extends CharTokenizer
|
||||||
{
|
{
|
||||||
/**
|
|
||||||
* Charset this tokenizer uses.
|
|
||||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
|
||||||
*/
|
|
||||||
private char[] charset;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
|
|
||||||
*/
|
|
||||||
public RussianLetterTokenizer(Reader in, char[] charset)
|
|
||||||
{
|
|
||||||
super(in);
|
|
||||||
this.charset = charset;
|
|
||||||
}
|
|
||||||
|
|
||||||
public RussianLetterTokenizer(Reader in)
|
public RussianLetterTokenizer(Reader in)
|
||||||
{
|
{
|
||||||
this(in, RussianCharsets.UnicodeRussian);
|
super(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
public RussianLetterTokenizer(AttributeSource source, Reader in)
|
public RussianLetterTokenizer(AttributeSource source, Reader in)
|
||||||
{
|
{
|
||||||
super(source, in);
|
super(source, in);
|
||||||
this.charset = RussianCharsets.UnicodeRussian;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public RussianLetterTokenizer(AttributeFactory factory, Reader in)
|
public RussianLetterTokenizer(AttributeFactory factory, Reader in)
|
||||||
{
|
{
|
||||||
super(factory, in);
|
super(factory, in);
|
||||||
this.charset = RussianCharsets.UnicodeRussian;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -76,14 +53,9 @@ public class RussianLetterTokenizer extends CharTokenizer
|
||||||
*/
|
*/
|
||||||
protected boolean isTokenChar(char c)
|
protected boolean isTokenChar(char c)
|
||||||
{
|
{
|
||||||
/* in the next release, this can be implemented as isLetter(c) or [0-9] */
|
if (Character.isLetter(c) || (c >= '0' && c <= '9'))
|
||||||
if (Character.isLetter(c))
|
|
||||||
return true;
|
return true;
|
||||||
for (int i = 0; i < charset.length; i++)
|
else
|
||||||
{
|
return false;
|
||||||
if (c == charset[i])
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
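A sketch of the simplified isTokenChar() behavior, under the same 2.9-era API assumptions; the sample text and class name are illustrative. Digits stay token characters because the removed charset tables also listed '0'..'9'.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class RussianLetterTokenizerSketch {
        public static void main(String[] args) throws IOException {
            // Letters of any script and the digits 0-9 are kept; everything else splits.
            TokenStream stream = new RussianLetterTokenizer(
                new StringReader("Москва-2009, выпуск v2"));
            TermAttribute term = stream.addAttribute(TermAttribute.class);
            while (stream.incrementToken()) {
                System.out.println(term.term()); // Москва, 2009, выпуск, v2
            }
        }
    }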
@@ -19,38 +19,26 @@ package org.apache.lucene.analysis.ru;

 import java.io.IOException;

+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Normalizes token text to lower case, analyzing given ("russian") charset.
- *
+ * Normalizes token text to lower case.
+ * @deprecated Use {@link LowerCaseFilter} instead, which has the same
+ * functionality. This filter will be removed in Lucene 3.1
  *
  * @version $Id$
  */
 public final class RussianLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
-     */
-    public RussianLowerCaseFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public RussianLowerCaseFilter(TokenStream in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
+        termAtt = addAttribute(TermAttribute.class);
     }

     public final boolean incrementToken() throws IOException
@@ -60,7 +48,7 @@ public final class RussianLowerCaseFilter extends TokenFilter
             int chLen = termAtt.termLength();
             for (int i = 0; i < chLen; i++)
             {
-                chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
+                chArray[i] = Character.toLowerCase(chArray[i]);
             }
             return true;
         } else {
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -28,8 +29,8 @@ import java.io.IOException;
  * A {@link TokenFilter} that stems Russian words.
  * <p>
  * The implementation was inspired by GermanStemFilter.
- * The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter ,
- * because RussianStemFilter only works with lowercase part of any "russian" charset.
+ * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter ,
+ * because RussianStemFilter only works with lowercase characters.
  * </p>
  *
  * @version $Id$
@@ -43,19 +44,11 @@ public final class RussianStemFilter extends TokenFilter

     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
-     */
-    public RussianStemFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        stemmer = new RussianStemmer(charset);
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public RussianStemFilter(TokenStream in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
+        stemmer = new RussianStemmer();
+        termAtt = addAttribute(TermAttribute.class);
     }
     /**
      * Returns the next token in the stream, or null at EOS
@@ -25,47 +25,42 @@ package org.apache.lucene.analysis.ru;
  */
 class RussianStemmer
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     // positions of RV, R1 and R2 respectively
     private int RV, R1, R2;

     // letters (currently unused letters are commented out)
-    private final static char A = 0;
+    private final static char A = '\u0430';
-    //private final static char B = 1;
+    //private final static char B = '\u0431';
-    private final static char V = 2;
+    private final static char V = '\u0432';
-    private final static char G = 3;
+    private final static char G = '\u0433';
-    //private final static char D = 4;
+    //private final static char D = '\u0434';
-    private final static char E = 5;
+    private final static char E = '\u0435';
-    //private final static char ZH = 6;
+    //private final static char ZH = '\u0436';
-    //private final static char Z = 7;
+    //private final static char Z = '\u0437';
-    private final static char I = 8;
+    private final static char I = '\u0438';
-    private final static char I_ = 9;
+    private final static char I_ = '\u0439';
-    //private final static char K = 10;
+    //private final static char K = '\u043A';
-    private final static char L = 11;
+    private final static char L = '\u043B';
-    private final static char M = 12;
+    private final static char M = '\u043C';
-    private final static char N = 13;
+    private final static char N = '\u043D';
-    private final static char O = 14;
+    private final static char O = '\u043E';
-    //private final static char P = 15;
+    //private final static char P = '\u043F';
-    //private final static char R = 16;
+    //private final static char R = '\u0440';
-    private final static char S = 17;
+    private final static char S = '\u0441';
-    private final static char T = 18;
+    private final static char T = '\u0442';
-    private final static char U = 19;
+    private final static char U = '\u0443';
-    //private final static char F = 20;
+    //private final static char F = '\u0444';
-    private final static char X = 21;
+    private final static char X = '\u0445';
-    //private final static char TS = 22;
+    //private final static char TS = '\u0446';
-    //private final static char CH = 23;
+    //private final static char CH = '\u0447';
-    private final static char SH = 24;
+    private final static char SH = '\u0448';
-    private final static char SHCH = 25;
+    private final static char SHCH = '\u0449';
-    //private final static char HARD = 26;
+    //private final static char HARD = '\u044A';
-    private final static char Y = 27;
+    private final static char Y = '\u044B';
-    private final static char SOFT = 28;
+    private final static char SOFT = '\u044C';
-    private final static char AE = 29;
+    private final static char AE = '\u044D';
-    private final static char IU = 30;
+    private final static char IU = '\u044E';
-    private final static char IA = 31;
+    private final static char IA = '\u044F';

     // stem definitions
     private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
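Because the letter constants are now the Cyrillic code points themselves rather than indexes into a charset table, endings can be compared directly against the characters of the word being stemmed. A small self-contained illustration; the class and its two local constants are hypothetical mirrors of values in the hunk above, not the stemmer's own fields.

    // Hypothetical illustration: A and IA mirror '\u0430' (Cyrillic small a)
    // and '\u044F' (Cyrillic small ya) from the diff above.
    public class UnicodeConstantSketch {
        private static final char A = '\u0430';
        private static final char IA = '\u044F';

        public static void main(String[] args) {
            String zone = "красивая";              // sample adjective ending in "ая"
            boolean endsInAja =
                zone.charAt(zone.length() - 2) == A
                && zone.charAt(zone.length() - 1) == IA;
            System.out.println(endsInAja);         // true, with no charset[] lookup
        }
    }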
@@ -256,16 +251,6 @@ class RussianStemmer
         super();
     }

-    /**
-     * RussianStemmer constructor comment.
-     * @deprecated Use {@link #RussianStemmer()} instead.
-     */
-    public RussianStemmer(char[] charset)
-    {
-        super();
-        this.charset = charset;
-    }
-
     /**
      * Adjectival ending is an adjective ending,
      * optionally preceded by participle ending.
@@ -333,7 +318,7 @@ class RussianStemmer
         int stemmingIndex = startIndex;
         for (int j = theEnding.length - 1; j >= 0; j--)
         {
-            if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
+            if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
             {
                 match = false;
                 break;
@@ -451,7 +436,7 @@ class RussianStemmer
     {
         for (int i = 0; i < vowels.length; i++)
         {
-            if (letter == charset[vowels[i]])
+            if (letter == vowels[i])
                 return true;
         }
         return false;
@@ -499,7 +484,7 @@ class RussianStemmer
     private boolean removeI(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == I)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;
@@ -518,7 +503,7 @@ class RussianStemmer
     private boolean removeSoft(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;
@@ -529,17 +514,6 @@ class RussianStemmer
         }
     }

-    /**
-     * Insert the method's description here.
-     * Creation date: (16/03/2002 10:58:42 PM)
-     * @param newCharset char[]
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    public void setCharset(char[] newCharset)
-    {
-        charset = newCharset;
-    }
-
     /**
      * Finds the stem for given Russian word.
      * Creation date: (16/03/2002 3:36:48 PM)
@@ -623,24 +597,12 @@ class RussianStemmer
             || findAndRemoveEnding(stemmingZone, verbEndings2);
     }

-    /**
-     * Static method for stemming with different charsets
-     * @deprecated Use {@link #stemWord(String)} instead.
-     */
-    public static String stem(String theWord, char[] charset)
-    {
-        RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(charset);
-        return stemmer.stem(theWord);
-    }
-
     /**
      * Static method for stemming.
      */
     public static String stemWord(String theWord)
     {
         RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(RussianCharsets.UnicodeRussian);
         return stemmer.stem(theWord);
     }
 }
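For callers that used the removed static entry point, the migration is mechanical. A hedged sketch follows: RussianStemmer is package-private, so like the bundled tests this would have to live in org.apache.lucene.analysis.ru, and the sample word and class name are made up; only stemWord(String) comes from the hunk above.

    package org.apache.lucene.analysis.ru;

    // Hypothetical migration sketch from the removed charset-based API.
    public class StemWordMigrationSketch {
        public static void main(String[] args) {
            String word = "представление";

            // Before this change (deprecated):
            //   String stem = RussianStemmer.stem(word, RussianCharsets.UnicodeRussian);

            // After this change: plain Unicode input, no charset table.
            String stem = RussianStemmer.stemWord(word);
            System.out.println(word + " -> " + stem);
        }
    }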
@@ -42,14 +42,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase

     private InputStreamReader sampleUnicode;

-    private Reader inWordsKOI8;
-
-    private Reader sampleKOI8;
-
-    private Reader inWords1251;
-
-    private Reader sample1251;
-
     private File dataDir;

     protected void setUp() throws Exception
@@ -98,76 +90,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
         sampleUnicode.close();
     }

-    public void testKOI8() throws IOException
-    {
-        //System.out.println(new java.util.Date());
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
-        // KOI8
-        inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
-
-        sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
-
-        TokenStream in = ra.tokenStream("all", inWordsKOI8);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(
-                sampleKOI8,
-                RussianCharsets.KOI8);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-            if (in.incrementToken() == false)
-                break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "KOI8",
-                text.term(),
-                nextSampleToken == false
-                ? null
-                : sampleText.term());
-        }
-        inWordsKOI8.close();
-        sampleKOI8.close();
-    }
-
-    public void test1251() throws IOException
-    {
-        // 1251
-        inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
-
-        sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
-
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
-        TokenStream in = ra.tokenStream("", inWords1251);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(
-                sample1251,
-                RussianCharsets.CP1251);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-            if (in.incrementToken() == false)
-                break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "1251",
-                text.term(),
-                nextSampleToken == false
-                ? null
-                : sampleText.term());
-        }
-
-        inWords1251.close();
-        sample1251.close();
-    }
-
     public void testDigitsInRussianCharset()
     {
         Reader reader = new StringReader("text 1000");
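With the KOI8 and CP1251 tests removed, legacy-encoded input is expected to be decoded to Unicode at the I/O boundary before it ever reaches the analyzer. A hedged sketch of that boundary: the file name and class are made up, and it assumes the JRE provides the KOI8-R charset (available in Sun's JRE, though not among the charsets every JVM is required to support).

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.Reader;

    // Hypothetical sketch: decode legacy KOI8-R bytes to Unicode up front;
    // the analyzer itself no longer knows anything about character sets.
    public class Koi8DecodingSketch {
        public static Reader openLegacyFile(String path) throws IOException {
            return new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "KOI8-R"));
        }

        public static void main(String[] args) throws IOException {
            Reader russianText = openLegacyFile("legacy-koi8.txt");
            // ... hand russianText to RussianLetterTokenizer / RussianAnalyzer
            // as ordinary Unicode input, then close it.
            russianText.close();
        }
    }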
@@ -84,9 +84,8 @@ public class TestRussianStem extends LuceneTestCase
         {
             //if ( (i % 100) == 0 ) System.err.println(i);
             String realStem =
-                RussianStemmer.stem(
-                    (String) words.get(i),
-                    RussianCharsets.UnicodeRussian);
+                RussianStemmer.stemWord(
+                    (String) words.get(i));
             assertEquals("unicode", stems.get(i), realStem);
         }
     }
@@ -1 +0,0 @@
-[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]
@@ -1 +0,0 @@
-[deleted one-line test resource; its content appears to be the same stem list in a legacy single-byte Russian encoding and does not render as readable text here]
@@ -1,2 +0,0 @@
-Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
-узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.
@@ -1,2 +0,0 @@
-Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
-узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.