mirror of https://github.com/apache/lucene.git
LUCENE-1936: Remove deprecated charset support from Greek and Russian analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@820756 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in: parent c1f5e753d7, commit dd9c1b0101
CHANGES.txt
@@ -6,6 +6,10 @@ Changes in runtime behavior

 API Changes

+* LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms
+  text exactly the same as LowerCaseFilter. Please use LowerCaseFilter
+  instead, which has the same functionality. (Robert Muir)
+
 Bug fixes

 * LUCENE-1781: Fixed various issues with the lat/lng bounding box
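The CHANGES entry above boils down to a one-line substitution in any hand-built analysis chain. A minimal migration sketch, assuming the Lucene 2.9-era classes touched by this commit (the wrapper class name and the sample string are illustrative, not part of the commit):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;

public class LowerCaseMigration {
    public static void main(String[] args) throws IOException {
        TokenStream source = new RussianLetterTokenizer(new StringReader("Вместе С Тем"));

        // Before (deprecated by this commit):
        //   TokenStream result = new RussianLowerCaseFilter(source);
        // After: the generic LowerCaseFilter behaves identically for Russian text.
        TokenStream result = new LowerCaseFilter(source);

        TermAttribute term = result.addAttribute(TermAttribute.class);
        while (result.incrementToken()) {
            System.out.println(term.term());   // prints lowercased tokens
        }
    }
}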
GreekAnalyzer.java
@@ -39,111 +39,19 @@ import java.util.Set;
  */
 public final class GreekAnalyzer extends Analyzer
 {
-    // the letters are indexes to the charset array (see GreekCharsets.java)
-    private static char A = 6,  B = 7,  G = 8,  D = 9,  E = 10, Z = 11, H = 12, TH = 13;
-    private static char I = 14, K = 15, L = 16, M = 17, N = 18, KS = 19, O = 20, P = 21;
-    private static char R = 22, S = 24 /* skip final sigma */, T = 25, Y = 26, F = 27;
-    private static char X = 28, PS = 29, W = 30;
-
     /**
      * List of typical Greek stopwords.
      */
-    private static char[][] GREEK_STOP_WORDS = {
-        {O}, {H}, {T, O}, {O, I}, {T, A}, {T, O, Y}, {T, H, S}, {T, W, N}, {T, O, N}, {T, H, N},
-        {K, A, I}, {K, I}, {K}, {E, I, M, A, I}, {E, I, S, A, I}, {E, I, N, A, I},
-        {E, I, M, A, S, T, E}, {E, I, S, T, E}, {S, T, O}, {S, T, O, N}, {S, T, H}, {S, T, H, N},
-        {M, A}, {A, L, L, A}, {A, P, O}, {G, I, A}, {P, R, O, S}, {M, E}, {S, E}, {W, S},
-        {P, A, R, A}, {A, N, T, I}, {K, A, T, A}, {M, E, T, A}, {TH, A}, {N, A}, {D, E}, {D, E, N},
-        {M, H}, {M, H, N}, {E, P, I}, {E, N, W}, {E, A, N}, {A, N}, {T, O, T, E}, {P, O, Y},
-        {P, W, S}, {P, O, I, O, S}, {P, O, I, A}, {P, O, I, O}, {P, O, I, O, I}, {P, O, I, E, S},
-        {P, O, I, W, N}, {P, O, I, O, Y, S}, {A, Y, T, O, S}, {A, Y, T, H}, {A, Y, T, O},
-        {A, Y, T, O, I}, {A, Y, T, W, N}, {A, Y, T, O, Y, S}, {A, Y, T, E, S}, {A, Y, T, A},
-        {E, K, E, I, N, O, S}, {E, K, E, I, N, H}, {E, K, E, I, N, O}, {E, K, E, I, N, O, I},
-        {E, K, E, I, N, E, S}, {E, K, E, I, N, A}, {E, K, E, I, N, W, N}, {E, K, E, I, N, O, Y, S},
-        {O, P, W, S}, {O, M, W, S}, {I, S, W, S}, {O, S, O}, {O, T, I}
+    private static final String[] GREEK_STOP_WORDS = {
+        "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και",
+        "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
+        "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
+        "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
+        "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
+        "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
+        "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
+        "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
+        "ισωσ", "οσο", "οτι"
     };

     /**
@@ -151,28 +59,8 @@ public final class GreekAnalyzer extends Analyzer
      */
     private Set stopSet = new HashSet();

-    /**
-     * Charset for Greek letters.
-     * Represents encoding for 24 lowercase Greek letters.
-     * Predefined charsets can be taken from {@link GreekCharsets} class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     public GreekAnalyzer() {
-        charset = GreekCharsets.UnicodeGreek;
-        stopSet = StopFilter.makeStopSet(
-            makeStopWords(GreekCharsets.UnicodeGreek));
-    }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #GreekAnalyzer()} instead.
-     */
-    public GreekAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
+        this(GREEK_STOP_WORDS);
     }

     /**
@@ -181,58 +69,16 @@ public final class GreekAnalyzer extends Analyzer
      */
     public GreekAnalyzer(String [] stopwords)
     {
-        charset = GreekCharsets.UnicodeGreek;
+        super();
         stopSet = StopFilter.makeStopSet(stopwords);
     }

-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(String[])} instead.
-     */
-    public GreekAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
-    }
-
-    /**
-     * Takes greek stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[GREEK_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = GREEK_STOP_WORDS[i];
-            // translate the word, using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(Map)} instead.
-     */
-    public GreekAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
-
     /**
      * Builds an analyzer with the given stop words.
      */
     public GreekAnalyzer(Map stopwords)
     {
-        charset = GreekCharsets.UnicodeGreek;
+        super();
         stopSet = new HashSet(stopwords.keySet());
     }

@@ -245,7 +91,7 @@ public final class GreekAnalyzer extends Analyzer
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
         TokenStream result = new StandardTokenizer(reader);
-        result = new GreekLowerCaseFilter(result, charset);
+        result = new GreekLowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
         return result;
     }

@@ -268,7 +114,7 @@ public final class GreekAnalyzer extends Analyzer
         if (streams == null) {
             streams = new SavedStreams();
             streams.source = new StandardTokenizer(reader);
-            streams.result = new GreekLowerCaseFilter(streams.source, charset);
+            streams.result = new GreekLowerCaseFilter(streams.source);
             streams.result = new StopFilter(streams.result, stopSet);
             setPreviousTokenStream(streams);
         } else {
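With the charset plumbing gone, GreekAnalyzer is driven purely on Unicode text. A minimal usage sketch assuming the Lucene 2.9-era API shown in this diff (the example class, field name and sample phrase are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class GreekAnalyzerExample {
    public static void main(String[] args) throws IOException {
        GreekAnalyzer analyzer = new GreekAnalyzer();   // uses the built-in GREEK_STOP_WORDS
        TokenStream ts = analyzer.tokenStream("content", new StringReader("Αυτό είναι ένα κείμενο"));

        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            // tokens come out lowercased, with diacritics stripped and stop words removed
            System.out.println(term.term());
        }
    }
}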
GreekCharsets.java (entire file deleted)
@@ -1,482 +0,0 @@
package org.apache.lucene.analysis.el;

[Apache License 2.0 file header]

/**
 * GreekCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
 * for Greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
 * <p>
 * Each encoding scheme contains lowercase (positions 0-35) and uppercase (positions 36-68) characters,
 * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
 * the definition of a new charset as well as the required logic in the toLowerCase() method.
 * </p>
 * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
 */
public class GreekCharsets
{
    // Unicode Greek charset
    public static char[] UnicodeGreek = {
        // lower case
        '\u0390', '\u03AC', '\u03AD', '\u03AE', '\u03AF', '\u03B0', '\u03B1', '\u03B2', '\u03B3',
        '\u03B4', '\u03B5', '\u03B6', '\u03B7', '\u03B8', '\u03B9', '\u03BA', '\u03BB', '\u03BC',
        '\u03BD', '\u03BE', '\u03BF', '\u03C0', '\u03C1', '\u03C2', '\u03C3', '\u03C4', '\u03C5',
        '\u03C6', '\u03C7', '\u03C8', '\u03C9', '\u03CA', '\u03CB', '\u03CC', '\u03CD', '\u03CE',
        // upper case
        '\u0386', '\u0388', '\u0389', '\u038A', '\u038C', '\u038E', '\u038F', '\u0391', '\u0392',
        '\u0393', '\u0394', '\u0395', '\u0396', '\u0397', '\u0398', '\u0399', '\u039A', '\u039B',
        '\u039C', '\u039D', '\u039E', '\u039F', '\u03A0', '\u03A1', '\u03A3', '\u03A4', '\u03A5',
        '\u03A6', '\u03A7', '\u03A8', '\u03A9', '\u03AA', '\u03AB'
    };

    // ISO-8859-7 charset (ELOT-928)
    public static char[] ISO = {
        // lower case
        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
        0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
        0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
        // upper case
        0xb6, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
        0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
    };

    // CP1253 charset
    public static char[] CP1253 = {
        // lower case
        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
        0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
        0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
        // upper case
        0xa2, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
        0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
    };

    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeGreek) {
            // First deal with lower case, not accented letters
            if (letter >= '\u03B1' && letter <= '\u03C9') {
                // Special case 'small final sigma', where we return 'small sigma'
                return (letter == '\u03C2') ? '\u03C3' : letter;
            }
            // Then deal with lower case, accented letters
            if (letter == '\u03AC') return '\u03B1';                                             // alpha with acute
            if (letter == '\u03AD') return '\u03B5';                                             // epsilon with acute
            if (letter == '\u03AE') return '\u03B7';                                             // eta with acute
            if (letter == '\u03AF' || letter == '\u03CA' || letter == '\u0390') return '\u03B9'; // iota with acute/diaeresis
            if (letter == '\u03CD' || letter == '\u03CB' || letter == '\u03B0') return '\u03C5'; // upsilon with acute/diaeresis
            if (letter == '\u03CC') return '\u03BF';                                             // omicron with acute
            if (letter == '\u03CE') return '\u03C9';                                             // omega with acute
            // After that, deal with upper case, not accented letters
            if (letter >= '\u0391' && letter <= '\u03A9') return (char) (letter + 32);
            // Finally deal with upper case, accented letters
            if (letter == '\u0386') return '\u03B1';                                             // alpha with acute
            if (letter == '\u0388') return '\u03B5';                                             // epsilon with acute
            if (letter == '\u0389') return '\u03B7';                                             // eta with acute
            if (letter == '\u038A' || letter == '\u03AA') return '\u03B9';                       // iota with acute/diaeresis
            if (letter == '\u038E' || letter == '\u03AB') return '\u03C5';                       // upsilon with acute/diaeresis
            if (letter == '\u038C') return '\u03BF';                                             // omicron with acute
            if (letter == '\u038F') return '\u03C9';                                             // omega with acute
        } else if (charset == ISO) {
            // First deal with lower case, not accented letters
            if (letter >= 0xe1 && letter <= 0xf9) {
                // Special case 'small final sigma', where we return 'small sigma'
                return (letter == 0xf2) ? (char) 0xf3 : letter;
            }
            // Then deal with lower case, accented letters
            if (letter == 0xdc) return 0xe1;                                       // alpha with acute
            if (letter == 0xdd) return 0xe5;                                       // epsilon with acute
            if (letter == 0xde) return 0xe7;                                       // eta with acute
            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) return '\u03B9'; // iota with acute/diaeresis
            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) return 0xf5;   // upsilon with acute/diaeresis
            if (letter == 0xfc) return 0xef;                                       // omicron with acute
            if (letter == 0xfe) return 0xf9;                                       // omega with acute
            // After that, deal with upper case, not accented letters
            if (letter >= 0xc1 && letter <= 0xd9) return (char) (letter + 32);
            // Finally deal with upper case, accented letters
            if (letter == 0xb6) return 0xe1;                                       // alpha with acute
            if (letter == 0xb8) return 0xe5;                                       // epsilon with acute
            if (letter == 0xb9) return 0xe7;                                       // eta with acute
            if (letter == 0xba || letter == 0xda) return 0xe9;                     // iota with acute/diaeresis
            if (letter == 0xbe || letter == 0xdb) return 0xf5;                     // upsilon with acute/diaeresis
            if (letter == 0xbc) return 0xef;                                       // omicron with acute
            if (letter == 0xbf) return 0xf9;                                       // omega with acute
        } else if (charset == CP1253) {
            // First deal with lower case, not accented letters
            if (letter >= 0xe1 && letter <= 0xf9) {
                // Special case 'small final sigma', where we return 'small sigma'
                return (letter == 0xf2) ? (char) 0xf3 : letter;
            }
            // Then deal with lower case, accented letters
            if (letter == 0xdc) return 0xe1;                                       // alpha with acute
            if (letter == 0xdd) return 0xe5;                                       // epsilon with acute
            if (letter == 0xde) return 0xe7;                                       // eta with acute
            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) return '\u03B9'; // iota with acute/diaeresis
            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) return 0xf5;   // upsilon with acute/diaeresis
            if (letter == 0xfc) return 0xef;                                       // omicron with acute
            if (letter == 0xfe) return 0xf9;                                       // omega with acute
            // After that, deal with upper case, not accented letters
            if (letter >= 0xc1 && letter <= 0xd9) return (char) (letter + 32);
            // Finally deal with upper case, accented letters
            if (letter == 0xa2) return 0xe1;                                       // alpha with acute
            if (letter == 0xb8) return 0xe5;                                       // epsilon with acute
            if (letter == 0xb9) return 0xe7;                                       // eta with acute
            if (letter == 0xba || letter == 0xda) return 0xe9;                     // iota with acute/diaeresis
            if (letter == 0xbe || letter == 0xdb) return 0xf5;                     // upsilon with acute/diaeresis
            if (letter == 0xbc) return 0xef;                                       // omicron with acute
            if (letter == 0xbf) return 0xf9;                                       // omega with acute
        }

        return Character.toLowerCase(letter);
    }
}
GreekLowerCaseFilter.java
@@ -23,44 +23,93 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Normalizes token text to lower case, analyzing given ("greek") charset.
+ * Normalizes token text to lower case, removes some Greek diacritics,
+ * and standardizes final sigma to sigma.
  *
  */
 public final class GreekLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
-     */
-    public GreekLowerCaseFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public GreekLowerCaseFilter(TokenStream in)
     {
-        this(in, GreekCharsets.UnicodeGreek);
+        super(in);
+        termAtt = addAttribute(TermAttribute.class);
     }

     public boolean incrementToken() throws IOException {
         if (input.incrementToken()) {
             char[] chArray = termAtt.termBuffer();
             int chLen = termAtt.termLength();
+            // TODO: iterate codepoints to support supp. characters
             for (int i = 0; i < chLen; i++)
             {
-                chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
+                chArray[i] = (char) lowerCase(chArray[i]);
             }
             return true;
         } else {
             return false;
         }
     }
+
+    private int lowerCase(int codepoint) {
+        switch(codepoint) {
+            /* There are two lowercase forms of sigma:
+             *   U+03C2: small final sigma (end of word)
+             *   U+03C3: small sigma (otherwise)
+             * Standardize both to U+03C3
+             */
+            case '\u03C2': /* small final sigma */
+                return '\u03C3'; /* small sigma */
+
+            /* Some greek characters contain diacritics.
+             * This filter removes these, converting to the lowercase base form.
+             */
+            case '\u0386': /* capital alpha with tonos */
+            case '\u03AC': /* small alpha with tonos */
+                return '\u03B1'; /* small alpha */
+
+            case '\u0388': /* capital epsilon with tonos */
+            case '\u03AD': /* small epsilon with tonos */
+                return '\u03B5'; /* small epsilon */
+
+            case '\u0389': /* capital eta with tonos */
+            case '\u03AE': /* small eta with tonos */
+                return '\u03B7'; /* small eta */
+
+            case '\u038A': /* capital iota with tonos */
+            case '\u03AA': /* capital iota with dialytika */
+            case '\u03AF': /* small iota with tonos */
+            case '\u03CA': /* small iota with dialytika */
+            case '\u0390': /* small iota with dialytika and tonos */
+                return '\u03B9'; /* small iota */
+
+            case '\u038E': /* capital upsilon with tonos */
+            case '\u03AB': /* capital upsilon with dialytika */
+            case '\u03CD': /* small upsilon with tonos */
+            case '\u03CB': /* small upsilon with dialytika */
+            case '\u03B0': /* small upsilon with dialytika and tonos */
+                return '\u03C5'; /* small upsilon */
+
+            case '\u038C': /* capital omicron with tonos */
+            case '\u03CC': /* small omicron with tonos */
+                return '\u03BF'; /* small omicron */
+
+            case '\u038F': /* capital omega with tonos */
+            case '\u03CE': /* small omega with tonos */
+                return '\u03C9'; /* small omega */
+
+            /* The previous implementation did the conversion below.
+             * Only implemented for backwards compatibility with old indexes.
+             */
+            case '\u03A2': /* reserved */
+                return '\u03C2'; /* small final sigma */
+
+            default:
+                return Character.toLowerCase(codepoint);
+        }
+    }
+}
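The new lowerCase() switch above folds final sigma to plain sigma and strips tonos/dialytika while lowercasing. A small standalone sketch of the filter in use, assuming the classes shown in this diff (the example class and sample word are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class GreekLowerCaseDemo {
    public static void main(String[] args) throws IOException {
        // "Ωραίος" should come out as "ωραιοσ": lowercased, tonos removed,
        // and the trailing final sigma folded to sigma by the new switch.
        TokenStream ts = new GreekLowerCaseFilter(
                new StandardTokenizer(new StringReader("Ωραίος")));

        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());
        }
    }
}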
RussianAnalyzer.java
@@ -24,6 +24,7 @@ import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;

@@ -40,145 +41,20 @@ import org.apache.lucene.analysis.Tokenizer;
  */
 public final class RussianAnalyzer extends Analyzer
 {
-    // letters (currently unused letters are commented out)
-    private final static char A = 0,  B = 1,  V = 2,  G = 3,  D = 4,  E = 5,  ZH = 6, Z = 7;
-    private final static char I = 8,  I_ = 9, K = 10, L = 11, M = 12, N = 13, O = 14, P = 15;
-    private final static char R = 16, S = 17, T = 18, U = 19, /* F = 20, */ X = 21;
-    private final static char /* TS = 22, */ CH = 23, SH = 24, SHCH = 25, /* HARD = 26, */ Y = 27;
-    private final static char SOFT = 28, AE = 29, IU = 30, IA = 31;
-
     /**
      * List of typical Russian stopwords.
      */
-    private static char[][] RUSSIAN_STOP_WORDS = {
-        {A}, {B, E, Z}, {B, O, L, E, E}, {B, Y}, {B, Y, L}, {B, Y, L, A}, {B, Y, L, I}, {B, Y, L, O},
-        {B, Y, T, SOFT}, {V}, {V, A, M}, {V, A, S}, {V, E, S, SOFT}, {V, O}, {V, O, T}, {V, S, E},
-        {V, S, E, G, O}, {V, S, E, X}, {V, Y}, {G, D, E}, {D, A}, {D, A, ZH, E}, {D, L, IA}, {D, O},
-        {E, G, O}, {E, E}, {E, I_}, {E, IU}, {E, S, L, I}, {E, S, T, SOFT}, {E, SHCH, E}, {ZH, E},
-        {Z, A}, {Z, D, E, S, SOFT}, {I}, {I, Z}, {I, L, I}, {I, M}, {I, X}, {K}, {K, A, K}, {K, O},
-        {K, O, G, D, A}, {K, T, O}, {L, I}, {L, I, B, O}, {M, N, E}, {M, O, ZH, E, T}, {M, Y},
-        {N, A}, {N, A, D, O}, {N, A, SH}, {N, E}, {N, E, G, O}, {N, E, E}, {N, E, T}, {N, I},
-        {N, I, X}, {N, O}, {N, U}, {O}, {O, B}, {O, D, N, A, K, O}, {O, N}, {O, N, A}, {O, N, I},
-        {O, N, O}, {O, T}, {O, CH, E, N, SOFT}, {P, O}, {P, O, D}, {P, R, I}, {S}, {S, O},
-        {T, A, K}, {T, A, K, ZH, E}, {T, A, K, O, I_}, {T, A, M}, {T, E}, {T, E, M}, {T, O},
-        {T, O, G, O}, {T, O, ZH, E}, {T, O, I_}, {T, O, L, SOFT, K, O}, {T, O, M}, {T, Y}, {U},
-        {U, ZH, E}, {X, O, T, IA}, {CH, E, G, O}, {CH, E, I_}, {CH, E, M}, {CH, T, O},
-        {CH, T, O, B, Y}, {CH, SOFT, E}, {CH, SOFT, IA}, {AE, T, A}, {AE, T, I}, {AE, T, O}, {IA}
+    private static final String[] RUSSIAN_STOP_WORDS = {
+        "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
+        "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
+        "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
+        "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
+        "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо",
+        "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об",
+        "однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при",
+        "с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того",
+        "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
+        "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
     };

     /**
@@ -186,89 +62,26 @@ public final class RussianAnalyzer extends Analyzer
      */
     private Set stopSet = new HashSet();

-    /**
-     * Charset for Russian letters.
-     * Represents encoding for 32 lowercase Russian letters.
-     * Predefined charsets can be taken from RussianCharSets class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     public RussianAnalyzer() {
-        charset = RussianCharsets.UnicodeRussian;
-        stopSet = StopFilter.makeStopSet(
-            makeStopWords(RussianCharsets.UnicodeRussian));
-    }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #RussianAnalyzer()} instead.
-     */
-    public RussianAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #RussianAnalyzer(String[])} instead.
-     */
-    public RussianAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
+        this(RUSSIAN_STOP_WORDS);
     }

     /**
      * Builds an analyzer with the given stop words.
      */
     public RussianAnalyzer(String[] stopwords)
     {
-        this.charset = RussianCharsets.UnicodeRussian;
+        super();
         stopSet = StopFilter.makeStopSet(stopwords);
     }

-    /** Takes russian stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[RUSSIAN_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
-            // translate the word, using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * TODO: create a Set version of this ctor
-     * @deprecated Use {@link #RussianAnalyzer(Map)} instead.
-     */
-    public RussianAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
-
     /**
      * Builds an analyzer with the given stop words.
      * TODO: create a Set version of this ctor
      */
     public RussianAnalyzer(Map stopwords)
     {
-        charset = RussianCharsets.UnicodeRussian;
+        super();
         stopSet = new HashSet(stopwords.keySet());
     }

@@ -283,10 +96,10 @@ public final class RussianAnalyzer extends Analyzer
      */
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
-        TokenStream result = new RussianLetterTokenizer(reader, charset);
-        result = new RussianLowerCaseFilter(result, charset);
+        TokenStream result = new RussianLetterTokenizer(reader);
+        result = new LowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
-        result = new RussianStemFilter(result, charset);
+        result = new RussianStemFilter(result);
         return result;
     }

@@ -309,10 +122,10 @@ public final class RussianAnalyzer extends Analyzer
         SavedStreams streams = (SavedStreams) getPreviousTokenStream();
         if (streams == null) {
             streams = new SavedStreams();
-            streams.source = new RussianLetterTokenizer(reader, charset);
-            streams.result = new RussianLowerCaseFilter(streams.source, charset);
+            streams.source = new RussianLetterTokenizer(reader);
+            streams.result = new LowerCaseFilter(streams.source);
             streams.result = new StopFilter(streams.result, stopSet);
-            streams.result = new RussianStemFilter(streams.result, charset);
+            streams.result = new RussianStemFilter(streams.result);
             setPreviousTokenStream(streams);
         } else {
             streams.source.reset(reader);
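Same pattern on the Russian side: stop words are now plain Unicode strings, and the no-arg constructor simply delegates to the built-in list. A usage sketch assuming the 2.9-era API in this diff (the example class, field name, stop-word list and sample phrase are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class RussianAnalyzerExample {
    public static void main(String[] args) throws IOException {
        // The default constructor now delegates to this(RUSSIAN_STOP_WORDS);
        // custom Unicode stop words can be supplied the same way.
        RussianAnalyzer analyzer = new RussianAnalyzer(new String[] { "и", "в", "на" });

        TokenStream ts = analyzer.tokenStream("body", new StringReader("Сила и знание"));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());   // lowercased, stemmed tokens; "и" is dropped
        }
    }
}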
RussianCharsets.java (entire file deleted)
@@ -1,314 +0,0 @@
package org.apache.lucene.analysis.ru;

[Apache License 2.0 file header]

/**
 * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
 * for Russian characters in Unicode, KOI8 and CP1251.
 * <p>
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to the toLowerCase() method for that charset.
 * </p>
 * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset (lowercase letters only)
    public static char[] UnicodeRussian = {
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F',
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    // KOI8 charset
    public static char[] KOI8 = {
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1,
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    // CP1251 charset
    public static char[] CP1251 = {
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F') return letter;
            if (letter >= '\u0410' && letter <= '\u042F') return (char) (letter + 32);
        }

        if (charset == KOI8)
        {
            if (letter >= 0xe0 && letter <= 0xff) return (char) (letter - 32);
            if (letter >= 0xc0 && letter <= 0xdf) return letter;
        }

        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF) return (char) (letter + 32);
            if (letter >= 0xE0 && letter <= 0xFF) return letter;
        }

        return Character.toLowerCase(letter);
    }
}
RussianLetterTokenizer.java
@@ -25,49 +25,26 @@ import org.apache.lucene.util.AttributeSource;

 /**
  * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
- * by additionally looking up letters in a given "russian charset".
- * <p>
- * The problem with
- * {@link LetterTokenizer} is that it uses {@link Character#isLetter(char)} method,
- * which doesn't know how to detect letters in encodings like CP1252 and KOI8
- * (well-known problems with 0xD7 and 0xF7 chars)
- * </p>
+ * by also allowing the basic latin digits 0-9.
  *
  * @version $Id$
  */
 public class RussianLetterTokenizer extends CharTokenizer
 {
-    /**
-     * Charset this tokenizer uses.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
-    /**
-     * @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
-     */
-    public RussianLetterTokenizer(Reader in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-    }
-
     public RussianLetterTokenizer(Reader in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
     }

     public RussianLetterTokenizer(AttributeSource source, Reader in)
     {
         super(source, in);
-        this.charset = RussianCharsets.UnicodeRussian;
     }

     public RussianLetterTokenizer(AttributeFactory factory, Reader in)
     {
         super(factory, in);
-        this.charset = RussianCharsets.UnicodeRussian;
     }

@@ -76,14 +53,9 @@ public class RussianLetterTokenizer extends CharTokenizer
      */
     protected boolean isTokenChar(char c)
     {
-        /* in the next release, this can be implemented as isLetter(c) or [0-9] */
-        if (Character.isLetter(c))
+        if (Character.isLetter(c) || (c >= '0' && c <= '9'))
             return true;
-        for (int i = 0; i < charset.length; i++)
-        {
-            if (c == charset[i])
-                return true;
-        }
-        return false;
+        else
+            return false;
     }
 }
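The new isTokenChar() rule is small enough to restate on its own: a character continues a token iff it is a Unicode letter or a basic Latin digit. A sketch of that predicate outside the tokenizer (the wrapper class is illustrative, not part of the commit):

public class TokenCharRule {
    // Mirrors the new RussianLetterTokenizer.isTokenChar(): any Unicode letter
    // or a basic Latin digit continues a token; everything else is a break.
    static boolean isTokenChar(char c) {
        return Character.isLetter(c) || (c >= '0' && c <= '9');
    }

    public static void main(String[] args) {
        System.out.println(isTokenChar('ж'));  // true  - Cyrillic letter
        System.out.println(isTokenChar('7'));  // true  - digits are kept inside tokens
        System.out.println(isTokenChar('-'));  // false - punctuation splits tokens
    }
}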
RussianLowerCaseFilter.java
@@ -19,38 +19,26 @@ package org.apache.lucene.analysis.ru;

 import java.io.IOException;

+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Normalizes token text to lower case, analyzing given ("russian") charset.
- *
+ * Normalizes token text to lower case.
+ * @deprecated Use {@link LowerCaseFilter} instead, which has the same
+ * functionality. This filter will be removed in Lucene 3.1
  *
  * @version $Id$
  */
 public final class RussianLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
-     */
-    public RussianLowerCaseFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public RussianLowerCaseFilter(TokenStream in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
+        termAtt = addAttribute(TermAttribute.class);
     }

     public final boolean incrementToken() throws IOException

@@ -60,7 +48,7 @@ public final class RussianLowerCaseFilter extends TokenFilter
             int chLen = termAtt.termLength();
             for (int i = 0; i < chLen; i++)
             {
-                chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
+                chArray[i] = Character.toLowerCase(chArray[i]);
             }
             return true;
         } else {
RussianStemFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

@@ -28,8 +29,8 @@ import java.io.IOException;
  * A {@link TokenFilter} that stems Russian words.
  * <p>
  * The implementation was inspired by GermanStemFilter.
- * The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter,
- * because RussianStemFilter only works with lowercase part of any "russian" charset.
+ * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter,
+ * because RussianStemFilter only works with lowercase characters.
  * </p>
  *
  * @version $Id$

@@ -43,19 +44,11 @@ public final class RussianStemFilter extends TokenFilter

     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
-     */
-    public RussianStemFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        stemmer = new RussianStemmer(charset);
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public RussianStemFilter(TokenStream in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
+        stemmer = new RussianStemmer();
+        termAtt = addAttribute(TermAttribute.class);
     }
     /**
      * Returns the next token in the stream, or null at EOS
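As the updated javadoc notes, RussianStemFilter now expects a plain LowerCaseFilter in front of it. A hand-built chain mirroring what RussianAnalyzer.tokenStream() wires up, minus stop-word removal (the example class and sample text are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
import org.apache.lucene.analysis.ru.RussianStemFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class RussianStemChain {
    public static void main(String[] args) throws IOException {
        TokenStream ts = new RussianLetterTokenizer(new StringReader("новые технологии"));
        ts = new LowerCaseFilter(ts);      // replaces the deprecated RussianLowerCaseFilter
        ts = new RussianStemFilter(ts);    // the stemmer now works on Unicode directly

        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());
        }
    }
}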
RussianStemmer.java
@@ -25,47 +25,42 @@ package org.apache.lucene.analysis.ru;
  */
 class RussianStemmer
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     // positions of RV, R1 and R2 respectively
     private int RV, R1, R2;

     // letters (currently unused letters are commented out)
-    private final static char A = 0,  V = 2,  G = 3,  E = 5,  I = 8,  I_ = 9, L = 11, M = 12;
-    private final static char N = 13, O = 14, S = 17, T = 18, U = 19, X = 21, SH = 24, SHCH = 25;
-    private final static char Y = 27, SOFT = 28, AE = 29, IU = 30, IA = 31;
-    // unused: B = 1, D = 4, ZH = 6, Z = 7, K = 10, P = 15, R = 16, F = 20, TS = 22, CH = 23, HARD = 26
+    private final static char A = '\u0430', V = '\u0432', G = '\u0433', E = '\u0435', I = '\u0438', I_ = '\u0439';
+    private final static char L = '\u043B', M = '\u043C', N = '\u043D', O = '\u043E', S = '\u0441', T = '\u0442';
+    private final static char U = '\u0443', X = '\u0445', SH = '\u0448', SHCH = '\u0449', Y = '\u044B', SOFT = '\u044C';
+    private final static char AE = '\u044D', IU = '\u044E', IA = '\u044F';
+    // unused: B = '\u0431', D = '\u0434', ZH = '\u0436', Z = '\u0437', K = '\u043A', P = '\u043F',
+    //         R = '\u0440', F = '\u0444', TS = '\u0446', CH = '\u0447', HARD = '\u044A'

     // stem definitions
     private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

@@ -256,16 +251,6 @@ class RussianStemmer
         super();
     }

-    /**
-     * RussianStemmer constructor comment.
-     * @deprecated Use {@link #RussianStemmer()} instead.
-     */
-    public RussianStemmer(char[] charset)
-    {
-        super();
-        this.charset = charset;
-    }
-
     /**
      * Adjectival ending is an adjective ending,
      * optionally preceded by participle ending.

@@ -333,7 +318,7 @@ class RussianStemmer
         int stemmingIndex = startIndex;
         for (int j = theEnding.length - 1; j >= 0; j--)
         {
-            if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
+            if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
             {
                 match = false;
                 break;

@@ -451,7 +436,7 @@ class RussianStemmer
     {
         for (int i = 0; i < vowels.length; i++)
         {
-            if (letter == charset[vowels[i]])
+            if (letter == vowels[i])
                 return true;
         }
         return false;

@@ -499,7 +484,7 @@ class RussianStemmer
     private boolean removeI(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == I)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;

@@ -518,7 +503,7 @@ class RussianStemmer
     private boolean removeSoft(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;

@@ -529,17 +514,6 @@ class RussianStemmer
         }
     }

-    /**
-     * Insert the method's description here.
-     * Creation date: (16/03/2002 10:58:42 PM)
-     * @param newCharset char[]
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    public void setCharset(char[] newCharset)
-    {
-        charset = newCharset;
-    }
-
     /**
      * Finds the stem for given Russian word.
      * Creation date: (16/03/2002 3:36:48 PM)

@@ -622,25 +596,13 @@ class RussianStemmer
                 verb1Predessors)
             || findAndRemoveEnding(stemmingZone, verbEndings2);
     }

-    /**
-     * Static method for stemming with different charsets
-     * @deprecated Use {@link #stemWord(String)} instead.
-     */
-    public static String stem(String theWord, char[] charset)
-    {
-        RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(charset);
-        return stemmer.stem(theWord);
-    }
-
     /**
      * Static method for stemming.
      */
     public static String stemWord(String theWord)
     {
         RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(RussianCharsets.UnicodeRussian);
         return stemmer.stem(theWord);
     }
 }
TestRussianAnalyzer.java
@@ -42,14 +42,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase

     private InputStreamReader sampleUnicode;

-    private Reader inWordsKOI8;
-
-    private Reader sampleKOI8;
-
-    private Reader inWords1251;
-
-    private Reader sample1251;
-
     private File dataDir;

     protected void setUp() throws Exception

@@ -97,76 +89,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
         inWords.close();
         sampleUnicode.close();
     }

-    public void testKOI8() throws IOException
-    {
-        //System.out.println(new java.util.Date());
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
-        // KOI8
-        inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
-        sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
-
-        TokenStream in = ra.tokenStream("all", inWordsKOI8);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-            if (in.incrementToken() == false)
-                break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "KOI8",
-                text.term(),
-                nextSampleToken == false ? null : sampleText.term());
-        }
-        inWordsKOI8.close();
-        sampleKOI8.close();
-    }
-
-    public void test1251() throws IOException
-    {
-        // 1251
-        inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
-        sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
-
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
-        TokenStream in = ra.tokenStream("", inWords1251);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-            if (in.incrementToken() == false)
-                break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "1251",
-                text.term(),
-                nextSampleToken == false ? null : sampleText.term());
-        }
-
-        inWords1251.close();
-        sample1251.close();
-    }
-
     public void testDigitsInRussianCharset()
     {
TestRussianStem.java
@@ -84,9 +84,8 @@ public class TestRussianStem extends LuceneTestCase
         {
             //if ( (i % 100) == 0 ) System.err.println(i);
             String realStem =
-                RussianStemmer.stem(
-                    (String) words.get(i),
-                    RussianCharsets.UnicodeRussian);
+                RussianStemmer.stemWord(
+                    (String) words.get(i));
             assertEquals("unicode", stems.get(i), realStem);
         }
     }
Deleted Russian test data files:
@@ -1 +0,0 @@
-[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]
@@ -1 +0,0 @@
-[deleted expected-output file stored in a legacy Russian encoding (KOI8/CP1251); its bytes do not render as readable text]
@@ -1,2 +0,0 @@
-Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
-узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.
@@ -1,2 +0,0 @@
-Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
-узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.