- Russian Analyzer, by Boris Okner. Initial checkin.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149840 13f79535-47bb-0310-9956-ffa450edef68
2025-02-20 17:07:09 +00:00 · 2002-09-16 02:51:58 +00:00 · 2002-09-16 02:51:58 +00:00 · e63750554f
commit e63750554f
parent e79dfcfaa1
6 changed files with 1772 additions and 0 deletions
--- a/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@ -0,0 +1,291 @@
+package org.apache.lucene.analysis.ru;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+import java.io.File;
+import java.io.Reader;
+import java.util.Hashtable;
+
+/**
+ * Analyzer for the Russian language. Supports an external list of stopwords
+ * (words that will not be indexed at all); a default set of stopwords is used
+ * unless an alternative list is specified.
+ * <p>
+ * All processing is driven by a "charset" table: an array whose first 32
+ * entries are the lowercase Russian letters in the encoding of the analyzed
+ * text. Predefined tables can be taken from the RussianCharsets class.
+ *
+ * @author    Boris Okner
+ * @version $Id$
+ */
+public final class RussianAnalyzer extends Analyzer
+{
+    // Letter codes. Each constant is a position in a charset table, not a
+    // character value; makeStopWords() turns codes into real characters by
+    // looking them up in the charset supplied to the constructor.
+    private static final char A = 0;
+    private static final char B = 1;
+    private static final char V = 2;
+    private static final char G = 3;
+    private static final char D = 4;
+    private static final char E = 5;
+    private static final char ZH = 6;
+    private static final char Z = 7;
+    private static final char I = 8;
+    private static final char I_ = 9;
+    private static final char K = 10;
+    private static final char L = 11;
+    private static final char M = 12;
+    private static final char N = 13;
+    private static final char O = 14;
+    private static final char P = 15;
+    private static final char R = 16;
+    private static final char S = 17;
+    private static final char T = 18;
+    private static final char U = 19;
+    private static final char F = 20;
+    private static final char X = 21;
+    private static final char TS = 22;
+    private static final char CH = 23;
+    private static final char SH = 24;
+    private static final char SHCH = 25;
+    private static final char HARD = 26;
+    private static final char Y = 27;
+    private static final char SOFT = 28;
+    private static final char AE = 29;
+    private static final char IU = 30;
+    private static final char IA = 31;
+
+    /**
+     * List of typical Russian stopwords, spelled with the letter codes above
+     * so that the same list can be rendered in any charset.
+     */
+    private static final char[][] RUSSIAN_STOP_WORDS = {
+        {A},
+        {B, E, Z},
+        {B, O, L, E, E},
+        {B, Y},
+        {B, Y, L},
+        {B, Y, L, A},
+        {B, Y, L, I},
+        {B, Y, L, O},
+        {B, Y, T, SOFT},
+        {V},
+        {V, A, M},
+        {V, A, S},
+        {V, E, S, SOFT},
+        {V, O},
+        {V, O, T},
+        {V, S, E},
+        {V, S, E, G, O},
+        {V, S, E, X},
+        {V, Y},
+        {G, D, E},
+        {D, A},
+        {D, A, ZH, E},
+        {D, L, IA},
+        {D, O},
+        {E, G, O},
+        {E, E},
+        {E, I_},
+        {E, IU},
+        {E, S, L, I},
+        {E, S, T, SOFT},
+        {E, SHCH, E},
+        {ZH, E},
+        {Z, A},
+        {Z, D, E, S, SOFT},
+        {I},
+        {I, Z},
+        {I, L, I},
+        {I, M},
+        {I, X},
+        {K},
+        {K, A, K},
+        {K, O},
+        {K, O, G, D, A},
+        {K, T, O},
+        {L, I},
+        {L, I, B, O},
+        {M, N, E},
+        {M, O, ZH, E, T},
+        {M, Y},
+        {N, A},
+        {N, A, D, O},
+        {N, A, SH},
+        {N, E},
+        {N, E, G, O},
+        {N, E, E},
+        {N, E, T},
+        {N, I},
+        {N, I, X},
+        {N, O},
+        {N, U},
+        {O},
+        {O, B},
+        {O, D, N, A, K, O},
+        {O, N},
+        {O, N, A},
+        {O, N, I},
+        {O, N, O},
+        {O, T},
+        {O, CH, E, N, SOFT},
+        {P, O},
+        {P, O, D},
+        {P, R, I},
+        {S},
+        {S, O},
+        {T, A, K},
+        {T, A, K, ZH, E},
+        {T, A, K, O, I_},
+        {T, A, M},
+        {T, E},
+        {T, E, M},
+        {T, O},
+        {T, O, G, O},
+        {T, O, ZH, E},
+        {T, O, I_},
+        {T, O, L, SOFT, K, O},
+        {T, O, M},
+        {T, Y},
+        {U},
+        {U, ZH, E},
+        {X, O, T, IA},
+        {CH, E, G, O},
+        {CH, E, I_},
+        {CH, E, M},
+        {CH, T, O},
+        {CH, T, O, B, Y},
+        {CH, SOFT, E},
+        {CH, SOFT, IA},
+        {AE, T, A},
+        {AE, T, I},
+        {AE, T, O},
+        {IA}
+    };
+
+    /**
+     * Contains the stopwords used with the StopFilter.
+     * Assigned exactly once, by whichever constructor is used.
+     */
+    private final Hashtable stoptable;
+
+    /**
+     * Charset for Russian letters.
+     * Represents encoding for 32 lowercase Russian letters.
+     * Predefined charsets can be taken from RussianCharSets class
+     */
+    private final char[] charset;
+
+    /**
+     * Builds an analyzer that uses the default stopword list, rendered in
+     * the given charset.
+     *
+     * @param charset table of Russian letters; must provide at least the 32
+     *                lowercase positions used by the default stopword list
+     */
+    public RussianAnalyzer(char[] charset)
+    {
+        this.charset = charset;
+        this.stoptable = StopFilter.makeStopTable(makeStopWords(charset));
+    }
+
+    /**
+     * Builds an analyzer with the given stop words.
+     *
+     * @param charset   table of Russian letters
+     * @param stopwords stop words, already spelled in the given charset
+     */
+    public RussianAnalyzer(char[] charset, String[] stopwords)
+    {
+        this.charset = charset;
+        this.stoptable = StopFilter.makeStopTable(stopwords);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words.
+     *
+     * @param charset   table of Russian letters
+     * @param stopwords ready-made stop table; it is used as is, not copied
+     */
+    public RussianAnalyzer(char[] charset, Hashtable stopwords)
+    {
+        this.charset = charset;
+        this.stoptable = stopwords;
+    }
+
+    // Takes russian stop words and translates them to a String array, using
+    // the given charset: every letter code of every stop word is replaced by
+    // the character stored at that position of the charset.
+    private static String[] makeStopWords(char[] charset)
+    {
+        String[] res = new String[RUSSIAN_STOP_WORDS.length];
+        for (int i = 0; i < res.length; i++)
+        {
+            char[] codes = RUSSIAN_STOP_WORDS[i];
+            char[] translated = new char[codes.length];
+            for (int j = 0; j < codes.length; j++)
+            {
+                translated[j] = charset[codes[j]];
+            }
+            res[i] = new String(translated);
+        }
+        return res;
+    }
+
+    /**
+     * Creates a TokenStream which tokenizes all the text in the provided Reader.
+     *
+     * @return  A TokenStream build from a RussianLetterTokenizer filtered with
+     *                  RussianLowerCaseFilter, StopFilter, and RussianStemFilter
+     */
+    public final TokenStream tokenStream(String fieldName, Reader reader)
+    {
+        // Order matters: the stop table and the stemmer both expect
+        // lowercase input, so lowercasing has to come first.
+        TokenStream result = new RussianLetterTokenizer(reader, charset);
+        result = new RussianLowerCaseFilter(result, charset);
+        result = new StopFilter(result, stoptable);
+        result = new RussianStemFilter(result, charset);
+        return result;
+    }
+}
--- a/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
@ -0,0 +1,317 @@
+package org.apache.lucene.analysis.ru;
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+/**
+ * RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
+ * for russian characters in Unicode, KOI8 and CP1251.
+ * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
+ * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
+ * and adding logic to toLowerCase() method for that charset.
+ *
+ * @author Boris Okner
+ * @version $Id$
+ */
+public class RussianCharsets
+{
+    // Unicode Russian charset
+    public static char[] UnicodeRussian = {
+        // lower case
+        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
+        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
+        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
+        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
+        // upper case
+        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
+        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
+        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
+        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
+    };
+
+    // KOI8 charset
+    public static char[] KOI8 = {
+        // lower case
+        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
+        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
+        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
+        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
+        // upper case
+        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
+        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
+        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
+        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
+    };
+
+    // CP1251 charset
+    public static char[] CP1251 = {
+        // lower case
+        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
+        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
+        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
+        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
+        // upper case
+        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
+        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
+        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
+        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
+    };
+
+    /**
+     * Converts a character to lower case according to the given charset.
+     * <p>
+     * The charset is recognized either because it is one of the predefined
+     * tables of this class or because it has exactly the same contents as
+     * one of them, so a copy of a predefined table is handled like the
+     * original. Letters inside the Russian range of the recognized charset
+     * are mapped by a fixed offset; everything else, including any letter
+     * of an unrecognized (or null) charset, is delegated to
+     * {@link Character#toLowerCase(char)}.
+     *
+     * @param letter  the character to convert
+     * @param charset the charset table the character belongs to
+     * @return the lowercase form of the character
+     */
+    public static char toLowerCase(char letter, char[] charset)
+    {
+        // Arrays.equals is true for the very same array and false for
+        // null, so the predefined tables themselves still match.
+        if (java.util.Arrays.equals(charset, UnicodeRussian))
+        {
+            if (letter >= '\u0430' && letter <= '\u044F')
+            {
+                return letter;
+            }
+            if (letter >= '\u0410' && letter <= '\u042F')
+            {
+                return (char) (letter + 32);
+            }
+        }
+        else if (java.util.Arrays.equals(charset, KOI8))
+        {
+            // KOI8 keeps the upper case letters ABOVE the lower case ones.
+            if (letter >= 0xe0 && letter <= 0xff)
+            {
+                return (char) (letter - 32);
+            }
+            if (letter >= 0xc0 && letter <= 0xdf)
+            {
+                return letter;
+            }
+        }
+        else if (java.util.Arrays.equals(charset, CP1251))
+        {
+            if (letter >= 0xC0 && letter <= 0xDF)
+            {
+                return (char) (letter + 32);
+            }
+            if (letter >= 0xE0 && letter <= 0xFF)
+            {
+                return letter;
+            }
+        }
+
+        return Character.toLowerCase(letter);
+    }
+}
--- a/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
@ -0,0 +1,95 @@
+package org.apache.lucene.analysis.ru;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.Reader;
+import org.apache.lucene.analysis.CharTokenizer;
+
+/**
+ * A RussianLetterTokenizer is a tokenizer that works like LetterTokenizer but additionally looks up
+ * characters in a given "russian charset". The problem with LetterTokenizer is that it relies on the
+ * Character.isLetter() method alone, which doesn't know how to detect letters in encodings like
+ * CP1251 and KOI8 (well-known problems with 0xD7 and 0xF7 chars).
+ *
+ * @version $Id$
+ */
+
+public class RussianLetterTokenizer extends CharTokenizer
+{
+    /** Table of characters that are treated as letters in addition to Character.isLetter(). */
+    private char[] charset;
+
+    /**
+     * Construct a new RussianLetterTokenizer.
+     *
+     * @param in      the text to tokenize
+     * @param charset characters that must be accepted as part of a token
+     */
+    public RussianLetterTokenizer(Reader in, char[] charset)
+    {
+        super(in);
+        this.charset = charset;
+    }
+
+    /**
+     * Collects characters which either satisfy
+     * {@link Character#isLetter(char)} or are listed in the charset
+     * given to the constructor.
+     */
+    protected boolean isTokenChar(char c)
+    {
+        boolean accepted = Character.isLetter(c);
+        int pos = 0;
+        // The charset is only consulted when the generic letter test fails.
+        while (!accepted && pos < charset.length)
+        {
+            accepted = (charset[pos] == c);
+            pos++;
+        }
+        return accepted;
+    }
+}
--- a/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
@ -0,0 +1,97 @@
+package org.apache.lucene.analysis.ru;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Normalizes token text to lower case, analyzing given ("russian") charset.
+ * Every character of a token is converted with
+ * RussianCharsets.toLowerCase(char, char[]); offsets and the token type are
+ * carried over from the incoming token.
+ *
+ * @version $Id$
+ */
+public final class RussianLowerCaseFilter extends TokenFilter
+{
+    // Charset table handed to RussianCharsets.toLowerCase() for every character.
+    char[] charset;
+
+    /**
+     * @param in      the token stream to lowercase
+     * @param charset the charset table the token text is encoded in
+     */
+    public RussianLowerCaseFilter(TokenStream in, char[] charset)
+    {
+        input = in;
+        this.charset = charset;
+    }
+
+    /**
+     * @return the next token with its text lowercased, or null at EOS
+     */
+    public final Token next() throws java.io.IOException
+    {
+        Token t = input.next();
+
+        if (t == null)
+            return null;
+
+        char[] chArray = t.termText().toCharArray();
+        for (int i = 0; i < chArray.length; i++)
+        {
+            chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
+        }
+
+        // Build the replacement token with the incoming token's type as
+        // well as its offsets, the same way RussianStemFilter does, so the
+        // type assigned upstream is not reset to the default.
+        return new Token(new String(chArray), t.startOffset(), t.endOffset(),
+            t.type());
+    }
+}
--- a/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
@ -0,0 +1,115 @@
+package org.apache.lucene.analysis.ru;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import java.io.IOException;
+import java.util.Hashtable;
+
+/**
+ * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
+ * The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
+ * because RussianStemFilter only works with the lowercase part of any "russian" charset.
+ * @author    Boris Okner
+ * @version   $Id$
+ */
+public final class RussianStemFilter extends TokenFilter
+{
+    /**
+     * The token most recently read from the input stream.
+     */
+    private Token token;
+    private RussianStemmer stemmer;
+
+    /**
+     * @param in      the token stream to stem
+     * @param charset the charset table passed on to the RussianStemmer
+     */
+    public RussianStemFilter(TokenStream in, char[] charset)
+    {
+        stemmer = new RussianStemmer(charset);
+        input = in;
+    }
+
+    /**
+     * @return  Returns the next token in the stream, or null at EOS
+     */
+    public final Token next() throws IOException
+    {
+        token = input.next();
+        if (token == null)
+        {
+            return null;
+        }
+
+        String original = token.termText();
+        String stemmed = stemmer.stem(original);
+        if (stemmed.equals(original))
+        {
+            // Nothing was stripped: hand the incoming token through untouched.
+            return token;
+        }
+        return new Token(stemmed, token.startOffset(), token.endOffset(),
+            token.type());
+    }
+
+    /**
+     * Set an alternative/custom RussianStemmer for this filter.
+     * A null argument is ignored and the current stemmer is kept.
+     */
+    public void setStemmer(RussianStemmer stemmer)
+    {
+        if (stemmer == null)
+        {
+            return;
+        }
+        this.stemmer = stemmer;
+    }
+}
--- a/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
+++ b/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
@ -0,0 +1,857 @@
+package org.apache.lucene.analysis.ru;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+/**
+ * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for a detailed description).
+ * Creation date: (12/02/2002 10:34:15 PM)
+ * @author Boris Okner
+ * @version $Id$
+ */
+class RussianStemmer
+{
+    private char[] charset;
+
+    // positions of RV, R1 and R2 respectively
+    private int RV, R1, R2;
+
+    // letters
+    private static char A = 0;
+    private static char B = 1;
+    private static char V = 2;
+    private static char G = 3;
+    private static char D = 4;
+    private static char E = 5;
+    private static char ZH = 6;
+    private static char Z = 7;
+    private static char I = 8;
+    private static char I_ = 9;
+    private static char K = 10;
+    private static char L = 11;
+    private static char M = 12;
+    private static char N = 13;
+    private static char O = 14;
+    private static char P = 15;
+    private static char R = 16;
+    private static char S = 17;
+    private static char T = 18;
+    private static char U = 19;
+    private static char F = 20;
+    private static char X = 21;
+    private static char TS = 22;
+    private static char CH = 23;
+    private static char SH = 24;
+    private static char SHCH = 25;
+    private static char HARD = 26;
+    private static char Y = 27;
+    private static char SOFT = 28;
+    private static char AE = 29;
+    private static char IU = 30;
+    private static char IA = 31;
+
+    // stem definitions
+    //
+    // Each table lists endings as sequences of the letter constants.
+    // findEnding() walks a table from its last row to its first and returns
+    // on the first row that matches, so a longer ending has to be listed
+    // after any shorter ending it ends with; otherwise the shorter one wins.
+    private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
+
+    // perfective gerund endings that only count when preceded by one of
+    // perfectiveGerund1Predessors
+    private static char[][] perfectiveGerundEndings1 = {
+        { V },
+        { V, SH, I },
+        { V, SH, I, S, SOFT }
+    };
+
+    private static char[][] perfectiveGerund1Predessors = {
+        { A },
+        { IA }
+    };
+
+    // perfective gerund endings that need no particular predecessor
+    private static char[][] perfectiveGerundEndings2 = {
+        { I, V },
+        { Y, V },
+        { I, V, SH, I },
+        { Y, V, SH, I },
+        { I, V, SH, I, S, SOFT },
+        { Y, V, SH, I, S, SOFT }
+    };
+
+    private static char[][] adjectiveEndings = {
+        { E, E },
+        { I, E },
+        { Y, E },
+        { O, E },
+        { E, I_ },
+        { I, I_ },
+        { Y, I_ },
+        { O, I_ },
+        { E, M },
+        { I, M },
+        { Y, M },
+        { O, M },
+        { I, X },
+        { Y, X },
+        { U, IU },
+        { IU, IU },
+        { A, IA },
+        { IA, IA },
+        { O, IU },
+        { E, IU },
+        { I, M, I },
+        { Y, M, I },
+        { E, G, O },
+        { O, G, O },
+        { E, M, U },
+        { O, M, U }
+    };
+
+    // participle endings that only count when preceded by one of
+    // participle1Predessors
+    private static char[][] participleEndings1 = {
+        { SHCH },
+        { E, M },
+        { N, N },
+        { V, SH },
+        { IU, SHCH }
+    };
+
+    // participle endings that need no particular predecessor
+    private static char[][] participleEndings2 = {
+        { I, V, SH },
+        { Y, V, SH },
+        { U, IU, SHCH }
+    };
+
+    private static char[][] participle1Predessors = {
+        { A },
+        { IA }
+    };
+
+    private static char[][] reflexiveEndings = {
+        { S, IA },
+        { S, SOFT }
+    };
+
+    // verb endings that only count when preceded by one of verb1Predessors
+    private static char[][] verbEndings1 = {
+        { I_ },
+        { L },
+        { N },
+        { L, O },
+        { N, O },
+        { E, T },
+        { IU, T },
+        { L, A },
+        { N, A },
+        { L, I },
+        { E, M },
+        { N, Y },
+        { E, T, E },
+        { I_, T, E },
+        { T, SOFT },
+        { E, SH, SOFT },
+        { N, N, O }
+    };
+
+    // verb endings that need no particular predecessor
+    private static char[][] verbEndings2 = {
+        { IU },
+        { U, IU },
+        { E, N },
+        { E, I_ },
+        { IA, T },
+        { U, I_ },
+        { I, L },
+        { Y, L },
+        { I, M },
+        { Y, M },
+        { I, T },
+        { Y, T },
+        { I, L, A },
+        { Y, L, A },
+        { E, N, A },
+        { I, T, E },
+        { I, L, I },
+        { Y, L, I },
+        { I, L, O },
+        { Y, L, O },
+        { E, N, O },
+        { U, E, T },
+        { U, IU, T },
+        { E, N, Y },
+        { I, T, SOFT },
+        { Y, T, SOFT },
+        { I, SH, SOFT },
+        { E, I_, T, E },
+        { U, I_, T, E }
+    };
+
+    private static char[][] verb1Predessors = {
+        { A },
+        { IA }
+    };
+
+    // the single-letter rows cover A, IU, I_, O, U, E, Y, I, SOFT and IA;
+    // IU and U are distinct letters and each appears exactly once
+    private static char[][] nounEndings = {
+        { A },
+        { IU },
+        { I_ },
+        { O },
+        { U },
+        { E },
+        { Y },
+        { I },
+        { SOFT },
+        { IA },
+        { E, V },
+        { O, V },
+        { I, E },
+        { SOFT, E },
+        { IA, X },
+        { I, IU },
+        { E, I },
+        { I, I },
+        { E, I_ },
+        { O, I_ },
+        { E, M },
+        { A, M },
+        { O, M },
+        { A, X },
+        { SOFT, IU },
+        { I, IA },
+        { SOFT, IA },
+        { I, I_ },
+        { IA, M },
+        { IA, M, I },
+        { A, M, I },
+        { I, E, I_ },
+        { I, IA, M },
+        { I, E, M },
+        { I, IA, X },
+        { I, IA, M, I }
+    };
+
+    private static char[][] superlativeEndings = {
+        { E, I_, SH },
+        { E, I_, SH, E }
+    };
+
+    private static char[][] derivationalEndings = {
+        { O, S, T },
+        { O, S, T, SOFT }
+    };
+
+    /**
+     * Creates a stemmer without a letter table. The charset stays null until
+     * setCharset(char[]) is called.
+     */
+    public RussianStemmer()
+    {
+    }
+
+    /**
+     * Creates a stemmer that uses the given letter table.
+     * @param charset table mapping the letter constants to actual characters
+     */
+    public RussianStemmer(char[] charset)
+    {
+        this.charset = charset;
+    }
+
+    /**
+     * Removes an adjectival ending: an adjective ending, optionally preceded
+     * by a participle ending.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the part of the word being stemmed; shortened in place
+     * @return true if an adjective ending was found and removed
+     */
+    private boolean adjectival(StringBuffer stemmingZone)
+    {
+        if (findAndRemoveEnding(stemmingZone, adjectiveEndings))
+        {
+            // The adjective ending is gone; a participle ending in front of it
+            // is optional, so the outcome of this second removal is ignored.
+            if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
+            {
+                findAndRemoveEnding(stemmingZone, participleEndings2);
+            }
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Removes a derivational ending, but only when the whole ending lies in R2.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the part of the word starting at RV; shortened in place
+     * @return true if a derivational ending was found inside R2 and removed
+     */
+    private boolean derivational(StringBuffer stemmingZone)
+    {
+        int endingLength = findEnding(stemmingZone, derivationalEndings);
+        if (endingLength == 0)
+        {
+            // no derivational ending found
+            return false;
+        }
+        if (R2 == 0)
+        {
+            // markPositions() leaves R2 at 0 when the word has no R2 region.
+            // Without this check the comparison below would always succeed,
+            // because R2 - RV would be negative.
+            return false;
+        }
+        // R2 is an index into the whole word, while the stemming zone starts
+        // at RV, so R2 - RV is where R2 begins inside the stemming zone.
+        int endingStart = stemmingZone.length() - endingLength;
+        if (R2 - RV > endingStart)
+        {
+            // the ending starts before R2
+            return false;
+        }
+        stemmingZone.setLength(endingStart);
+        return true;
+    }
+
+    /**
+     * Looks for an ending from the given class that finishes exactly at
+     * startIndex in the stemming zone. The class is scanned from its last
+     * row to its first and the first matching row decides the result.
+     * Creation date: (17/03/2002 8:18:34 PM)
+     * @param stemmingZone the text to search in
+     * @param startIndex index of the last character the ending may occupy
+     * @param theEndingClass the endings to try, as sequences of letter constants
+     * @return the length of the ending found, or 0 if none matches
+     */
+    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
+    {
+        int row = theEndingClass.length;
+        while (--row >= 0)
+        {
+            char[] candidate = theEndingClass[row];
+            if (candidate.length - 1 > startIndex)
+            {
+                // the ending does not fit in front of startIndex
+                continue;
+            }
+            // compare back to front, letter by letter
+            int letter = candidate.length - 1;
+            int position = startIndex;
+            while (letter >= 0
+                && stemmingZone.charAt(position) == charset[candidate[letter]])
+            {
+                letter--;
+                position--;
+            }
+            if (letter < 0)
+            {
+                // every letter matched
+                return candidate.length;
+            }
+        }
+        return 0;
+    }
+
+    /**
+     * Looks for an ending from the given class at the very end of the stemming zone.
+     * @param stemmingZone the text to search in
+     * @param theEndingClass the endings to try
+     * @return the length of the ending found, or 0 if none matches
+     */
+    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
+    {
+        int lastIndex = stemmingZone.length() - 1;
+        return findEnding(stemmingZone, lastIndex, theEndingClass);
+    }
+
+    /**
+     * Finds an ending from the given class at the end of the stemming zone
+     * and cuts it off.
+     * Creation date: (17/03/2002 8:18:34 PM)
+     * @param stemmingZone the text to shorten in place
+     * @param theEndingClass the endings to try
+     * @return true if an ending was found and removed
+     */
+    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
+    {
+        int cut = findEnding(stemmingZone, theEndingClass);
+        if (cut != 0)
+        {
+            // cut the ending found
+            stemmingZone.setLength(stemmingZone.length() - cut);
+            return true;
+        }
+        // not found
+        return false;
+    }
+
+    /**
+     * Finds an ending from the given class at the end of the stemming zone and
+     * cuts it off, provided that one of the given predecessors stands directly
+     * in front of it. The predecessor itself is kept.
+     * Creation date: (17/03/2002 8:18:34 PM)
+     * @param stemmingZone the text to shorten in place
+     * @param theEndingClass the endings to try
+     * @param thePredessors the letter sequences allowed right before the ending
+     * @return true if an ending with a valid predecessor was found and removed
+     */
+    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
+        char[][] theEndingClass, char[][] thePredessors)
+    {
+        int endingLength = findEnding(stemmingZone, theEndingClass);
+        if (endingLength == 0)
+        {
+            // not found
+            return false;
+        }
+
+        // length of the zone once the ending is gone; the predecessor has to
+        // finish on the character just before that point
+        int remaining = stemmingZone.length() - endingLength;
+        if (findEnding(stemmingZone, remaining - 1, thePredessors) == 0)
+        {
+            return false;
+        }
+
+        // cut the ending found
+        stemmingZone.setLength(remaining);
+        return true;
+    }
+
+    /**
+     * Marks positions of RV, R1 and R2 in a given word.
+     * Creation date: (16/03/2002 3:40:11 PM)
+     * @return int
+     * @param word java.lang.String
+     */
+    private void markPositions(String word)
+    {
+        RV = 0;
+        R1 = 0;
+        R2 = 0;
+        int i = 0;
+        // find RV
+        while (word.length() > i && !isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // RV zone is empty
+        RV = i;
+        // find R1
+        while (word.length() > i && isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // R1 zone is empty
+        R1 = i;
+        // find R2
+        while (word.length() > i && !isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // R2 zone is empty
+        while (word.length() > i && isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // R2 zone is empty
+        R2 = i;
+    }
+
+    /**
+     * Checks whether a character is one of the vowels of the current charset.
+     * Creation date: (16/03/2002 10:47:03 PM)
+     * @param letter the character to test
+     * @return true if the character is a vowel
+     */
+    private boolean isVowel(char letter)
+    {
+        boolean found = false;
+        for (int k = 0; !found && k < vowels.length; k++)
+        {
+            found = (letter == charset[vowels[k]]);
+        }
+        return found;
+    }
+
+    /**
+     * Removes a noun ending from the end of the stemming zone.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a noun ending was found and removed
+     */
+    private boolean noun(StringBuffer stemmingZone)
+    {
+        boolean removed = findAndRemoveEnding(stemmingZone, nounEndings);
+        return removed;
+    }
+
+    /**
+     * Removes a perfective gerund ending. Endings of the first group are tried
+     * first and need one of perfectiveGerund1Predessors in front of them; the
+     * second group is tried only if the first one removed nothing.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a perfective gerund ending was found and removed
+     */
+    private boolean perfectiveGerund(StringBuffer stemmingZone)
+    {
+        if (findAndRemoveEnding(stemmingZone,
+            perfectiveGerundEndings1, perfectiveGerund1Predessors))
+        {
+            return true;
+        }
+        return findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
+    }
+
+    /**
+     * Removes a reflexive ending from the end of the stemming zone.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a reflexive ending was found and removed
+     */
+    private boolean reflexive(StringBuffer stemmingZone)
+    {
+        boolean removed = findAndRemoveEnding(stemmingZone, reflexiveEndings);
+        return removed;
+    }
+
+    /**
+     * Removes the last character of the stemming zone if it is the letter I.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a trailing I was removed
+     */
+    private boolean removeI(StringBuffer stemmingZone)
+    {
+        int last = stemmingZone.length() - 1;
+        if (last < 0 || stemmingZone.charAt(last) != charset[I])
+        {
+            return false;
+        }
+        stemmingZone.setLength(last);
+        return true;
+    }
+
+    /**
+     * Removes the last character of the stemming zone if it is the letter SOFT.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a trailing SOFT was removed
+     */
+    private boolean removeSoft(StringBuffer stemmingZone)
+    {
+        int last = stemmingZone.length() - 1;
+        if (last < 0 || stemmingZone.charAt(last) != charset[SOFT])
+        {
+            return false;
+        }
+        stemmingZone.setLength(last);
+        return true;
+    }
+
+    /**
+     * Replaces the letter table used by this stemmer.
+     * Creation date: (16/03/2002 10:58:42 PM)
+     * @param newCharset table mapping the letter constants to actual characters
+     */
+    public void setCharset(char[] newCharset)
+    {
+        this.charset = newCharset;
+    }
+
+    /**
+     * Sets the ending definitions as in the Russian stemming algorithm by
+     * assigning newly built arrays to the ending tables of this class
+     * (vowels, perfectiveGerundEndings1 ... derivationalEndings).
+     * The method is private and nothing inside RussianStemmer calls it.
+     * The tables are static, so a call would replace them for every
+     * RussianStemmer instance and not only for this one.
+     * Creation date: (16/03/2002 11:16:36 PM)
+     */
+    private void setEndings()
+    {
+        vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };
+
+        perfectiveGerundEndings1 = new char[][] {
+            { V }, { V, SH, I }, { V, SH, I, S, SOFT }
+        };
+
+        perfectiveGerund1Predessors = new char[][] { { A }, { IA }
+        };
+
+        perfectiveGerundEndings2 = new char[][] {
+            { I, V },
+            { Y, V },
+            { I, V, SH, I },
+            { Y, V, SH, I },
+            { I, V, SH, I, S, SOFT },
+            { Y, V, SH, I, S, SOFT }
+        };
+
+        adjectiveEndings = new char[][] {
+            { E, E },
+            { I, E },
+            { Y, E },
+            { O, E },
+            { E, I_ },
+            { I, I_ },
+            { Y, I_ },
+            { O, I_ },
+            { E, M },
+            { I, M },
+            { Y, M },
+            { O, M },
+            { I, X },
+            { Y, X },
+            { U, IU },
+            { IU, IU },
+            { A, IA },
+            { IA, IA },
+            { O, IU },
+            { E, IU },
+            { I, M, I },
+            { Y, M, I },
+            { E, G, O },
+            { O, G, O },
+            { E, M, U },
+            { O, M, U }
+        };
+
+        participleEndings1 = new char[][] {
+            { SHCH },
+            { E, M },
+            { N, N },
+            { V, SH },
+            { IU, SHCH }
+        };
+
+        participleEndings2 = new char[][] {
+            { I, V, SH },
+            { Y, V, SH },
+            { U, IU, SHCH }
+        };
+
+        participle1Predessors = new char[][] {
+            { A },
+            { IA }
+        };
+
+        reflexiveEndings = new char[][] {
+            { S, IA },
+            { S, SOFT }
+        };
+
+        verbEndings1 = new char[][] {
+            { I_ },
+            { L },
+            { N },
+            { L, O },
+            { N, O },
+            { E, T },
+            { IU, T },
+            { L, A },
+            { N, A },
+            { L, I },
+            { E, M },
+            { N, Y },
+            { E, T, E },
+            { I_, T, E },
+            { T, SOFT },
+            { E, SH, SOFT },
+            { N, N, O }
+        };
+
+        verbEndings2 = new char[][] {
+            { IU },
+            { U, IU },
+            { E, N },
+            { E, I_ },
+            { IA, T },
+            { U, I_ },
+            { I, L },
+            { Y, L },
+            { I, M },
+            { Y, M },
+            { I, T },
+            { Y, T },
+            { I, L, A },
+            { Y, L, A },
+            { E, N, A },
+            { I, T, E },
+            { I, L, I },
+            { Y, L, I },
+            { I, L, O },
+            { Y, L, O },
+            { E, N, O },
+            { U, E, T },
+            { U, IU, T },
+            { E, N, Y },
+            { I, T, SOFT },
+            { Y, T, SOFT },
+            { I, SH, SOFT },
+            { E, I_, T, E },
+            { U, I_, T, E }
+        };
+
+        verb1Predessors = new char[][] {
+            { A },
+            { IA }
+        };
+
+        nounEndings = new char[][] {
+            { A },
+            { IU },
+            { I_ },
+            { O },
+            { U },
+            { E },
+            { Y },
+            { I },
+            { SOFT },
+            { IA },
+            { E, V },
+            { O, V },
+            { I, E },
+            { SOFT, E },
+            { IA, X },
+            { I, IU },
+            { E, I },
+            { I, I },
+            { E, I_ },
+            { O, I_ },
+            { E, M },
+            { A, M },
+            { O, M },
+            { A, X },
+            { SOFT, IU },
+            { I, IA },
+            { SOFT, IA },
+            { I, I_ },
+            { IA, M },
+            { IA, M, I },
+            { A, M, I },
+            { I, E, I_ },
+            { I, IA, M },
+            { I, E, M },
+            { I, IA, X },
+            { I, IA, M, I }
+        };
+
+        superlativeEndings = new char[][] {
+            { E, I_, SH },
+            { E, I_, SH, E }
+        };
+
+        derivationalEndings = new char[][] {
+            { O, S, T },
+            { O, S, T, SOFT }
+        };
+    }
+
+    /**
+     * Finds the stem of the given Russian word. Everything up to RV is kept
+     * untouched; endings are only removed from the part that starts at RV.
+     * Creation date: (16/03/2002 3:36:48 PM)
+     * @param input the word to stem
+     * @return the stem, or the input itself when no RV region was detected
+     */
+    public String stem(String input)
+    {
+        markPositions(input);
+        if (RV == 0)
+        {
+            // RV wasn't detected, nothing to stem
+            return input;
+        }
+
+        // stemming goes on in RV
+        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
+
+        // Step 1: a perfective gerund ending, or else an optional reflexive
+        // ending followed by the first of adjectival, verb or noun ending
+        // that can be removed
+        if (!perfectiveGerund(stemmingZone))
+        {
+            reflexive(stemmingZone);
+            if (!adjectival(stemmingZone) && !verb(stemmingZone))
+            {
+                noun(stemmingZone);
+            }
+        }
+
+        // Step 2
+        removeI(stemmingZone);
+
+        // Step 3
+        derivational(stemmingZone);
+
+        // Step 4
+        superlative(stemmingZone);
+        undoubleN(stemmingZone);
+        removeSoft(stemmingZone);
+
+        // put the untouched head of the word back in front of what is left
+        return input.substring(0, RV) + stemmingZone.toString();
+    }
+
+    /**
+     * Removes a superlative ending from the end of the stemming zone.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a superlative ending was found and removed
+     */
+    private boolean superlative(StringBuffer stemmingZone)
+    {
+        boolean removed = findAndRemoveEnding(stemmingZone, superlativeEndings);
+        return removed;
+    }
+
+    /**
+     * Undoubles N: when the stemming zone ends with two letters N, the last
+     * one is dropped.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a trailing double N was reduced to a single N
+     */
+    private boolean undoubleN(StringBuffer stemmingZone)
+    {
+        char[][] doubleN = { { N, N } };
+        if (findEnding(stemmingZone, doubleN) == 0)
+        {
+            return false;
+        }
+        // drop only one of the two letters
+        stemmingZone.setLength(stemmingZone.length() - 1);
+        return true;
+    }
+
+    /**
+     * Removes a verb ending. Endings of the first group are tried first and
+     * need one of verb1Predessors in front of them; the second group is tried
+     * only if the first one removed nothing.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone the text to shorten in place
+     * @return true if a verb ending was found and removed
+     */
+    private boolean verb(StringBuffer stemmingZone)
+    {
+        if (findAndRemoveEnding(stemmingZone, verbEndings1, verb1Predessors))
+        {
+            return true;
+        }
+        return findAndRemoveEnding(stemmingZone, verbEndings2);
+    }
+
+    /**
+     * Stems a single word with the given charset, using a stemmer created
+     * just for this call.
+     * @param theWord the word to stem
+     * @param charset table mapping the letter constants to actual characters
+     * @return the stem of the word
+     */
+    public static String stem(String theWord, char[] charset)
+    {
+        return new RussianStemmer(charset).stem(theWord);
+    }
+}