copy the Russian and German analyzers plus their test cases to the sandbox

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150998 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Daniel Naber 2004-08-16 20:30:46 +00:00
parent 726ddaeb5a
commit 87bcdf6f25
24 changed files with 2398 additions and 0 deletions

View File

@ -0,0 +1,135 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
/**
 * Analyzer for German language. Supports an external list of stopwords (words that
 * will not be indexed at all) and an external list of exclusions (words that will
 * not be stemmed, but indexed).
 * A default set of stopwords is used unless an alternative list is specified, the
 * exclusion list is empty by default.
 *
 * @author Gerhard Schwarz
 * @version $Id$
 */
public class GermanAnalyzer extends Analyzer {

    /**
     * List of typical german stopwords.
     * Declared static final: the default list is an immutable constant and is
     * shared by all analyzer instances instead of being duplicated per instance.
     */
    private static final String[] GERMAN_STOP_WORDS = {
        "einer", "eine", "eines", "einem", "einen",
        "der", "die", "das", "dass", "daß",
        "du", "er", "sie", "es",
        "was", "wer", "wie", "wir",
        "und", "oder", "ohne", "mit",
        "am", "im", "in", "aus", "auf",
        "ist", "sein", "war", "wird",
        "ihr", "ihre", "ihres",
        "als", "für", "von", "mit",
        "dich", "dir", "mich", "mir",
        "mein", "sein", "kein",
        "durch", "wegen", "wird"
    };

    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Set stopSet = new HashSet();

    /**
     * Contains words that should be indexed but not stemmed.
     */
    private Set exclusionSet = new HashSet();

    /**
     * Builds an analyzer that uses the default German stop words.
     */
    public GermanAnalyzer() {
        stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopwords array of words that will not be indexed
     */
    public GermanAnalyzer(String[] stopwords) {
        stopSet = StopFilter.makeStopSet(stopwords);
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopwords table whose keys are used as the stop words
     */
    public GermanAnalyzer(Hashtable stopwords) {
        stopSet = new HashSet(stopwords.keySet());
    }

    /**
     * Builds an analyzer with the stop words read from the given file
     * (one word per line).
     *
     * @param stopwords file containing the stop word list
     * @throws IOException if the file cannot be read
     */
    public GermanAnalyzer(File stopwords) throws IOException {
        stopSet = WordlistLoader.getWordSet(stopwords);
    }

    /**
     * Builds an exclusionlist from an array of Strings.
     */
    public void setStemExclusionTable(String[] exclusionlist) {
        exclusionSet = StopFilter.makeStopSet(exclusionlist);
    }

    /**
     * Builds an exclusionlist from a Hashtable.
     */
    public void setStemExclusionTable(Hashtable exclusionlist) {
        exclusionSet = new HashSet(exclusionlist.keySet());
    }

    /**
     * Builds an exclusionlist from the words contained in the given file
     * (one word per line).
     *
     * @throws IOException if the file cannot be read
     */
    public void setStemExclusionTable(File exclusionlist) throws IOException {
        exclusionSet = WordlistLoader.getWordSet(exclusionlist);
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @return A TokenStream built from a StandardTokenizer filtered with
     *         StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, stopSet);
        result = new GermanStemFilter(result, exclusionSet);
        return result;
    }
}

View File

@ -0,0 +1,119 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
 * A filter that stems German words. It supports a table of words that should
 * not be stemmed at all. The stemmer used can be changed at runtime after the
 * filter object is created (as long as it is a GermanStemmer).
 *
 * @author Gerhard Schwarz
 * @version $Id$
 */
public final class GermanStemFilter extends TokenFilter
{
    /**
     * The most recently read token from the input stream.
     */
    private Token token = null;
    private GermanStemmer stemmer = null;
    private Set exclusionSet = null;

    public GermanStemFilter( TokenStream in )
    {
        super(in);
        stemmer = new GermanStemmer();
    }

    /**
     * Builds a GermanStemFilter that uses an exclusiontable.
     * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
     */
    public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
    {
        this( in );
        exclusionSet = new HashSet(exclusiontable.keySet());
    }

    /**
     * Builds a GermanStemFilter that uses an exclusiontable.
     */
    public GermanStemFilter( TokenStream in, Set exclusionSet )
    {
        this( in );
        this.exclusionSet = exclusionSet;
    }

    /**
     * @return Returns the next token in the stream, or null at EOS
     */
    public final Token next()
        throws IOException
    {
        token = input.next();
        if ( token == null ) {
            return null;
        }
        String text = token.termText();
        // Terms on the exclusion list pass through unstemmed.
        if ( exclusionSet != null && exclusionSet.contains( text ) ) {
            return token;
        }
        String stemmed = stemmer.stem( text );
        // If stemming changed nothing, reuse the input token instead of
        // allocating a new one.
        if ( stemmed.equals( text ) ) {
            return token;
        }
        return new Token( stemmed, token.startOffset(),
            token.endOffset(), token.type() );
    }

    /**
     * Set a alternative/custom GermanStemmer for this filter.
     */
    public void setStemmer( GermanStemmer stemmer )
    {
        if ( stemmer != null ) {
            this.stemmer = stemmer;
        }
    }

    /**
     * Set an alternative exclusion list for this filter.
     * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
     */
    public void setExclusionTable( Hashtable exclusiontable )
    {
        exclusionSet = new HashSet(exclusiontable.keySet());
    }

    /**
     * Set an alternative exclusion list for this filter.
     */
    public void setExclusionSet( Set exclusionSet )
    {
        this.exclusionSet = exclusionSet;
    }
}

View File

@ -0,0 +1,265 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A stemmer for German words. The algorithm is based on the report
 * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
 * Caumanns (joerg.caumanns@isst.fhg.de).
 *
 * @author Gerhard Schwarz
 * @version $Id$
 */
public class GermanStemmer
{
    /**
     * Buffer for the terms while stemming them.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Amount of characters that are removed with <tt>substitute()</tt> while
     * stemming. strip() adds this back to the buffer length when checking its
     * minimum-length restrictions, so substitutions do not make a term look
     * shorter than it really is.
     */
    private int substCount = 0;

    /**
     * Stemms the given term to an unique <tt>discriminator</tt>.
     *
     * @param term The term that should be stemmed.
     * @return Discriminator for <tt>term</tt>
     */
    protected String stem( String term )
    {
        // Use lowercase for medium stemming.
        term = term.toLowerCase();
        if ( !isStemmable( term ) )
            return term;
        // Reset the StringBuffer.
        sb.delete( 0, sb.length() );
        sb.insert( 0, term );
        // Stemming starts here...
        substitute( sb );
        strip( sb );
        optimize( sb );
        resubstitute( sb );
        removeParticleDenotion( sb );
        return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @return true if, and only if, the given term consists in letters.
     */
    private boolean isStemmable( String term )
    {
        for ( int c = 0; c < term.length(); c++ ) {
            if ( !Character.isLetter( term.charAt( c ) ) )
                return false;
        }
        return true;
    }

    /**
     * suffix stripping (stemming) on the current term. The stripping is reduced
     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
     * from which all regular suffixes are build of. The simplification causes
     * some overstemming, and way more irregular stems, but still provides unique.
     * discriminators in the most of those cases.
     * The algorithm is context free, except of the length restrictions.
     */
    private void strip( StringBuffer buffer )
    {
        boolean doMore = true;
        while ( doMore && buffer.length() > 3 ) {
            if ( ( buffer.length() + substCount > 5 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
            {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            // "t" occurs only as suffix of verbs.
            else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else {
                doMore = false;
            }
        }
    }

    /**
     * Does some optimizations on the term. This optimisations are
     * contextual.
     */
    private void optimize( StringBuffer buffer )
    {
        // Additional step for female plurals of professions and inhabitants.
        if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
            buffer.deleteCharAt( buffer.length() -1 );
            strip( buffer );
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
            buffer.setCharAt( buffer.length() - 1, 'x' );
        }
    }

    /**
     * Removes a particle denotion ("ge") from a term.
     */
    private void removeParticleDenotion( StringBuffer buffer )
    {
        if ( buffer.length() > 4 ) {
            for ( int c = 0; c < buffer.length() - 3; c++ ) {
                if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
                    buffer.delete( c, c + 2 );
                    return;
                }
            }
        }
    }

    /**
     * Do some substitutions for the term to reduce overstemming:
     *
     * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
     *   "ß" is substituted by "ss"
     * - Substitute a second char of a pair of equal characters with
     *   an asterisk: ?? -> ?*
     * - Substitute some common character combinations with a token:
     *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
     */
    private void substitute( StringBuffer buffer )
    {
        substCount = 0;
        for ( int c = 0; c < buffer.length(); c++ ) {
            // Replace the second char of a pair of the equal characters with an asterisk
            if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
                buffer.setCharAt( c, '*' );
            }
            // Substitute Umlauts.
            else if ( buffer.charAt( c ) == 'ä' ) {
                buffer.setCharAt( c, 'a' );
            }
            else if ( buffer.charAt( c ) == 'ö' ) {
                buffer.setCharAt( c, 'o' );
            }
            else if ( buffer.charAt( c ) == 'ü' ) {
                buffer.setCharAt( c, 'u' );
            }
            // Fix bug so that 'ß' at the end of a word is replaced.
            else if ( buffer.charAt( c ) == 'ß' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 's' );
                substCount++;
            }
            // Take care that at least one character is left left side from the current one
            if ( c < buffer.length() - 1 ) {
                // Masking several common character combinations with an token
                if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
                    buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
                {
                    buffer.setCharAt( c, '$' );
                    buffer.delete( c + 1, c + 3 );
                    // Bugfix: this was "substCount =+ 2", which *assigned* 2
                    // instead of adding 2 and so lost any count accumulated by
                    // earlier substitutions in the same term.
                    substCount += 2;
                }
                else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
                    buffer.setCharAt( c, '§' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
                    buffer.setCharAt( c, '%' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
                    buffer.setCharAt( c, '&' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
                    buffer.setCharAt( c, '#' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
                    buffer.setCharAt( c, '!' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
            }
        }
    }

    /**
     * Undoes the changes made by substitute(). That are character pairs and
     * character combinations. Umlauts will remain as their corresponding vowel,
     * as "ß" remains as "ss".
     */
    private void resubstitute( StringBuffer buffer )
    {
        for ( int c = 0; c < buffer.length(); c++ ) {
            if ( buffer.charAt( c ) == '*' ) {
                char x = buffer.charAt( c - 1 );
                buffer.setCharAt( c, x );
            }
            else if ( buffer.charAt( c ) == '$' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
            }
            else if ( buffer.charAt( c ) == '§' ) {
                buffer.setCharAt( c, 'c' );
                buffer.insert( c + 1, 'h' );
            }
            else if ( buffer.charAt( c ) == '%' ) {
                buffer.setCharAt( c, 'e' );
                buffer.insert( c + 1, 'i' );
            }
            else if ( buffer.charAt( c ) == '&' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'e' );
            }
            else if ( buffer.charAt( c ) == '#' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'g' );
            }
            else if ( buffer.charAt( c ) == '!' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 't' );
            }
        }
    }
}

View File

@ -0,0 +1,111 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
/**
 * Loader for text files that represent a list of stopwords.
 *
 * @author Gerhard Schwarz
 * @version $Id$
 *
 * @todo this is not specific to German, it should be moved up
 */
public class WordlistLoader {

    /**
     * Loads a text file and adds every line as an entry to a HashSet (omitting
     * leading and trailing whitespace). Every line of the file should contain only
     * one word. The words need to be in lowercase if you make use of an
     * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
     *
     * @param wordfile File containing the wordlist
     * @return A HashSet with the file's words
     */
    public static HashSet getWordSet(File wordfile) throws IOException {
        HashSet words = new HashSet();
        LineNumberReader reader = null;
        try {
            reader = new LineNumberReader(new FileReader(wordfile));
            // One word per line; surrounding whitespace is insignificant.
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                words.add(line.trim());
            }
        }
        finally {
            // Closing the LineNumberReader also closes the wrapped FileReader.
            if (reader != null)
                reader.close();
        }
        return words;
    }

    /**
     * @param path Path to the wordlist
     * @param wordfile Name of the wordlist
     *
     * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
     */
    public static Hashtable getWordtable(String path, String wordfile) throws IOException {
        return getWordtable(new File(path, wordfile));
    }

    /**
     * @param wordfile Complete path to the wordlist
     *
     * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
     */
    public static Hashtable getWordtable(String wordfile) throws IOException {
        return getWordtable(new File(wordfile));
    }

    /**
     * @param wordfile File object that points to the wordlist
     *
     * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
     */
    public static Hashtable getWordtable(File wordfile) throws IOException {
        return makeWordTable(getWordSet(wordfile));
    }

    /**
     * Builds a wordlist table, using words as both keys and values
     * for backward compatibility.
     *
     * @param wordSet stopword set
     */
    private static Hashtable makeWordTable(HashSet wordSet) {
        Hashtable table = new Hashtable();
        Iterator words = wordSet.iterator();
        while (words.hasNext()) {
            String word = (String) words.next();
            table.put(word, word);
        }
        return table;
    }
}

View File

@ -0,0 +1,5 @@
<html>
<body>
Support for indexing and searching of German text.
</body>
</html>

View File

@ -0,0 +1,259 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
 * Analyzer for Russian language. Supports an external list of stopwords (words that
 * will not be indexed at all).
 * A default set of stopwords is used unless an alternative list is specified.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public final class RussianAnalyzer extends Analyzer
{
    // Letters are encoded as positions (0-31) into a charset table; see
    // RussianCharsets. (Currently unused letters are commented out.)
    private final static char A = 0;
    private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    private final static char D = 4;
    private final static char E = 5;
    private final static char ZH = 6;
    private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    private final static char P = 15;
    private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    /**
     * List of typical Russian stopwords, encoded as charset positions.
     * Declared static final: the default list is a shared constant.
     */
    private static final char[][] RUSSIAN_STOP_WORDS = {
        {A},
        {B, E, Z},
        {B, O, L, E, E},
        {B, Y},
        {B, Y, L},
        {B, Y, L, A},
        {B, Y, L, I},
        {B, Y, L, O},
        {B, Y, T, SOFT},
        {V},
        {V, A, M},
        {V, A, S},
        {V, E, S, SOFT},
        {V, O},
        {V, O, T},
        {V, S, E},
        {V, S, E, G, O},
        {V, S, E, X},
        {V, Y},
        {G, D, E},
        {D, A},
        {D, A, ZH, E},
        {D, L, IA},
        {D, O},
        {E, G, O},
        {E, E},
        {E, I_,},
        {E, IU},
        {E, S, L, I},
        {E, S, T, SOFT},
        {E, SHCH, E},
        {ZH, E},
        {Z, A},
        {Z, D, E, S, SOFT},
        {I},
        {I, Z},
        {I, L, I},
        {I, M},
        {I, X},
        {K},
        {K, A, K},
        {K, O},
        {K, O, G, D, A},
        {K, T, O},
        {L, I},
        {L, I, B, O},
        {M, N, E},
        {M, O, ZH, E, T},
        {M, Y},
        {N, A},
        {N, A, D, O},
        {N, A, SH},
        {N, E},
        {N, E, G, O},
        {N, E, E},
        {N, E, T},
        {N, I},
        {N, I, X},
        {N, O},
        {N, U},
        {O},
        {O, B},
        {O, D, N, A, K, O},
        {O, N},
        {O, N, A},
        {O, N, I},
        {O, N, O},
        {O, T},
        {O, CH, E, N, SOFT},
        {P, O},
        {P, O, D},
        {P, R, I},
        {S},
        {S, O},
        {T, A, K},
        {T, A, K, ZH, E},
        {T, A, K, O, I_},
        {T, A, M},
        {T, E},
        {T, E, M},
        {T, O},
        {T, O, G, O},
        {T, O, ZH, E},
        {T, O, I_},
        {T, O, L, SOFT, K, O},
        {T, O, M},
        {T, Y},
        {U},
        {U, ZH, E},
        {X, O, T, IA},
        {CH, E, G, O},
        {CH, E, I_},
        {CH, E, M},
        {CH, T, O},
        {CH, T, O, B, Y},
        {CH, SOFT, E},
        {CH, SOFT, IA},
        {AE, T, A},
        {AE, T, I},
        {AE, T, O},
        {IA}
    };

    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Set stopSet = new HashSet();

    /**
     * Charset for Russian letters.
     * Represents encoding for 32 lowercase Russian letters.
     * Predefined charsets can be taken from RussianCharsets class.
     * Final: assigned exactly once by every constructor.
     */
    private final char[] charset;

    /**
     * Builds an analyzer using the Unicode charset and the default stop words.
     */
    public RussianAnalyzer() {
        charset = RussianCharsets.UnicodeRussian;
        stopSet = StopFilter.makeStopSet(
            makeStopWords(RussianCharsets.UnicodeRussian));
    }

    /**
     * Builds an analyzer using the given charset and the default stop words.
     */
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    }

    /**
     * Builds an analyzer with the given stop words.
     */
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(stopwords);
    }

    /**
     * Builds an analyzer with the given stop words.
     * @todo create a Set version of this ctor
     */
    public RussianAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
        stopSet = new HashSet(stopwords.keySet());
    }

    /**
     * Takes russian stop words and translates them to a String array, using
     * the given charset.
     */
    private static String[] makeStopWords(char[] charset)
    {
        String[] res = new String[RUSSIAN_STOP_WORDS.length];
        for (int i = 0; i < res.length; i++)
        {
            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
            // translate the word, using the charset
            StringBuffer theWord = new StringBuffer();
            for (int j = 0; j < theStopWord.length; j++)
            {
                theWord.append(charset[theStopWord[j]]);
            }
            res[i] = theWord.toString();
        }
        return res;
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @return A TokenStream built from a RussianLetterTokenizer filtered with
     *         RussianLowerCaseFilter, StopFilter, and RussianStemFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        TokenStream result = new RussianLetterTokenizer(reader, charset);
        result = new RussianLowerCaseFilter(result, charset);
        result = new StopFilter(result, stopSet);
        result = new RussianStemFilter(result, charset);
        return result;
    }
}

View File

@ -0,0 +1,279 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
 * for russian characters in Unicode, KOI8 and CP1251.
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to toLowerCase() method for that charset.
 *
 * (The javadoc previously said "CP1252"; the charset actually defined and
 * handled below is CP1251.)
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset (lowercase letters only).
    // final: toLowerCase() compares charsets by reference, so the array
    // object must never be replaced.
    public static final char[] UnicodeRussian = {
        '\u0430',
        '\u0431',
        '\u0432',
        '\u0433',
        '\u0434',
        '\u0435',
        '\u0436',
        '\u0437',
        '\u0438',
        '\u0439',
        '\u043A',
        '\u043B',
        '\u043C',
        '\u043D',
        '\u043E',
        '\u043F',
        '\u0440',
        '\u0441',
        '\u0442',
        '\u0443',
        '\u0444',
        '\u0445',
        '\u0446',
        '\u0447',
        '\u0448',
        '\u0449',
        '\u044A',
        '\u044B',
        '\u044C',
        '\u044D',
        '\u044E',
        '\u044F',
        // upper case
        '\u0410',
        '\u0411',
        '\u0412',
        '\u0413',
        '\u0414',
        '\u0415',
        '\u0416',
        '\u0417',
        '\u0418',
        '\u0419',
        '\u041A',
        '\u041B',
        '\u041C',
        '\u041D',
        '\u041E',
        '\u041F',
        '\u0420',
        '\u0421',
        '\u0422',
        '\u0423',
        '\u0424',
        '\u0425',
        '\u0426',
        '\u0427',
        '\u0428',
        '\u0429',
        '\u042A',
        '\u042B',
        '\u042C',
        '\u042D',
        '\u042E',
        '\u042F'
    };

    // KOI8 charset
    public static final char[] KOI8 = {
        0xc1,
        0xc2,
        0xd7,
        0xc7,
        0xc4,
        0xc5,
        0xd6,
        0xda,
        0xc9,
        0xca,
        0xcb,
        0xcc,
        0xcd,
        0xce,
        0xcf,
        0xd0,
        0xd2,
        0xd3,
        0xd4,
        0xd5,
        0xc6,
        0xc8,
        0xc3,
        0xde,
        0xdb,
        0xdd,
        0xdf,
        0xd9,
        0xd8,
        0xdc,
        0xc0,
        0xd1,
        // upper case
        0xe1,
        0xe2,
        0xf7,
        0xe7,
        0xe4,
        0xe5,
        0xf6,
        0xfa,
        0xe9,
        0xea,
        0xeb,
        0xec,
        0xed,
        0xee,
        0xef,
        0xf0,
        0xf2,
        0xf3,
        0xf4,
        0xf5,
        0xe6,
        0xe8,
        0xe3,
        0xfe,
        0xfb,
        0xfd,
        0xff,
        0xf9,
        0xf8,
        0xfc,
        0xe0,
        0xf1
    };

    // CP1251 charset (fixed typo: was "eharset")
    public static final char[] CP1251 = {
        0xE0,
        0xE1,
        0xE2,
        0xE3,
        0xE4,
        0xE5,
        0xE6,
        0xE7,
        0xE8,
        0xE9,
        0xEA,
        0xEB,
        0xEC,
        0xED,
        0xEE,
        0xEF,
        0xF0,
        0xF1,
        0xF2,
        0xF3,
        0xF4,
        0xF5,
        0xF6,
        0xF7,
        0xF8,
        0xF9,
        0xFA,
        0xFB,
        0xFC,
        0xFD,
        0xFE,
        0xFF,
        // upper case
        0xC0,
        0xC1,
        0xC2,
        0xC3,
        0xC4,
        0xC5,
        0xC6,
        0xC7,
        0xC8,
        0xC9,
        0xCA,
        0xCB,
        0xCC,
        0xCD,
        0xCE,
        0xCF,
        0xD0,
        0xD1,
        0xD2,
        0xD3,
        0xD4,
        0xD5,
        0xD6,
        0xD7,
        0xD8,
        0xD9,
        0xDA,
        0xDB,
        0xDC,
        0xDD,
        0xDE,
        0xDF
    };

    /**
     * Lowercases a single character according to the given charset
     * (compared by reference against the predefined charsets above).
     * Characters outside the charset's Russian letter ranges fall back to
     * {@link Character#toLowerCase(char)}.
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                return (char) (letter + 32);
            }
        }
        if (charset == KOI8)
        {
            // In KOI8 the lowercase letters occupy 0xc0-0xdf and the
            // uppercase letters 0xe0-0xff, so uppercase maps *down* by 32.
            if (letter >= 0xe0 && letter <= 0xff)
            {
                return (char) (letter - 32);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                return letter;
            }
        }
        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                return (char) (letter + 32);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                return letter;
            }
        }
        return Character.toLowerCase(letter);
    }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
/**
 * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up
 * letters in a given "russian charset". The problem with LetterTokenizer is that it relies on
 * Character.isLetter(), which does not recognize letters in single-byte encodings such as
 * CP1251 and KOI8 (well-known problems with the 0xD7 and 0xF7 chars).
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public class RussianLetterTokenizer extends CharTokenizer
{
    /** Characters that are treated as letters in addition to Character.isLetter(). */
    private char[] charset;

    public RussianLetterTokenizer(Reader in, char[] charset)
    {
        super(in);
        this.charset = charset;
    }

    /**
     * Accepts characters satisfying {@link Character#isLetter(char)} as well
     * as any character contained in the configured charset.
     */
    protected boolean isTokenChar(char c)
    {
        boolean letter = Character.isLetter(c);
        if (!letter)
        {
            for (int idx = 0; idx < charset.length && !letter; idx++)
            {
                letter = (charset[idx] == c);
            }
        }
        return letter;
    }
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Normalizes token text to lower case, analyzing given ("russian") charset.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public final class RussianLowerCaseFilter extends TokenFilter
{
    char[] charset;

    public RussianLowerCaseFilter(TokenStream in, char[] charset)
    {
        super(in);
        this.charset = charset;
    }

    /**
     * @return the next token with its term text lowercased via
     *         {@link RussianCharsets#toLowerCase(char, char[])}, or null at EOS
     */
    public final Token next() throws java.io.IOException
    {
        Token t = input.next();
        if (t == null)
            return null;
        String txt = t.termText();
        char[] chArray = txt.toCharArray();
        for (int i = 0; i < chArray.length; i++)
        {
            chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
        }
        String newTxt = new String(chArray);
        // Create the replacement token. Bugfix: propagate t.type() — the
        // previous implementation dropped the token type, unlike
        // GermanStemFilter/RussianStemFilter which preserve it.
        Token newToken = new Token(newTxt, t.startOffset(), t.endOffset(), t.type());
        return newToken;
    }
}

View File

@ -0,0 +1,77 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
/**
 * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
 * The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
 * because RussianStemFilter only works with lowercase part of any "russian" charset.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public final class RussianStemFilter extends TokenFilter
{
    /**
     * The most recently read token from the input stream.
     */
    private Token token = null;
    private RussianStemmer stemmer = null;

    public RussianStemFilter(TokenStream in, char[] charset)
    {
        super(in);
        stemmer = new RussianStemmer(charset);
    }

    /**
     * @return Returns the next token in the stream, or null at EOS
     */
    public final Token next() throws IOException
    {
        token = input.next();
        if (token == null)
        {
            return null;
        }
        String text = token.termText();
        String stemmed = stemmer.stem(text);
        // Reuse the input token when stemming changed nothing.
        if (stemmed.equals(text))
        {
            return token;
        }
        return new Token(stemmed, token.startOffset(), token.endOffset(),
            token.type());
    }

    /**
     * Set a alternative/custom RussianStemmer for this filter.
     */
    public void setStemmer(RussianStemmer stemmer)
    {
        if (stemmer != null)
        {
            this.stemmer = stemmer;
        }
    }
}

View File

@ -0,0 +1,629 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
*
* @author Boris Okner, b.okner@rogers.com
* @version $Id$
*/
class RussianStemmer
{
private char[] charset;
// positions of RV, R1 and R2 respectively
private int RV, R1, R2;
// letters (currently unused letters are commented out)
private final static char A = 0;
//private final static char B = 1;
private final static char V = 2;
private final static char G = 3;
//private final static char D = 4;
private final static char E = 5;
//private final static char ZH = 6;
//private final static char Z = 7;
private final static char I = 8;
private final static char I_ = 9;
//private final static char K = 10;
private final static char L = 11;
private final static char M = 12;
private final static char N = 13;
private final static char O = 14;
//private final static char P = 15;
//private final static char R = 16;
private final static char S = 17;
private final static char T = 18;
private final static char U = 19;
//private final static char F = 20;
private final static char X = 21;
//private final static char TS = 22;
//private final static char CH = 23;
private final static char SH = 24;
private final static char SHCH = 25;
//private final static char HARD = 26;
private final static char Y = 27;
private final static char SOFT = 28;
private final static char AE = 29;
private final static char IU = 30;
private final static char IA = 31;
// stem definitions
private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
private static char[][] perfectiveGerundEndings1 = {
{ V },
{ V, SH, I },
{ V, SH, I, S, SOFT }
};
private static char[][] perfectiveGerund1Predessors = {
{ A },
{ IA }
};
private static char[][] perfectiveGerundEndings2 = { { I, V }, {
Y, V }, {
I, V, SH, I }, {
Y, V, SH, I }, {
I, V, SH, I, S, SOFT }, {
Y, V, SH, I, S, SOFT }
};
private static char[][] adjectiveEndings = {
{ E, E },
{ I, E },
{ Y, E },
{ O, E },
{ E, I_ },
{ I, I_ },
{ Y, I_ },
{ O, I_ },
{ E, M },
{ I, M },
{ Y, M },
{ O, M },
{ I, X },
{ Y, X },
{ U, IU },
{ IU, IU },
{ A, IA },
{ IA, IA },
{ O, IU },
{ E, IU },
{ I, M, I },
{ Y, M, I },
{ E, G, O },
{ O, G, O },
{ E, M, U },
{O, M, U }
};
private static char[][] participleEndings1 = {
{ SHCH },
{ E, M },
{ N, N },
{ V, SH },
{ IU, SHCH }
};
private static char[][] participleEndings2 = {
{ I, V, SH },
{ Y, V, SH },
{ U, IU, SHCH }
};
private static char[][] participle1Predessors = {
{ A },
{ IA }
};
private static char[][] reflexiveEndings = {
{ S, IA },
{ S, SOFT }
};
private static char[][] verbEndings1 = {
{ I_ },
{ L },
{ N },
{ L, O },
{ N, O },
{ E, T },
{ IU, T },
{ L, A },
{ N, A },
{ L, I },
{ E, M },
{ N, Y },
{ E, T, E },
{ I_, T, E },
{ T, SOFT },
{ E, SH, SOFT },
{ N, N, O }
};
private static char[][] verbEndings2 = {
{ IU },
{ U, IU },
{ E, N },
{ E, I_ },
{ IA, T },
{ U, I_ },
{ I, L },
{ Y, L },
{ I, M },
{ Y, M },
{ I, T },
{ Y, T },
{ I, L, A },
{ Y, L, A },
{ E, N, A },
{ I, T, E },
{ I, L, I },
{ Y, L, I },
{ I, L, O },
{ Y, L, O },
{ E, N, O },
{ U, E, T },
{ U, IU, T },
{ E, N, Y },
{ I, T, SOFT },
{ Y, T, SOFT },
{ I, SH, SOFT },
{ E, I_, T, E },
{ U, I_, T, E }
};
private static char[][] verb1Predessors = {
{ A },
{ IA }
};
private static char[][] nounEndings = {
{ A },
{ U },
{ I_ },
{ O },
{ U },
{ E },
{ Y },
{ I },
{ SOFT },
{ IA },
{ E, V },
{ O, V },
{ I, E },
{ SOFT, E },
{ IA, X },
{ I, IU },
{ E, I },
{ I, I },
{ E, I_ },
{ O, I_ },
{ E, M },
{ A, M },
{ O, M },
{ A, X },
{ SOFT, IU },
{ I, IA },
{ SOFT, IA },
{ I, I_ },
{ IA, M },
{ IA, M, I },
{ A, M, I },
{ I, E, I_ },
{ I, IA, M },
{ I, E, M },
{ I, IA, X },
{ I, IA, M, I }
};
private static char[][] superlativeEndings = {
{ E, I_, SH },
{ E, I_, SH, E }
};
private static char[][] derivationalEndings = {
{ O, S, T },
{ O, S, T, SOFT }
};
/**
* RussianStemmer constructor comment.
*/
public RussianStemmer()
{
super();
}
/**
* RussianStemmer constructor comment.
*/
public RussianStemmer(char[] charset)
{
super();
this.charset = charset;
}
/**
* Adjectival ending is an adjective ending,
* optionally preceded by participle ending.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean adjectival(StringBuffer stemmingZone)
{
// look for adjective ending in a stemming zone
if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
return false;
// if adjective ending was found, try for participle ending.
// variable r is unused, we are just interested in the side effect of
// findAndRemoveEnding():
boolean r =
findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
||
findAndRemoveEnding(stemmingZone, participleEndings2);
return true;
}
/**
* Derivational endings
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean derivational(StringBuffer stemmingZone)
{
int endingLength = findEnding(stemmingZone, derivationalEndings);
if (endingLength == 0)
// no derivational ending found
return false;
else
{
// Ensure that the ending locates in R2
if (R2 - RV <= stemmingZone.length() - endingLength)
{
stemmingZone.setLength(stemmingZone.length() - endingLength);
return true;
}
else
{
return false;
}
}
}
/**
* Finds ending among given ending class and returns the length of ending found(0, if not found).
* Creation date: (17/03/2002 8:18:34 PM)
*/
private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
{
boolean match = false;
for (int i = theEndingClass.length - 1; i >= 0; i--)
{
char[] theEnding = theEndingClass[i];
// check if the ending is bigger than stemming zone
if (startIndex < theEnding.length - 1)
{
match = false;
continue;
}
match = true;
int stemmingIndex = startIndex;
for (int j = theEnding.length - 1; j >= 0; j--)
{
if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
{
match = false;
break;
}
}
// check if ending was found
if (match)
{
return theEndingClass[i].length; // cut ending
}
}
return 0;
}
private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
}
/**
* Finds the ending among the given class of endings and removes it from stemming zone.
* Creation date: (17/03/2002 8:18:34 PM)
*/
private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
int endingLength = findEnding(stemmingZone, theEndingClass);
if (endingLength == 0)
// not found
return false;
else {
stemmingZone.setLength(stemmingZone.length() - endingLength);
// cut the ending found
return true;
}
}
/**
* Finds the ending among the given class of endings, then checks if this ending was
* preceded by any of given predessors, and if so, removes it from stemming zone.
* Creation date: (17/03/2002 8:18:34 PM)
*/
private boolean findAndRemoveEnding(StringBuffer stemmingZone,
char[][] theEndingClass, char[][] thePredessors)
{
int endingLength = findEnding(stemmingZone, theEndingClass);
if (endingLength == 0)
// not found
return false;
else
{
int predessorLength =
findEnding(stemmingZone,
stemmingZone.length() - endingLength - 1,
thePredessors);
if (predessorLength == 0)
return false;
else {
stemmingZone.setLength(stemmingZone.length() - endingLength);
// cut the ending found
return true;
}
}
}
/**
* Marks positions of RV, R1 and R2 in a given word.
* Creation date: (16/03/2002 3:40:11 PM)
*/
private void markPositions(String word)
{
RV = 0;
R1 = 0;
R2 = 0;
int i = 0;
// find RV
while (word.length() > i && !isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // RV zone is empty
RV = i;
// find R1
while (word.length() > i && isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R1 zone is empty
R1 = i;
// find R2
while (word.length() > i && !isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R2 zone is empty
while (word.length() > i && isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R2 zone is empty
R2 = i;
}
/**
* Checks if character is a vowel..
* Creation date: (16/03/2002 10:47:03 PM)
* @return boolean
* @param letter char
*/
private boolean isVowel(char letter)
{
for (int i = 0; i < vowels.length; i++)
{
if (letter == charset[vowels[i]])
return true;
}
return false;
}
/**
* Noun endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean noun(StringBuffer stemmingZone)
{
return findAndRemoveEnding(stemmingZone, nounEndings);
}
/**
* Perfective gerund endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean perfectiveGerund(StringBuffer stemmingZone)
{
return findAndRemoveEnding(
stemmingZone,
perfectiveGerundEndings1,
perfectiveGerund1Predessors)
|| findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
}
/**
* Reflexive endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean reflexive(StringBuffer stemmingZone)
{
return findAndRemoveEnding(stemmingZone, reflexiveEndings);
}
/**
* Insert the method's description here.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean removeI(StringBuffer stemmingZone)
{
if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
}
/**
* Insert the method's description here.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean removeSoft(StringBuffer stemmingZone)
{
if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
}
/**
* Insert the method's description here.
* Creation date: (16/03/2002 10:58:42 PM)
* @param newCharset char[]
*/
public void setCharset(char[] newCharset)
{
charset = newCharset;
}
/**
* Finds the stem for given Russian word.
* Creation date: (16/03/2002 3:36:48 PM)
* @return java.lang.String
* @param input java.lang.String
*/
public String stem(String input)
{
markPositions(input);
if (RV == 0)
return input; //RV wasn't detected, nothing to stem
StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
// stemming goes on in RV
// Step 1
if (!perfectiveGerund(stemmingZone))
{
reflexive(stemmingZone);
// variable r is unused, we are just interested in the flow that gets
// created by logical expression: apply adjectival(); if that fails,
// apply verb() etc
boolean r =
adjectival(stemmingZone)
|| verb(stemmingZone)
|| noun(stemmingZone);
}
// Step 2
removeI(stemmingZone);
// Step 3
derivational(stemmingZone);
// Step 4
superlative(stemmingZone);
undoubleN(stemmingZone);
removeSoft(stemmingZone);
// return result
return input.substring(0, RV) + stemmingZone.toString();
}
/**
* Superlative endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean superlative(StringBuffer stemmingZone)
{
return findAndRemoveEnding(stemmingZone, superlativeEndings);
}
/**
* Undoubles N.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean undoubleN(StringBuffer stemmingZone)
{
char[][] doubleN = {
{ N, N }
};
if (findEnding(stemmingZone, doubleN) != 0)
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
}
/**
* Verb endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean verb(StringBuffer stemmingZone)
{
return findAndRemoveEnding(
stemmingZone,
verbEndings1,
verb1Predessors)
|| findAndRemoveEnding(stemmingZone, verbEndings2);
}
/**
* Static method for stemming with different charsets
*/
public static String stem(String theWord, char[] charset)
{
RussianStemmer stemmer = new RussianStemmer();
stemmer.setCharset(charset);
return stemmer.stem(theWord);
}
}

View File

@ -0,0 +1,5 @@
<html>
<body>
Support for indexing and searching Russian text.
</body>
</html>

View File

@ -0,0 +1,78 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Test the German stemmer. The stemming algorithm is known to work less
 * than perfectly, as it doesn't use any word lists with exceptions. We
 * also check some of the cases where the algorithm is wrong.
 *
 * @author Daniel Naber
 */
public class TestGermanStemFilter extends TestCase {

  /**
   * Reads "input;expectedStem" pairs from the external data file and checks
   * each one against the stemmer. Lines starting with '#' and empty lines
   * are ignored.
   */
  public void testStemming() {
    // read test cases from external file:
    File dataDir = new File(System.getProperty("dataDir", "./bin"));
    File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
    BufferedReader breader = null;
    try {
      // the data file is ISO-8859-1 encoded (German umlauts):
      breader = new BufferedReader(new InputStreamReader(
          new FileInputStream(testFile), "iso-8859-1"));
      String line;
      while ((line = breader.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("#") || line.equals(""))
          continue; // ignore comments and empty lines
        String[] parts = line.split(";");
        //System.out.println(parts[0] + " -- " + parts[1]);
        check(parts[0], parts[1]);
      }
    } catch (IOException e) {
      // report the actual problem instead of a bare fail():
      fail("unexpected IOException: " + e.getMessage());
    } finally {
      // close even when an assertion fails mid-loop; closing the
      // BufferedReader also closes the wrapped reader and stream.
      if (breader != null) {
        try {
          breader.close();
        } catch (IOException e) {
          // best effort — nothing sensible to do on close failure in a test
        }
      }
    }
  }

  /**
   * Tokenizes the input with StandardTokenizer + GermanStemFilter and
   * asserts that the first token equals the expected stem.
   */
  private void check(final String input, final String expected) throws IOException {
    StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
    GermanStemFilter filter = new GermanStemFilter(tokenStream);
    try {
      Token t = filter.next();
      if (t == null)
        fail("no token found for input: " + input);
      assertEquals(expected, t.termText());
    } finally {
      // release the tokenizer even when the assertion fails
      filter.close();
    }
  }
}

View File

@ -0,0 +1,48 @@
# German special characters are replaced:
häufig;haufig
# here the stemmer works okay, it maps related words to the same stem:
abschließen;abschliess
abschließender;abschliess
abschließendes;abschliess
abschließenden;abschliess
Tisch;tisch
Tische;tisch
Tischen;tisch
Haus;hau
Hauses;hau
Häuser;hau
Häusern;hau
# here's a case where overstemming occurs, i.e. a word is
# mapped to the same stem as unrelated words:
hauen;hau
# here's a case where understemming occurs, i.e. two related words
# are not mapped to the same stem. This is the case with basically
# all irregular forms:
Drama;drama
Dramen;dram
# replace "ß" with 'ss':
Ausmaß;ausmass
# fake words to test if suffixes are cut off:
xxxxxe;xxxxx
xxxxxs;xxxxx
xxxxxn;xxxxx
xxxxxt;xxxxx
xxxxxem;xxxxx
xxxxxer;xxxxx
xxxxxnd;xxxxx
# the suffixes are also removed when combined:
xxxxxetende;xxxxx
# words that are shorter than four characters are not changed:
xxe;xxe
# -em and -er are not removed from words shorter than five characters:
xxem;xxem
xxer;xxer
# -nd is not removed from words shorter than six characters:
xxxnd;xxxnd

View File

@ -0,0 +1,170 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import java.io.*;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
/**
 * Test case for RussianAnalyzer.
 * Runs the same token-by-token comparison for each supported charset
 * (Unicode, KOI8, CP1251).
 *
 * @author Boris Okner
 * @version $Id$
 */
public class TestRussianAnalyzer extends TestCase
{
    /** Base directory containing the test data files. */
    private File dataDir;

    protected void setUp() throws Exception
    {
        dataDir = new File(System.getProperty("dataDir", "./bin"));
    }

    public void testUnicode() throws IOException
    {
        checkAnalyzer("Unicode", RussianCharsets.UnicodeRussian, "Unicode", "all",
            "/org/apache/lucene/analysis/ru/testUnicode.txt",
            "/org/apache/lucene/analysis/ru/resUnicode.htm");
    }

    public void testKOI8() throws IOException
    {
        // KOI8 bytes are read through iso-8859-1 so each byte maps to one char
        checkAnalyzer("KOI8", RussianCharsets.KOI8, "iso-8859-1", "all",
            "/org/apache/lucene/analysis/ru/testKOI8.txt",
            "/org/apache/lucene/analysis/ru/resKOI8.htm");
    }

    public void test1251() throws IOException
    {
        checkAnalyzer("1251", RussianCharsets.CP1251, "iso-8859-1", "",
            "/org/apache/lucene/analysis/ru/test1251.txt",
            "/org/apache/lucene/analysis/ru/res1251.htm");
    }

    /**
     * Tokenizes wordsFile with the RussianAnalyzer and compares the result,
     * token by token, against sampleFile tokenized with a plain
     * RussianLetterTokenizer.
     *
     * @param label assertion label identifying the charset under test
     * @param charset charset mapping table (see RussianCharsets)
     * @param encoding Java encoding name used to read both files
     * @param fieldName field name passed to Analyzer.tokenStream()
     * @param wordsFile input text, relative to dataDir
     * @param sampleFile expected tokens, relative to dataDir
     */
    private void checkAnalyzer(String label, char[] charset, String encoding,
        String fieldName, String wordsFile, String sampleFile)
        throws IOException
    {
        InputStreamReader inWords =
            new InputStreamReader(
                new FileInputStream(new File(dataDir, wordsFile)), encoding);
        InputStreamReader sampleReader =
            new InputStreamReader(
                new FileInputStream(new File(dataDir, sampleFile)), encoding);
        try
        {
            RussianAnalyzer ra = new RussianAnalyzer(charset);
            TokenStream in = ra.tokenStream(fieldName, inWords);
            RussianLetterTokenizer sample =
                new RussianLetterTokenizer(sampleReader, charset);
            for (;;)
            {
                Token token = in.next();
                if (token == null)
                {
                    break;
                }
                Token sampleToken = sample.next();
                assertEquals(
                    label,
                    token.termText(),
                    sampleToken == null
                        ? null
                        : sampleToken.termText());
            }
        }
        finally
        {
            // close the readers even when an assertion fails mid-comparison
            inWords.close();
            sampleReader.close();
        }
    }
}

View File

@ -0,0 +1,94 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import junit.framework.TestCase;
/**
 * Tests RussianStemmer against word/stem pairs read from external
 * Unicode-encoded data files (line i of the stems file is the expected
 * stem of line i of the words file).
 */
public class TestRussianStem extends TestCase
{
    // words and their expected stems, loaded pairwise in setUp()
    private ArrayList words = new ArrayList();
    private ArrayList stems = new ArrayList();

    public TestRussianStem(String name)
    {
        super(name);
    }

    /**
     * Loads the word and expected-stem lists from the data files.
     *
     * @see TestCase#setUp()
     */
    protected void setUp() throws Exception
    {
        super.setUp();
        //System.out.println(new java.util.Date());
        File dataDir = new File(System.getProperty("dataDir", "./bin"));
        readLines(new File(dataDir, "/org/apache/lucene/analysis/ru/wordsUnicode.txt"), words);
        readLines(new File(dataDir, "/org/apache/lucene/analysis/ru/stemsUnicode.txt"), stems);
    }

    /**
     * Reads every line of the given Unicode-encoded file into the target
     * list, closing the reader even if reading fails.
     */
    private static void readLines(File file, ArrayList target) throws IOException
    {
        BufferedReader reader =
            new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "Unicode"));
        try
        {
            String str;
            while ((str = reader.readLine()) != null)
            {
                target.add(str);
            }
        }
        finally
        {
            reader.close();
        }
    }

    /**
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception
    {
        super.tearDown();
    }

    /**
     * Stems every word with the Unicode charset table and compares it
     * against the expected stem at the same index.
     */
    public void testStem()
    {
        for (int i = 0; i < words.size(); i++)
        {
            //if ( (i % 100) == 0 ) System.err.println(i);
            String realStem =
                RussianStemmer.stem(
                    (String) words.get(i),
                    RussianCharsets.UnicodeRussian);
            assertEquals("unicode", stems.get(i), realStem);
        }
    }
}

View File

@ -0,0 +1 @@
[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]

View File

@ -0,0 +1 @@
[淄庞註[由蘛[芴潘砸贤燎紊晕][芪乓荾[赏盘][幸拍釉磷膛蝅[铀林][忠琶][囊抛蝅[徘尚註[谖廖][纫廖][粤饰][遮薦[艘涨][邢幼演盼][子阉][滓磐盼][咨韵薦[幸晌][酉耛[蜗譣[耘任咸锨][恿蚞[呐蘛[伊铀屹琢][邢粤盼][谖廖][幸胖蝅[着薦[窍紫襗[蜗譣[晌葡彝撩][釉廖献][南釉招蝅[凵蚁薦[艘涨][邢特谙琢耘蘛[耘萞[犹辙羃[酉谖廖][下菖釉譣[窍韵譣[紫有疑窝][紫有咸刳献羃

View File

@ -0,0 +1,2 @@
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.

View File

@ -0,0 +1,2 @@
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.