diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java new file mode 100644 index 00000000000..229ac22d051 --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -0,0 +1,135 @@ +package org.apache.lucene.analysis.de; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Set; + +/** + * Analyzer for German language. Supports an external list of stopwords (words that + * will not be indexed at all) and an external list of exclusions (word that will + * not be stemmed, but indexed). + * A default set of stopwords is used unless an alternative list is specified, the + * exclusion list is empty by default. + * + * @author Gerhard Schwarz + * @version $Id$ + */ +public class GermanAnalyzer extends Analyzer { + /** + * List of typical german stopwords. 
+ */ + private String[] GERMAN_STOP_WORDS = { + "einer", "eine", "eines", "einem", "einen", + "der", "die", "das", "dass", "daß", + "du", "er", "sie", "es", + "was", "wer", "wie", "wir", + "und", "oder", "ohne", "mit", + "am", "im", "in", "aus", "auf", + "ist", "sein", "war", "wird", + "ihr", "ihre", "ihres", + "als", "für", "von", "mit", + "dich", "dir", "mich", "mir", + "mein", "sein", "kein", + "durch", "wegen", "wird" + }; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Set stopSet = new HashSet(); + + /** + * Contains words that should be indexed but not stemmed. + */ + private Set exclusionSet = new HashSet(); + + /** + * Builds an analyzer. + */ + public GermanAnalyzer() { + stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(String[] stopwords) { + stopSet = StopFilter.makeStopSet(stopwords); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(Hashtable stopwords) { + stopSet = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(File stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable(String[] exclusionlist) { + exclusionSet = StopFilter.makeStopSet(exclusionlist); + } + + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable(Hashtable exclusionlist) { + exclusionSet = new HashSet(exclusionlist.keySet()); + } + + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable(File exclusionlist) throws IOException { + exclusionSet = WordlistLoader.getWordSet(exclusionlist); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. 
+ * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(reader); + result = new StandardFilter(result); + result = new LowerCaseFilter(result); + result = new StopFilter(result, stopSet); + result = new GermanStemFilter(result, exclusionSet); + return result; + } +} diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java new file mode 100644 index 00000000000..0d5d5f3de29 --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java @@ -0,0 +1,119 @@ +package org.apache.lucene.analysis.de; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; +import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; + +/** + * A filter that stems German words. It supports a table of words that should + * not be stemmed at all. 
The stemmer used can be changed at runtime after the + * filter object is created (as long as it is a GermanStemmer). + * + * @author Gerhard Schwarz + * @version $Id$ + */ +public final class GermanStemFilter extends TokenFilter +{ + /** + * The actual token in the input stream. + */ + private Token token = null; + private GermanStemmer stemmer = null; + private Set exclusionSet = null; + + public GermanStemFilter( TokenStream in ) + { + super(in); + stemmer = new GermanStemmer(); + } + + /** + * Builds a GermanStemFilter that uses an exclusiontable. + * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead. + */ + public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) + { + this( in ); + exclusionSet = new HashSet(exclusiontable.keySet()); + } + + /** + * Builds a GermanStemFilter that uses an exclusiontable. + */ + public GermanStemFilter( TokenStream in, Set exclusionSet ) + { + this( in ); + this.exclusionSet = exclusionSet; + } + + /** + * @return Returns the next token in the stream, or null at EOS + */ + public final Token next() + throws IOException + { + if ( ( token = input.next() ) == null ) { + return null; + } + // Check the exclusiontable + else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) { + return token; + } + else { + String s = stemmer.stem( token.termText() ); + // If not stemmed, dont waste the time creating a new token + if ( !s.equals( token.termText() ) ) { + return new Token( s, token.startOffset(), + token.endOffset(), token.type() ); + } + return token; + } + } + + /** + * Set a alternative/custom GermanStemmer for this filter. + */ + public void setStemmer( GermanStemmer stemmer ) + { + if ( stemmer != null ) { + this.stemmer = stemmer; + } + } + + /** + * Set an alternative exclusion list for this filter. + * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead. 
/**
 * A stemmer for German words. The algorithm is based on the report
 * "A Fast and Simple Stemming Algorithm for German Words" by Joerg
 * Caumanns (joerg.caumanns@isst.fhg.de).
 *
 * An instance keeps its working buffer and substitution counter in fields, so
 * a single instance must not be used by several threads at the same time.
 *
 * @author Gerhard Schwarz
 * @version $Id$
 */
public class GermanStemmer
{
    /**
     * Buffer for the terms while stemming them.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Counter maintained by substitute() and read by strip() when it applies
     * its length restrictions.
     */
    private int substCount = 0;

    /**
     * Stems the given term to a unique discriminator.
     *
     * @param term The term that should be stemmed.
     * @return Discriminator for term; the lowercased term itself when it
     *         contains a non-letter character, and the empty string for an
     *         empty term
     */
    protected String stem( String term )
    {
        // Use lowercase for medium stemming.
        term = term.toLowerCase();
        if ( !isStemmable( term ) )
            return term;
        // Reset the StringBuffer.
        sb.delete( 0, sb.length() );
        sb.insert( 0, term );
        // Stemming starts here...
        substitute( sb );
        strip( sb );
        optimize( sb );
        resubstitute( sb );
        removeParticleDenotion( sb );
        return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @return true if, and only if, every character of the term is a letter
     *         (which includes the empty term).
     */
    private boolean isStemmable( String term )
    {
        for ( int c = 0; c < term.length(); c++ ) {
            if ( !Character.isLetter( term.charAt( c ) ) )
                return false;
        }
        return true;
    }

    /**
     * Suffix stripping (stemming) on the current term. The stripping is
     * reduced to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and
     * "nd", from which all regular suffixes are built. The simplification
     * causes some overstemming, and way more irregular stems, but still
     * provides unique discriminators in most of those cases.
     * The algorithm is context free, except for the length restrictions.
     */
    private void strip( StringBuffer buffer )
    {
        boolean doMore = true;
        while ( doMore && buffer.length() > 3 ) {
            int len = buffer.length();
            String lastTwo = buffer.substring( len - 2, len );
            char last = buffer.charAt( len - 1 );
            if ( ( len + substCount > 5 ) && lastTwo.equals( "nd" ) ) {
                buffer.delete( len - 2, len );
            }
            else if ( ( len + substCount > 4 )
                && ( lastTwo.equals( "em" ) || lastTwo.equals( "er" ) ) ) {
                buffer.delete( len - 2, len );
            }
            // "t" occurs only as suffix of verbs.
            else if ( last == 'e' || last == 's' || last == 'n' || last == 't' ) {
                buffer.deleteCharAt( len - 1 );
            }
            else {
                doMore = false;
            }
        }
    }

    /**
     * Does some optimizations on the term. These optimizations are
     * contextual.
     */
    private void optimize( StringBuffer buffer )
    {
        // Additional step for female plurals of professions and inhabitants.
        if ( buffer.length() > 5
            && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
            buffer.deleteCharAt( buffer.length() - 1 );
            strip( buffer );
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        // The length guard keeps an empty term from indexing position -1.
        if ( buffer.length() > 0 && buffer.charAt( buffer.length() - 1 ) == 'z' ) {
            buffer.setCharAt( buffer.length() - 1, 'x' );
        }
    }

    /**
     * Removes a particle denotion ("ge") from a term: the first occurrence of
     * "gege" is shortened to "ge".
     */
    private void removeParticleDenotion( StringBuffer buffer )
    {
        if ( buffer.length() > 4 ) {
            for ( int c = 0; c < buffer.length() - 3; c++ ) {
                if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
                    buffer.delete( c, c + 2 );
                    return;
                }
            }
        }
    }

    /**
     * Do some substitutions for the term to reduce overstemming:
     *
     * - Substitute Umlauts with their corresponding vowel (a/o/u-umlaut
     *   become a/o/u), sharp s is substituted by "ss"
     * - Substitute a second char of a pair of equal characters with
     *   an asterisk: ?? -> ?*
     * - Substitute some common character combinations with a token:
     *   sch -> '$', ch -> section sign (U+00A7), ei -> '%', ie -> '&amp;',
     *   ig -> '#', st -> '!'
     *
     * Non-ASCII characters are written as Unicode escapes so that the result
     * does not depend on the encoding the source file is compiled with.
     */
    private void substitute( StringBuffer buffer )
    {
        substCount = 0;
        for ( int c = 0; c < buffer.length(); c++ ) {
            // Replace the second char of a pair of the equal characters with an asterisk
            if ( c > 0 && buffer.charAt( c ) == buffer.charAt( c - 1 ) ) {
                buffer.setCharAt( c, '*' );
            }
            // Substitute Umlauts.
            else if ( buffer.charAt( c ) == '\u00e4' ) {   // a-umlaut
                buffer.setCharAt( c, 'a' );
            }
            else if ( buffer.charAt( c ) == '\u00f6' ) {   // o-umlaut
                buffer.setCharAt( c, 'o' );
            }
            else if ( buffer.charAt( c ) == '\u00fc' ) {   // u-umlaut
                buffer.setCharAt( c, 'u' );
            }
            // Handled outside the look-ahead section below so that a sharp s
            // at the end of a word is replaced as well.
            else if ( buffer.charAt( c ) == '\u00df' ) {   // sharp s
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 's' );
                substCount++;
            }
            // Take care that at least one character follows the current one
            if ( c < buffer.length() - 1 ) {
                // Masking several common character combinations with a token
                if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
                    buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
                {
                    buffer.setCharAt( c, '$' );
                    buffer.delete( c + 1, c + 3 );
                    // Two characters were removed; add to the running count
                    // instead of overwriting it.
                    substCount += 2;
                }
                else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
                    buffer.setCharAt( c, '\u00a7' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
                    buffer.setCharAt( c, '%' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
                    buffer.setCharAt( c, '&' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
                    buffer.setCharAt( c, '#' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
                    buffer.setCharAt( c, '!' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
            }
        }
    }

    /**
     * Undoes the changes made by substitute(), that is, the masked character
     * pairs and character combinations. Umlauts remain as their corresponding
     * vowel, and sharp s remains as "ss".
     */
    private void resubstitute( StringBuffer buffer )
    {
        for ( int c = 0; c < buffer.length(); c++ ) {
            if ( buffer.charAt( c ) == '*' ) {
                // substitute() only writes '*' at positions c > 0.
                char x = buffer.charAt( c - 1 );
                buffer.setCharAt( c, x );
            }
            else if ( buffer.charAt( c ) == '$' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
            }
            else if ( buffer.charAt( c ) == '\u00a7' ) {
                buffer.setCharAt( c, 'c' );
                buffer.insert( c + 1, 'h' );
            }
            else if ( buffer.charAt( c ) == '%' ) {
                buffer.setCharAt( c, 'e' );
                buffer.insert( c + 1, 'i' );
            }
            else if ( buffer.charAt( c ) == '&' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'e' );
            }
            else if ( buffer.charAt( c ) == '#' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'g' );
            }
            else if ( buffer.charAt( c ) == '!' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 't' );
            }
        }
    }

}
/**
 * Reads word lists (for example stopword lists) from plain text files.
 *
 * @author Gerhard Schwarz
 * @version $Id$
 *
 * @todo this is not specific to German, it should be moved up
 */
public class WordlistLoader {

  /**
   * Reads a text file line by line and collects every line, with leading and
   * trailing whitespace removed, into a HashSet. Each line is expected to hold
   * exactly one word. The words need to be in lowercase if they are used with
   * an Analyzer that applies LowerCaseFilter (like GermanAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   * @throws IOException if the file cannot be opened or read
   */
  public static HashSet getWordSet(File wordfile) throws IOException {
    HashSet words = new HashSet();
    FileReader fileReader = null;
    LineNumberReader lineReader = null;
    try {
      fileReader = new FileReader(wordfile);
      lineReader = new LineNumberReader(fileReader);
      for (String line = lineReader.readLine(); line != null; line = lineReader.readLine()) {
        words.add(line.trim());
      }
    }
    finally {
      if (lineReader != null) {
        lineReader.close();
      }
      if (fileReader != null) {
        fileReader.close();
      }
    }
    return words;
  }

  /**
   * @param path      Path to the wordlist
   * @param wordfile  Name of the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} instead
   */
  public static Hashtable getWordtable(String path, String wordfile) throws IOException {
    File location = new File(path, wordfile);
    return getWordtable(location);
  }

  /**
   * @param wordfile  Complete path to the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} instead
   */
  public static Hashtable getWordtable(String wordfile) throws IOException {
    File location = new File(wordfile);
    return getWordtable(location);
  }

  /**
   * @param wordfile  File object that points to the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} instead
   */
  public static Hashtable getWordtable(File wordfile) throws IOException {
    return makeWordTable(getWordSet(wordfile));
  }

  /**
   * Builds a wordlist table in which every word is both key and value,
   * for backward compatibility.
   *
   * @param wordSet   stopword set
   */
  private static Hashtable makeWordTable(HashSet wordSet) {
    Hashtable table = new Hashtable();
    Iterator words = wordSet.iterator();
    while (words.hasNext()) {
      String word = (String) words.next();
      table.put(word, word);
    }
    return table;
  }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; + +import java.io.Reader; +import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; + +/** + * Analyzer for Russian language. Supports an external list of stopwords (words that + * will not be indexed at all). + * A default set of stopwords is used unless an alternative list is specified. + * + * @author Boris Okner, b.okner@rogers.com + * @version $Id$ + */ +public final class RussianAnalyzer extends Analyzer +{ + // letters (currently unused letters are commented out) + private final static char A = 0; + private final static char B = 1; + private final static char V = 2; + private final static char G = 3; + private final static char D = 4; + private final static char E = 5; + private final static char ZH = 6; + private final static char Z = 7; + private final static char I = 8; + private final static char I_ = 9; + private final static char K = 10; + private final static char L = 11; + private final static char M = 12; + private final static char N = 13; + private final static char O = 14; + private final static char P = 15; + private final static char R = 16; + private final static char S = 17; + private final static char T = 18; + private final static char U = 19; + //private final static char F = 20; + private final static char X = 21; + //private final static char TS = 22; + private final static char CH = 23; + private final static char SH = 24; + private final static char SHCH = 25; + //private final static char HARD = 26; + private final static char Y = 27; + private final static char SOFT = 28; + private final static char AE = 29; + private final static char IU = 30; + private final static char IA = 31; + + /** + * List of typical Russian stopwords. 
+ */ + private static char[][] RUSSIAN_STOP_WORDS = { + {A}, + {B, E, Z}, + {B, O, L, E, E}, + {B, Y}, + {B, Y, L}, + {B, Y, L, A}, + {B, Y, L, I}, + {B, Y, L, O}, + {B, Y, T, SOFT}, + {V}, + {V, A, M}, + {V, A, S}, + {V, E, S, SOFT}, + {V, O}, + {V, O, T}, + {V, S, E}, + {V, S, E, G, O}, + {V, S, E, X}, + {V, Y}, + {G, D, E}, + {D, A}, + {D, A, ZH, E}, + {D, L, IA}, + {D, O}, + {E, G, O}, + {E, E}, + {E, I_,}, + {E, IU}, + {E, S, L, I}, + {E, S, T, SOFT}, + {E, SHCH, E}, + {ZH, E}, + {Z, A}, + {Z, D, E, S, SOFT}, + {I}, + {I, Z}, + {I, L, I}, + {I, M}, + {I, X}, + {K}, + {K, A, K}, + {K, O}, + {K, O, G, D, A}, + {K, T, O}, + {L, I}, + {L, I, B, O}, + {M, N, E}, + {M, O, ZH, E, T}, + {M, Y}, + {N, A}, + {N, A, D, O}, + {N, A, SH}, + {N, E}, + {N, E, G, O}, + {N, E, E}, + {N, E, T}, + {N, I}, + {N, I, X}, + {N, O}, + {N, U}, + {O}, + {O, B}, + {O, D, N, A, K, O}, + {O, N}, + {O, N, A}, + {O, N, I}, + {O, N, O}, + {O, T}, + {O, CH, E, N, SOFT}, + {P, O}, + {P, O, D}, + {P, R, I}, + {S}, + {S, O}, + {T, A, K}, + {T, A, K, ZH, E}, + {T, A, K, O, I_}, + {T, A, M}, + {T, E}, + {T, E, M}, + {T, O}, + {T, O, G, O}, + {T, O, ZH, E}, + {T, O, I_}, + {T, O, L, SOFT, K, O}, + {T, O, M}, + {T, Y}, + {U}, + {U, ZH, E}, + {X, O, T, IA}, + {CH, E, G, O}, + {CH, E, I_}, + {CH, E, M}, + {CH, T, O}, + {CH, T, O, B, Y}, + {CH, SOFT, E}, + {CH, SOFT, IA}, + {AE, T, A}, + {AE, T, I}, + {AE, T, O}, + {IA} + }; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Set stopSet = new HashSet(); + + /** + * Charset for Russian letters. + * Represents encoding for 32 lowercase Russian letters. + * Predefined charsets can be taken from RussianCharSets class + */ + private char[] charset; + + + public RussianAnalyzer() { + charset = RussianCharsets.UnicodeRussian; + stopSet = StopFilter.makeStopSet( + makeStopWords(RussianCharsets.UnicodeRussian)); + } + + /** + * Builds an analyzer. 
+ */ + public RussianAnalyzer(char[] charset) + { + this.charset = charset; + stopSet = StopFilter.makeStopSet(makeStopWords(charset)); + } + + /** + * Builds an analyzer with the given stop words. + */ + public RussianAnalyzer(char[] charset, String[] stopwords) + { + this.charset = charset; + stopSet = StopFilter.makeStopSet(stopwords); + } + + // Takes russian stop words and translates them to a String array, using + // the given charset + private static String[] makeStopWords(char[] charset) + { + String[] res = new String[RUSSIAN_STOP_WORDS.length]; + for (int i = 0; i < res.length; i++) + { + char[] theStopWord = RUSSIAN_STOP_WORDS[i]; + // translate the word, using the charset + StringBuffer theWord = new StringBuffer(); + for (int j = 0; j < theStopWord.length; j++) + { + theWord.append(charset[theStopWord[j]]); + } + res[i] = theWord.toString(); + } + return res; + } + + /** + * Builds an analyzer with the given stop words. + * @todo create a Set version of this ctor + */ + public RussianAnalyzer(char[] charset, Hashtable stopwords) + { + this.charset = charset; + stopSet = new HashSet(stopwords.keySet()); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. 
+ * + * @return A TokenStream build from a RussianLetterTokenizer filtered with + * RussianLowerCaseFilter, StopFilter, and RussianStemFilter + */ + public TokenStream tokenStream(String fieldName, Reader reader) + { + TokenStream result = new RussianLetterTokenizer(reader, charset); + result = new RussianLowerCaseFilter(result, charset); + result = new StopFilter(result, stopSet); + result = new RussianStemFilter(result, charset); + return result; + } +} diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java new file mode 100644 index 00000000000..e1305b0b8c6 --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java @@ -0,0 +1,279 @@ +package org.apache.lucene.analysis.ru; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation + * for russian characters in Unicode, KOI8 and CP1252. + * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters. + * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset + * and adding logic to toLowerCase() method for that charset. 
/**
 * RussianCharsets holds encoding schemes (charsets) for Russian letters in
 * Unicode, KOI8 and CP1251, together with a toLowerCase() implementation for
 * them. Every charset lists the lowercase letters at positions 0-31 and the
 * matching uppercase letters at positions 32-63.
 * Other encoding schemes (like ISO-8859-5 or customized ones) can be supported
 * by adding a new charset and extending toLowerCase() for it.
 *
 * @author  Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset: lowercase letters first, then uppercase
    public static char[] UnicodeRussian = {
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    // KOI8 charset
    public static char[] KOI8 = {
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
    };

    // CP1251 charset
    public static char[] CP1251 = {
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
    };

    /**
     * Lowercases a letter according to the given charset. The charset is
     * recognised by identity, i.e. it must be one of the arrays declared in
     * this class; for any other charset, and for letters outside the Russian
     * ranges of the charset, Character.toLowerCase(char) is used.
     *
     * @param letter  the character to convert
     * @param charset one of UnicodeRussian, KOI8 or CP1251
     * @return the lowercase form of the letter
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (inRange(letter, '\u0430', '\u044F'))
                return letter;                  // already lowercase
            if (inRange(letter, '\u0410', '\u042F'))
                return (char) (letter + 32);    // lowercase sits 32 above
        }

        if (charset == KOI8)
        {
            if (inRange(letter, 0xe0, 0xff))
                return (char) (letter - 32);    // lowercase sits 32 below
            if (inRange(letter, 0xc0, 0xdf))
                return letter;                  // already lowercase
        }

        if (charset == CP1251)
        {
            if (inRange(letter, 0xC0, 0xDF))
                return (char) (letter + 32);    // lowercase sits 32 above
            if (inRange(letter, 0xE0, 0xFF))
                return letter;                  // already lowercase
        }

        return Character.toLowerCase(letter);
    }

    // True when low <= value <= high.
    private static boolean inRange(char value, int low, int high)
    {
        return low <= value && value <= high;
    }
}
+ */ + +import java.io.Reader; +import org.apache.lucene.analysis.CharTokenizer; + +/** + * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters + * in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method, + * which doesn't know how to detect letters in encodings like CP1252 and KOI8 + * (well-known problems with 0xD7 and 0xF7 chars) + * + * @author Boris Okner, b.okner@rogers.com + * @version $Id$ + */ + +public class RussianLetterTokenizer extends CharTokenizer +{ + /** Construct a new LetterTokenizer. */ + private char[] charset; + + public RussianLetterTokenizer(Reader in, char[] charset) + { + super(in); + this.charset = charset; + } + + /** + * Collects only characters which satisfy + * {@link Character#isLetter(char)}. + */ + protected boolean isTokenChar(char c) + { + if (Character.isLetter(c)) + return true; + for (int i = 0; i < charset.length; i++) + { + if (c == charset[i]) + return true; + } + return false; + } +} diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java new file mode 100644 index 00000000000..79f273aab16 --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.ru; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +/** + * Normalizes token text to lower case, analyzing given ("russian") charset. + * + * @author Boris Okner, b.okner@rogers.com + * @version $Id$ + */ +public final class RussianLowerCaseFilter extends TokenFilter +{ + char[] charset; + + public RussianLowerCaseFilter(TokenStream in, char[] charset) + { + super(in); + this.charset = charset; + } + + public final Token next() throws java.io.IOException + { + Token t = input.next(); + + if (t == null) + return null; + + String txt = t.termText(); + + char[] chArray = txt.toCharArray(); + for (int i = 0; i < chArray.length; i++) + { + chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); + } + + String newTxt = new String(chArray); + // create new token + Token newToken = new Token(newTxt, t.startOffset(), t.endOffset()); + + return newToken; + } +} diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java new file mode 100644 index 00000000000..597fe1d7073 --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java @@ -0,0 +1,77 @@ +package org.apache.lucene.analysis.ru; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import java.io.IOException; + +/** + * A filter that stems Russian words. The implementation was inspired by GermanStemFilter. + * The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter , + * because RussianStemFilter only works with lowercase part of any "russian" charset. + * + * @author Boris Okner, b.okner@rogers.com + * @version $Id$ + */ +public final class RussianStemFilter extends TokenFilter +{ + /** + * The actual token in the input stream. + */ + private Token token = null; + private RussianStemmer stemmer = null; + + public RussianStemFilter(TokenStream in, char[] charset) + { + super(in); + stemmer = new RussianStemmer(charset); + } + + /** + * @return Returns the next token in the stream, or null at EOS + */ + public final Token next() throws IOException + { + if ((token = input.next()) == null) + { + return null; + } + else + { + String s = stemmer.stem(token.termText()); + if (!s.equals(token.termText())) + { + return new Token(s, token.startOffset(), token.endOffset(), + token.type()); + } + return token; + } + } + + /** + * Set a alternative/custom RussianStemmer for this filter. 
+ */ + public void setStemmer(RussianStemmer stemmer) + { + if (stemmer != null) + { + this.stemmer = stemmer; + } + } +} diff --git a/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java new file mode 100644 index 00000000000..ba6b7312397 --- /dev/null +++ b/sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java @@ -0,0 +1,629 @@ +package org.apache.lucene.analysis.ru; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description). 
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 * <p>
 * Letters are addressed by their position in a charset table (see
 * RussianCharsets), so the same ending tables work for every supported
 * encoding. Only the lower case part of the table is consulted; input is
 * expected to be lower-cased already.
 * <p>
 * An instance keeps the region positions of the word being stemmed in fields,
 * so a single instance must not be used by several threads at once.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
class RussianStemmer
{
    // charset table used to translate letter indexes into characters;
    // must be set (constructor or setCharset) before stem() is called
    private char[] charset;

    // positions of RV, R1 and R2 respectively; 0 means "region not found"
    private int RV, R1, R2;

    // letters, as indexes into the charset table (unused letters are commented out)
    private final static char A = 0;
    //private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    //private final static char D = 4;
    private final static char E = 5;
    //private final static char ZH = 6;
    //private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    //private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    //private final static char P = 15;
    //private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    //private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    // stem definitions
    //
    // NOTE: findEnding() scans every table from its LAST entry to its first
    // and stops at the first hit, so an ending must be listed after every
    // shorter ending that is a suffix of it.
    private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static final char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static final char[][] perfectiveGerund1Predessors = {
        { A },
        { IA }
    };

    private static final char[][] perfectiveGerundEndings2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static final char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    private static final char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static final char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static final char[][] participle1Predessors = {
        { A },
        { IA }
    };

    private static final char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static final char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static final char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static final char[][] verb1Predessors = {
        { A },
        { IA }
    };

    private static final char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static final char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static final char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    // double N, reduced to a single N by undoubleN()
    private static final char[][] doubleN = {
        { N, N }
    };

    /**
     * Creates a stemmer without a charset. {@link #setCharset(char[])} must be
     * called before {@link #stem(String)}, otherwise stemming fails with a
     * NullPointerException.
     */
    public RussianStemmer()
    {
        super();
    }

    /**
     * Creates a stemmer for the given charset table.
     *
     * @param charset table mapping letter indexes to characters
     */
    public RussianStemmer(char[] charset)
    {
        super();
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if an adjective ending was removed
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
            return false;
        // if adjective ending was found, try for participle ending; whether
        // one is found does not change the result
        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
        {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    }

    /**
     * Removes a derivational ending, provided the whole ending lies in R2.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if an ending was removed
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
            // no derivational ending found
            return false;
        if (R2 == 0)
            // the word has no R2 region, so the ending cannot lie in it
            return false;
        // The stemming zone starts at RV, so R2 - RV is the position of R2
        // inside the zone; the ending must not start before it.
        if (R2 - RV <= stemmingZone.length() - endingLength)
        {
            stemmingZone.setLength(stemmingZone.length() - endingLength);
            return true;
        }
        return false;
    }

    /**
     * Finds ending among given ending class and returns the length of ending found (0, if not found).
     * The text examined is the part of the stemming zone that ends at
     * <code>startIndex</code> (inclusive). Endings are tried from the last
     * entry of the class to the first.
     *
     * @param stemmingZone   the text to examine
     * @param startIndex     index of the last character to examine; may be -1
     * @param theEndingClass the endings to look for
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
    {
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // skip endings that are longer than the text available
            if (startIndex < theEnding.length - 1)
            {
                continue;
            }
            boolean match = true;
            int stemmingIndex = startIndex;
            for (int j = theEnding.length - 1; j >= 0; j--)
            {
                if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
                {
                    match = false;
                    break;
                }
            }
            if (match)
            {
                return theEnding.length;
            }
        }
        return 0;
    }

    /**
     * Finds ending at the very end of the stemming zone.
     *
     * @return the length of the ending found, or 0
     */
    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     *
     * @return true if an ending was removed
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predessors, and if so, removes it from stemming zone.
     * The predecessor itself is kept.
     *
     * @return true if an ending was removed
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
        char[][] theEndingClass, char[][] thePredessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        int predessorLength =
            findEnding(stemmingZone,
                stemmingZone.length() - endingLength - 1,
                thePredessors);
        if (predessorLength == 0)
            return false;
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     * A position stays 0 when the corresponding region is empty.
     */
    private void markPositions(String word)
    {
        RV = 0;
        R1 = 0;
        R2 = 0;
        int i = 0;
        // find RV: the region after the first vowel
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // RV zone is empty
        RV = i;
        // find R1: the region after the first non-vowel following a vowel
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R1 zone is empty
        R1 = i;
        // find R2: the same rule applied once more, starting at R1
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        R2 = i;
    }

    /**
     * Checks if character is a vowel of the current charset.
     *
     * @param letter the character to check
     * @return true if the character is one of the vowels
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
                return true;
        }
        return false;
    }

    /**
     * Noun endings.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if an ending was removed
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if an ending was removed
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if an ending was removed
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes a trailing letter I from the stemming zone.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if a letter was removed
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        return false;
    }

    /**
     * Removes a trailing soft sign from the stemming zone.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if a letter was removed
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        return false;
    }

    /**
     * Sets the charset table used to translate letter indexes into characters.
     *
     * @param newCharset the charset table
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Finds the stem for given Russian word.
     * A word whose RV region is empty (for instance a word without vowels) is
     * returned unchanged.
     *
     * @param input the lower case word to stem
     * @return the stem
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
            return input; //RV wasn't detected, nothing to stem
        // stemming goes on in RV
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

        // Step 1
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            // apply adjectival(); if that fails, apply verb(); if that fails, noun()
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if an ending was removed
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N: a stemming zone ending in double N loses its last letter.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if a letter was removed
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        if (findEnding(stemmingZone, doubleN) != 0)
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        return false;
    }

    /**
     * Verb endings.
     *
     * @param stemmingZone the part of the word being stemmed; modified in place
     * @return true if an ending was removed
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            verbEndings1,
            verb1Predessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets
     *
     * @param theWord the lower case word to stem
     * @param charset the charset table the word is encoded in
     * @return the stem
     */
    public static String stem(String theWord, char[] charset)
    {
        RussianStemmer stemmer = new RussianStemmer();
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Test the German stemmer. The stemming algorithm is known to work less + * than perfect, as it doesn't use any word lists with exceptions. We + * also check some of the cases where the algorithm is wrong. + * + * @author Daniel Naber + */ +public class TestGermanStemFilter extends TestCase { + + public void testStemming() { + try { + // read test cases from external file: + File dataDir = new File(System.getProperty("dataDir", "./bin")); + File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt"); + FileInputStream fis = new FileInputStream(testFile); + InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1"); + BufferedReader breader = new BufferedReader(isr); + while(true) { + String line = breader.readLine(); + if (line == null) + break; + line = line.trim(); + if (line.startsWith("#") || line.equals("")) + continue; // ignore comments and empty lines + String[] parts = line.split(";"); + //System.out.println(parts[0] + " -- " + parts[1]); + check(parts[0], parts[1]); + } + breader.close(); + isr.close(); + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + fail(); + } + } + + private void check(final String input, final String expected) throws IOException { + StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input)); + GermanStemFilter filter = new GermanStemFilter(tokenStream); + Token t = filter.next(); + if (t == null) + fail(); + assertEquals(expected, t.termText()); + filter.close(); + } + +} diff --git 
a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/de/data.txt b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/de/data.txt new file mode 100644 index 00000000000..520c18a1df6 --- /dev/null +++ b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/de/data.txt @@ -0,0 +1,48 @@ +# German special characters are replaced: +häufig;haufig + +# here the stemmer works okay, it maps related words to the same stem: +abschließen;abschliess +abschließender;abschliess +abschließendes;abschliess +abschließenden;abschliess + +Tisch;tisch +Tische;tisch +Tischen;tisch + +Haus;hau +Hauses;hau +Häuser;hau +Häusern;hau +# here's a case where overstemming occurs, i.e. a word is +# mapped to the same stem as unrelated words: +hauen;hau + +# here's a case where understemming occurs, i.e. two related words +# are not mapped to the same stem. This is the case with basically +# all irregular forms: +Drama;drama +Dramen;dram + +# replace "ß" with 'ss': +Ausmaß;ausmass + +# fake words to test if suffixes are cut off: +xxxxxe;xxxxx +xxxxxs;xxxxx +xxxxxn;xxxxx +xxxxxt;xxxxx +xxxxxem;xxxxx +xxxxxer;xxxxx +xxxxxnd;xxxxx +# the suffixes are also removed when combined: +xxxxxetende;xxxxx + +# words that are shorter than four charcters are not changed: +xxe;xxe +# -em and -er are not removed from words shorter than five characters: +xxem;xxem +xxer;xxer +# -nd is not removed from words shorter than six characters: +xxxnd;xxxnd diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java new file mode 100644 index 00000000000..1294d5e577b --- /dev/null +++ b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java @@ -0,0 +1,170 @@ +package org.apache.lucene.analysis.ru; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; + +import java.io.*; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; + +/** + * Test case for RussianAnalyzer. + * + * @author Boris Okner + * @version $Id$ + */ + +public class TestRussianAnalyzer extends TestCase +{ + private InputStreamReader inWords; + + private InputStreamReader sampleUnicode; + + private Reader inWordsKOI8; + + private Reader sampleKOI8; + + private Reader inWords1251; + + private Reader sample1251; + + private File dataDir; + + protected void setUp() throws Exception + { + dataDir = new File(System.getProperty("dataDir", "./bin")); + } + + public void testUnicode() throws IOException + { + RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian); + inWords = + new InputStreamReader( + new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUnicode.txt")), + "Unicode"); + + sampleUnicode = + new InputStreamReader( + new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUnicode.htm")), + "Unicode"); + + TokenStream in = ra.tokenStream("all", inWords); + + RussianLetterTokenizer sample = + new RussianLetterTokenizer( + sampleUnicode, + RussianCharsets.UnicodeRussian); + + for (;;) + { + Token token = in.next(); + + if (token == null) + { + break; + } + + Token sampleToken = sample.next(); + assertEquals( + "Unicode", + token.termText(), + sampleToken == null + ? 
null + : sampleToken.termText()); + } + + inWords.close(); + sampleUnicode.close(); + } + + public void testKOI8() throws IOException + { + //System.out.println(new java.util.Date()); + RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8); + // KOI8 + inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1"); + + sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1"); + + TokenStream in = ra.tokenStream("all", inWordsKOI8); + RussianLetterTokenizer sample = + new RussianLetterTokenizer( + sampleKOI8, + RussianCharsets.KOI8); + + for (;;) + { + Token token = in.next(); + + if (token == null) + { + break; + } + + Token sampleToken = sample.next(); + assertEquals( + "KOI8", + token.termText(), + sampleToken == null + ? null + : sampleToken.termText()); + + } + + inWordsKOI8.close(); + sampleKOI8.close(); + } + + public void test1251() throws IOException + { + // 1251 + inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1"); + + sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1"); + + RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251); + TokenStream in = ra.tokenStream("", inWords1251); + RussianLetterTokenizer sample = + new RussianLetterTokenizer( + sample1251, + RussianCharsets.CP1251); + + for (;;) + { + Token token = in.next(); + + if (token == null) + { + break; + } + + Token sampleToken = sample.next(); + assertEquals( + "1251", + token.termText(), + sampleToken == null + ? 
null + : sampleToken.termText()); + + } + + inWords1251.close(); + sample1251.close(); + } +} diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java new file mode 100644 index 00000000000..96e1801ad8c --- /dev/null +++ b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java @@ -0,0 +1,94 @@ +package org.apache.lucene.analysis.ru; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; + +import java.io.BufferedReader; +import java.io.File; +import java.io.InputStreamReader; +import java.io.FileInputStream; +import java.util.ArrayList; + +public class TestRussianStem extends TestCase +{ + private ArrayList words = new ArrayList(); + private ArrayList stems = new ArrayList(); + + public TestRussianStem(String name) + { + super(name); + } + + /** + * @see TestCase#setUp() + */ + protected void setUp() throws Exception + { + super.setUp(); + //System.out.println(new java.util.Date()); + String str; + + File dataDir = new File(System.getProperty("dataDir", "./bin")); + + // open and read words into an array list + BufferedReader inWords = + new BufferedReader( + new InputStreamReader( + new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/wordsUnicode.txt")), + "Unicode")); + while ((str = inWords.readLine()) != null) + { + words.add(str); + } + inWords.close(); + + // open and read stems into an array list + BufferedReader inStems = + new BufferedReader( + new InputStreamReader( + new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/stemsUnicode.txt")), + "Unicode")); + while ((str = inStems.readLine()) != null) + { + stems.add(str); + } + inStems.close(); + } + + /** + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception + { + super.tearDown(); + } + + public void testStem() + { + for (int i = 0; i < words.size(); i++) + { + //if ( (i % 100) == 0 ) System.err.println(i); + String realStem = + RussianStemmer.stem( + (String) words.get(i), + RussianCharsets.UnicodeRussian); + assertEquals("unicode", stems.get(i), realStem); + } + } + +} diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/res1251.htm b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/res1251.htm new file mode 100644 index 00000000000..d3d2e2badad --- /dev/null +++ 
b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/res1251.htm @@ -0,0 +1 @@ +[âìåñò][ñèë][ýëåêòðîìàãíèòí][ýíåðã][èìåë][ïðåäñòàâëåí][ñêàæ][æðåö][äðåâí][åãèïò][çíàí][õðàí][òàéí][óçê][êðóã][ïîñâÿùåí][âñÿê][âðåìåí][âèòîê][ïðèí][ñîá][íîâ][òåõíîëîã][ñàì][äåë][ðàñêðûâà][ïîòàåí][çíàí][ïðåæí][âåê][ãîâîð][íîâ][èíôîðìàö][ñòàíîâ][äîñòóïí][øèðîê][êðóã][ïîëüçîâàòåë][òåõ][ñëó÷à][ñîçíàí][îáùåñòâ][ãîòîâ][âîñïðèíÿ][âîñïîëüçîâà] \ No newline at end of file diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resKOI8.htm b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resKOI8.htm new file mode 100644 index 00000000000..7cfab861990 --- /dev/null +++ b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resKOI8.htm @@ -0,0 +1 @@ +[×ÍÅÓÔ][ÓÉÌ][ÜÌÅËÔÒÏÍÁÇÎÉÔÎ][ÜÎÅÒÇ][ÉÍÅÌ][ÐÒÅÄÓÔÁ×ÌÅÎ][ÓËÁÖ][ÖÒÅÃ][ÄÒÅ×Î][ÅÇÉÐÔ][ÚÎÁÎ][ÈÒÁÎ][ÔÁÊÎ][ÕÚË][ËÒÕÇ][ÐÏÓ×ÑÝÅÎ][×ÓÑË][×ÒÅÍÅÎ][×ÉÔÏË][ÐÒÉÎ][ÓÏÂ][ÎÏ×][ÔÅÈÎÏÌÏÇ][ÓÁÍ][ÄÅÌ][ÒÁÓËÒÙ×Á][ÐÏÔÁÅÎ][ÚÎÁÎ][ÐÒÅÖÎ][×ÅË][ÇÏ×ÏÒ][ÎÏ×][ÉÎÆÏÒÍÁÃ][ÓÔÁÎÏ×][ÄÏÓÔÕÐÎ][ÛÉÒÏË][ËÒÕÇ][ÐÏÌØÚÏ×ÁÔÅÌ][ÔÅÈ][ÓÌÕÞÁ][ÓÏÚÎÁÎ][ÏÂÝÅÓÔ×][ÇÏÔÏ×][×ÏÓÐÒÉÎÑ][×ÏÓÐÏÌØÚÏ×Á] \ No newline at end of file diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resUnicode.htm b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resUnicode.htm new file mode 100644 index 00000000000..ea71882a505 Binary files /dev/null and b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/resUnicode.htm differ diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt new file mode 100644 index 00000000000..0f1232777f7 Binary files /dev/null and b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/stemsUnicode.txt differ diff --git 
a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/test1251.txt b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/test1251.txt new file mode 100644 index 00000000000..984f7438678 --- /dev/null +++ b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/test1251.txt @@ -0,0 +1,2 @@ +Âìåñòå ñ òåì î ñèëå ýëåêòðîìàãíèòíîé ýíåðãèè èìåëè ïðåäñòàâëåíèå åùå, ñêàæåì, æðåöû Äðåâíåãî Åãèïòà. Íî çíàíèå ýòî õðàíèëîñü â òàéíå, â +óçêîì êðóãó ïîñâÿùåííûõ. Âñÿêèé âðåìåííîé âèòîê, ïðèíîñÿ ñ ñîáîé íîâûå òåõíîëîãèè, íà ñàìîì äåëå ðàñêðûâàåò ïîòàåííîå çíàíèå ïðåæíèõ âåêîâ. Ìû óæå ãîâîðèëè, ÷òî íîâàÿ èíôîðìàöèÿ ñòàíîâèòñÿ äîñòóïíîé øèðîêîìó êðóãó ïîëüçîâàòåëåé òîëüêî â òåõ ñëó÷àÿõ, êîãäà ñîçíàíèå îáùåñòâà ãîòîâî åå âîñïðèíÿòü è âîñïîëüçîâàòüñÿ åþ. diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testKOI8.txt b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testKOI8.txt new file mode 100644 index 00000000000..bf2a91a6d1d --- /dev/null +++ b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testKOI8.txt @@ -0,0 +1,2 @@ +÷ÍÅÓÔÅ Ó ÔÅÍ Ï ÓÉÌÅ ÜÌÅËÔÒÏÍÁÇÎÉÔÎÏÊ ÜÎÅÒÇÉÉ ÉÍÅÌÉ ÐÒÅÄÓÔÁ×ÌÅÎÉÅ ÅÝÅ, ÓËÁÖÅÍ, ÖÒÅÃÙ äÒÅ×ÎÅÇÏ åÇÉÐÔÁ. îÏ ÚÎÁÎÉÅ ÜÔÏ ÈÒÁÎÉÌÏÓØ × ÔÁÊÎÅ, × +ÕÚËÏÍ ËÒÕÇÕ ÐÏÓ×ÑÝÅÎÎÙÈ. ÷ÓÑËÉÊ ×ÒÅÍÅÎÎÏÊ ×ÉÔÏË, ÐÒÉÎÏÓÑ Ó ÓÏÂÏÊ ÎÏ×ÙÅ ÔÅÈÎÏÌÏÇÉÉ, ÎÁ ÓÁÍÏÍ ÄÅÌÅ ÒÁÓËÒÙ×ÁÅÔ ÐÏÔÁÅÎÎÏÅ ÚÎÁÎÉÅ ÐÒÅÖÎÉÈ ×ÅËÏ×. íÙ ÕÖÅ ÇÏ×ÏÒÉÌÉ, ÞÔÏ ÎÏ×ÁÑ ÉÎÆÏÒÍÁÃÉÑ ÓÔÁÎÏ×ÉÔÓÑ ÄÏÓÔÕÐÎÏÊ ÛÉÒÏËÏÍÕ ËÒÕÇÕ ÐÏÌØÚÏ×ÁÔÅÌÅÊ ÔÏÌØËÏ × ÔÅÈ ÓÌÕÞÁÑÈ, ËÏÇÄÁ ÓÏÚÎÁÎÉÅ ÏÂÝÅÓÔ×Á ÇÏÔÏ×Ï ÅÅ ×ÏÓÐÒÉÎÑÔØ É ×ÏÓÐÏÌØÚÏ×ÁÔØÓÑ ÅÀ. 
diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testUnicode.txt b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testUnicode.txt new file mode 100644 index 00000000000..be73a640e6d Binary files /dev/null and b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/testUnicode.txt differ diff --git a/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt new file mode 100644 index 00000000000..a57ec5c4c60 Binary files /dev/null and b/sandbox/contributions/analyzers/src/test/org/apache/lucene/analysis/ru/wordsUnicode.txt differ