- Russian Analyzer, by Boris Okner. Initial checkin.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149840 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2002-09-16 02:51:58 +00:00
parent e79dfcfaa1
commit e63750554f
6 changed files with 1772 additions and 0 deletions

View File

@ -0,0 +1,291 @@
package org.apache.lucene.analysis.ru;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.Reader;
import java.util.Hashtable;
/**
 * Analyzer for the Russian language. Supports an external list of stopwords (words that
 * will not be indexed at all).
 * A default set of stopwords is used unless an alternative list is specified.
 *
 * @author Boris Okner
 * @version $Id$
 */
public final class RussianAnalyzer extends Analyzer
{
    // Letter indexes. Each constant is a position inside the charset array
    // handed to the constructors; stop words are spelled with these indexes
    // and translated to real characters by makeStopWords().
    private static final char A = 0;
    private static final char B = 1;
    private static final char V = 2;
    private static final char G = 3;
    private static final char D = 4;
    private static final char E = 5;
    private static final char ZH = 6;
    private static final char Z = 7;
    private static final char I = 8;
    private static final char I_ = 9;
    private static final char K = 10;
    private static final char L = 11;
    private static final char M = 12;
    private static final char N = 13;
    private static final char O = 14;
    private static final char P = 15;
    private static final char R = 16;
    private static final char S = 17;
    private static final char T = 18;
    private static final char U = 19;
    private static final char F = 20;
    private static final char X = 21;
    private static final char TS = 22;
    private static final char CH = 23;
    private static final char SH = 24;
    private static final char SHCH = 25;
    private static final char HARD = 26;
    private static final char Y = 27;
    private static final char SOFT = 28;
    private static final char AE = 29;
    private static final char IU = 30;
    private static final char IA = 31;

    /**
     * List of typical Russian stopwords, spelled with the letter indexes above.
     */
    private static final char[][] RUSSIAN_STOP_WORDS = {
        {A},
        {B, E, Z},
        {B, O, L, E, E},
        {B, Y},
        {B, Y, L},
        {B, Y, L, A},
        {B, Y, L, I},
        {B, Y, L, O},
        {B, Y, T, SOFT},
        {V},
        {V, A, M},
        {V, A, S},
        {V, E, S, SOFT},
        {V, O},
        {V, O, T},
        {V, S, E},
        {V, S, E, G, O},
        {V, S, E, X},
        {V, Y},
        {G, D, E},
        {D, A},
        {D, A, ZH, E},
        {D, L, IA},
        {D, O},
        {E, G, O},
        {E, E},
        {E, I_},
        {E, IU},
        {E, S, L, I},
        {E, S, T, SOFT},
        {E, SHCH, E},
        {ZH, E},
        {Z, A},
        {Z, D, E, S, SOFT},
        {I},
        {I, Z},
        {I, L, I},
        {I, M},
        {I, X},
        {K},
        {K, A, K},
        {K, O},
        {K, O, G, D, A},
        {K, T, O},
        {L, I},
        {L, I, B, O},
        {M, N, E},
        {M, O, ZH, E, T},
        {M, Y},
        {N, A},
        {N, A, D, O},
        {N, A, SH},
        {N, E},
        {N, E, G, O},
        {N, E, E},
        {N, E, T},
        {N, I},
        {N, I, X},
        {N, O},
        {N, U},
        {O},
        {O, B},
        {O, D, N, A, K, O},
        {O, N},
        {O, N, A},
        {O, N, I},
        {O, N, O},
        {O, T},
        {O, CH, E, N, SOFT},
        {P, O},
        {P, O, D},
        {P, R, I},
        {S},
        {S, O},
        {T, A, K},
        {T, A, K, ZH, E},
        {T, A, K, O, I_},
        {T, A, M},
        {T, E},
        {T, E, M},
        {T, O},
        {T, O, G, O},
        {T, O, ZH, E},
        {T, O, I_},
        {T, O, L, SOFT, K, O},
        {T, O, M},
        {T, Y},
        {U},
        {U, ZH, E},
        {X, O, T, IA},
        {CH, E, G, O},
        {CH, E, I_},
        {CH, E, M},
        {CH, T, O},
        {CH, T, O, B, Y},
        {CH, SOFT, E},
        {CH, SOFT, IA},
        {AE, T, A},
        {AE, T, I},
        {AE, T, O},
        {IA}
    };

    /**
     * Contains the stopwords used with the StopFilter.
     * Assigned exactly once by each constructor.
     */
    private final Hashtable stoptable;

    /**
     * Charset for Russian letters.
     * Represents encoding for 32 lowercase Russian letters.
     * Predefined charsets can be taken from the RussianCharsets class.
     */
    private final char[] charset;

    /**
     * Builds an analyzer that uses the default stop words, translated into
     * the given charset.
     *
     * @param charset table mapping letter indexes to characters
     */
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
        stoptable = StopFilter.makeStopTable(makeStopWords(charset));
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param charset   table mapping letter indexes to characters
     * @param stopwords stop words, already expressed in the given charset
     */
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        stoptable = StopFilter.makeStopTable(stopwords);
    }

    /**
     * Takes the built-in Russian stop words and translates them to a String
     * array, using the given charset.
     *
     * @param charset table mapping letter indexes to characters
     * @return one string per entry of RUSSIAN_STOP_WORDS, in the same order
     */
    private static String[] makeStopWords(char[] charset)
    {
        String[] res = new String[RUSSIAN_STOP_WORDS.length];
        for (int i = 0; i < res.length; i++)
        {
            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
            // translate the word letter by letter, using the charset
            char[] translated = new char[theStopWord.length];
            for (int j = 0; j < theStopWord.length; j++)
            {
                translated[j] = charset[theStopWord[j]];
            }
            res[i] = new String(translated);
        }
        return res;
    }

    /**
     * Builds an analyzer with the given stop words.
     * The table is used as passed in; it is not copied.
     *
     * @param charset   table mapping letter indexes to characters
     * @param stopwords stop word table handed to the StopFilter
     */
    public RussianAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
        stoptable = stopwords;
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @return A TokenStream built from a RussianLetterTokenizer filtered with
     *         RussianLowerCaseFilter, StopFilter, and RussianStemFilter
     */
    public final TokenStream tokenStream(String fieldName, Reader reader)
    {
        TokenStream result = new RussianLetterTokenizer(reader, charset);
        result = new RussianLowerCaseFilter(result, charset);
        result = new StopFilter(result, stoptable);
        result = new RussianStemFilter(result, charset);
        return result;
    }
}

View File

@ -0,0 +1,317 @@
package org.apache.lucene.analysis.ru;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
 * RussianCharsets holds encoding schemes (charsets) for Russian characters in
 * Unicode, KOI8 and CP1251, together with a toLowerCase() implementation that
 * understands them.
 * Every charset lists the lowercase letters at positions 0-31 and the matching
 * uppercase letters at positions 32-63.
 * Other encoding schemes (like ISO-8859-5 or customized ones) can be supported by
 * adding a new charset and extending toLowerCase() with logic for that charset.
 *
 * @author Boris Okner
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset
    public static char[] UnicodeRussian = {
        // lower case
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    // KOI8 charset
    public static char[] KOI8 = {
        // lower case
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
    };

    // CP1251 charset
    public static char[] CP1251 = {
        // lower case
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
    };

    /**
     * Lower-cases a single character according to the given charset.
     * The charset is recognised by identity: it must be one of the arrays
     * declared in this class. Characters outside the charset's letter range,
     * and characters of any other charset, are handed to
     * {@link Character#toLowerCase(char)}.
     *
     * @param letter  character to convert
     * @param charset UnicodeRussian, KOI8 or CP1251
     * @return the lowercase form of the letter
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        // Unicode: the uppercase range sits 32 positions below the lowercase range
        if (charset == UnicodeRussian && letter >= '\u0410' && letter <= '\u044F')
        {
            return letter <= '\u042F' ? (char) (letter + 32) : letter;
        }
        // KOI8: the uppercase range sits 32 positions above the lowercase range
        if (charset == KOI8 && letter >= 0xc0 && letter <= 0xff)
        {
            return letter >= 0xe0 ? (char) (letter - 32) : letter;
        }
        // CP1251: the uppercase range sits 32 positions below the lowercase range
        if (charset == CP1251 && letter >= 0xC0 && letter <= 0xFF)
        {
            return letter <= 0xDF ? (char) (letter + 32) : letter;
        }
        return Character.toLowerCase(letter);
    }
}

View File

@ -0,0 +1,95 @@
package org.apache.lucene.analysis.ru;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
/**
 * A RussianLetterTokenizer is a tokenizer that works like LetterTokenizer but additionally
 * looks up characters in a given "russian charset". The problem with LetterTokenizer is
 * that it relies on the Character.isLetter() method, which doesn't know how to detect
 * letters in encodings like CP1251 and KOI8
 * (well-known problems with 0xD7 and 0xF7 chars).
 *
 * @version $Id$
 */
public class RussianLetterTokenizer extends CharTokenizer
{
    /** Characters that count as token characters in addition to Unicode letters. */
    private char[] charset;

    /** Construct a new RussianLetterTokenizer reading from the given input. */
    public RussianLetterTokenizer(Reader in, char[] charset)
    {
        super(in);
        this.charset = charset;
    }

    /**
     * Accepts characters which satisfy {@link Character#isLetter(char)}
     * as well as every character contained in the charset.
     */
    protected boolean isTokenChar(char c)
    {
        boolean accepted = Character.isLetter(c);
        int pos = 0;
        // the charset is only scanned when the JDK does not see a letter
        while (!accepted && pos < charset.length)
        {
            accepted = (charset[pos++] == c);
        }
        return accepted;
    }
}

View File

@ -0,0 +1,97 @@
package org.apache.lucene.analysis.ru;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Normalizes token text to lower case, analyzing given ("russian") charset.
 *
 * @version $Id$
 */
public final class RussianLowerCaseFilter extends TokenFilter
{
    /** Charset passed to RussianCharsets.toLowerCase() for every character. */
    char[] charset;

    /**
     * @param in      stream whose tokens are lower-cased
     * @param charset charset the token text is expressed in
     */
    public RussianLowerCaseFilter(TokenStream in, char[] charset)
    {
        input = in;
        this.charset = charset;
    }

    /**
     * @return the next token with its text lower-cased, or null at end of stream.
     *         Offsets and token type are carried over from the input token.
     */
    public final Token next() throws java.io.IOException
    {
        Token t = input.next();
        if (t == null)
            return null;
        char[] chArray = t.termText().toCharArray();
        for (int i = 0; i < chArray.length; i++)
        {
            chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
        }
        // Build the replacement token with the original type as well as the
        // offsets, so the tokenizer's type is not reset to the default
        // (same approach as RussianStemFilter).
        return new Token(new String(chArray), t.startOffset(), t.endOffset(), t.type());
    }
}

View File

@ -0,0 +1,115 @@
package org.apache.lucene.analysis.ru;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
/**
 * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
 * The input should be passed through RussianLowerCaseFilter before it reaches
 * RussianStemFilter, because RussianStemFilter only works with the lowercase part of any
 * "russian" charset.
 * @author Boris Okner
 * @version $Id$
 */
public final class RussianStemFilter extends TokenFilter
{
    /**
     * The token most recently read from the input stream.
     */
    private Token token = null;
    private RussianStemmer stemmer = null;

    /**
     * @param in      stream whose tokens are stemmed
     * @param charset charset handed to the RussianStemmer created for this filter
     */
    public RussianStemFilter(TokenStream in, char[] charset)
    {
        stemmer = new RussianStemmer(charset);
        input = in;
    }

    /**
     * @return Returns the next token in the stream, or null at EOS.
     *         The input token itself is returned when stemming leaves its text unchanged.
     */
    public final Token next() throws IOException
    {
        token = input.next();
        if (token == null)
        {
            return null;
        }
        String stem = stemmer.stem(token.termText());
        return stem.equals(token.termText())
            ? token
            : new Token(stem, token.startOffset(), token.endOffset(), token.type());
    }

    /**
     * Set an alternative/custom RussianStemmer for this filter.
     * A null argument is ignored and the current stemmer is kept.
     */
    public void setStemmer(RussianStemmer stemmer)
    {
        if (stemmer == null)
        {
            return;
        }
        this.stemmer = stemmer;
    }
}

View File

@ -0,0 +1,857 @@
package org.apache.lucene.analysis.ru;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
* Creation date: (12/02/2002 10:34:15 PM)
* @author Boris Okner
* @version $Id$
*/
class RussianStemmer
{
// table translating the letter indexes below into actual characters
private char[] charset;
// positions of RV, R1 and R2 respectively
private int RV, R1, R2;
// Letter indexes: each constant is a position inside the charset array.
// They are never reassigned, so they are declared final.
private static final char A = 0;
private static final char B = 1;
private static final char V = 2;
private static final char G = 3;
private static final char D = 4;
private static final char E = 5;
private static final char ZH = 6;
private static final char Z = 7;
private static final char I = 8;
private static final char I_ = 9;
private static final char K = 10;
private static final char L = 11;
private static final char M = 12;
private static final char N = 13;
private static final char O = 14;
private static final char P = 15;
private static final char R = 16;
private static final char S = 17;
private static final char T = 18;
private static final char U = 19;
private static final char F = 20;
private static final char X = 21;
private static final char TS = 22;
private static final char CH = 23;
private static final char SH = 24;
private static final char SHCH = 25;
private static final char HARD = 26;
private static final char Y = 27;
private static final char SOFT = 28;
private static final char AE = 29;
private static final char IU = 30;
private static final char IA = 31;
// Stem definitions. Every ending is spelled with letter indexes.
// findEnding() scans each table from its last entry to its first, so the
// entry order within a table is significant and must be kept.
private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
private static final char[][] perfectiveGerundEndings1 = {
    { V },
    { V, SH, I },
    { V, SH, I, S, SOFT }
};
// endings from perfectiveGerundEndings1 only count when preceded by one of these
private static final char[][] perfectiveGerund1Predessors = {
    { A },
    { IA }
};
private static final char[][] perfectiveGerundEndings2 = {
    { I, V },
    { Y, V },
    { I, V, SH, I },
    { Y, V, SH, I },
    { I, V, SH, I, S, SOFT },
    { Y, V, SH, I, S, SOFT }
};
private static final char[][] adjectiveEndings = {
    { E, E },
    { I, E },
    { Y, E },
    { O, E },
    { E, I_ },
    { I, I_ },
    { Y, I_ },
    { O, I_ },
    { E, M },
    { I, M },
    { Y, M },
    { O, M },
    { I, X },
    { Y, X },
    { U, IU },
    { IU, IU },
    { A, IA },
    { IA, IA },
    { O, IU },
    { E, IU },
    { I, M, I },
    { Y, M, I },
    { E, G, O },
    { O, G, O },
    { E, M, U },
    { O, M, U }
};
private static final char[][] participleEndings1 = {
    { SHCH },
    { E, M },
    { N, N },
    { V, SH },
    { IU, SHCH }
};
private static final char[][] participleEndings2 = {
    { I, V, SH },
    { Y, V, SH },
    { U, IU, SHCH }
};
// endings from participleEndings1 only count when preceded by one of these
private static final char[][] participle1Predessors = {
    { A },
    { IA }
};
private static final char[][] reflexiveEndings = {
    { S, IA },
    { S, SOFT }
};
private static final char[][] verbEndings1 = {
    { I_ },
    { L },
    { N },
    { L, O },
    { N, O },
    { E, T },
    { IU, T },
    { L, A },
    { N, A },
    { L, I },
    { E, M },
    { N, Y },
    { E, T, E },
    { I_, T, E },
    { T, SOFT },
    { E, SH, SOFT },
    { N, N, O }
};
private static final char[][] verbEndings2 = {
    { IU },
    { U, IU },
    { E, N },
    { E, I_ },
    { IA, T },
    { U, I_ },
    { I, L },
    { Y, L },
    { I, M },
    { Y, M },
    { I, T },
    { Y, T },
    { I, L, A },
    { Y, L, A },
    { E, N, A },
    { I, T, E },
    { I, L, I },
    { Y, L, I },
    { I, L, O },
    { Y, L, O },
    { E, N, O },
    { U, E, T },
    { U, IU, T },
    { E, N, Y },
    { I, T, SOFT },
    { Y, T, SOFT },
    { I, SH, SOFT },
    { E, I_, T, E },
    { U, I_, T, E }
};
private static final char[][] verb1Predessors = {
    { A },
    { IA }
};
// Noun endings, spelled with letter indexes and ordered from shortest to
// longest. The one-letter group previously listed { U } twice and had no
// { IU } entry; the duplicate is replaced by the missing { IU } ending.
private static final char[][] nounEndings = {
    { A },
    { U },
    { I_ },
    { O },
    { IU },
    { E },
    { Y },
    { I },
    { SOFT },
    { IA },
    { E, V },
    { O, V },
    { I, E },
    { SOFT, E },
    { IA, X },
    { I, IU },
    { E, I },
    { I, I },
    { E, I_ },
    { O, I_ },
    { E, M },
    { A, M },
    { O, M },
    { A, X },
    { SOFT, IU },
    { I, IA },
    { SOFT, IA },
    { I, I_ },
    { IA, M },
    { IA, M, I },
    { A, M, I },
    { I, E, I_ },
    { I, IA, M },
    { I, E, M },
    { I, IA, X },
    { I, IA, M, I }
};
// Superlative endings, spelled with letter indexes.
private static final char[][] superlativeEndings = {
    { E, I_, SH },
    { E, I_, SH, E }
};
// Derivational endings; derivational() removes them only inside R2.
private static final char[][] derivationalEndings = {
    { O, S, T },
    { O, S, T, SOFT }
};
/**
* RussianStemmer constructor comment.
*/
public RussianStemmer()
{
super();
}
/**
 * Creates a stemmer working on the given charset.
 *
 * @param charset table translating letter indexes into characters
 */
public RussianStemmer(char[] charset)
{
    this.charset = charset;
}
/**
 * Removes an adjectival ending from the stemming zone.
 * An adjectival ending is an adjective ending,
 * optionally preceded by a participle ending.
 * Creation date: (17/03/2002 12:14:58 AM)
 * @param stemmingZone buffer to examine; shortened in place when endings are found
 * @return true if an adjective ending was found and removed, false otherwise
 */
private boolean adjectival(StringBuffer stemmingZone)
{
    // look for adjective ending in a stemming zone
    if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
        return false;
    // The participle ending is optional, so the outcome of this step does not
    // influence the return value. The second group is only tried when the
    // first one (which needs a predecessor) did not match.
    if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
    {
        findAndRemoveEnding(stemmingZone, participleEndings2);
    }
    return true;
}
/**
 * Removes a derivational ending from the stemming zone, provided the ending
 * lies inside R2.
 * Creation date: (17/03/2002 12:14:58 AM)
 * @param stemmingZone buffer to examine; shortened in place on success
 * @return true if a derivational ending was removed, false otherwise
 */
private boolean derivational(StringBuffer stemmingZone)
{
    final int suffixLength = findEnding(stemmingZone, derivationalEndings);
    if (suffixLength == 0)
    {
        // no derivational ending found
        return false;
    }
    final int remaining = stemmingZone.length() - suffixLength;
    // the ending must start at or after R2 (expressed relative to RV)
    if (R2 - RV > remaining)
    {
        return false;
    }
    stemmingZone.setLength(remaining);
    return true;
}
/**
 * Looks for an ending of the given class that terminates at startIndex of the
 * stemming zone. The class is scanned from its last entry to its first and the
 * first match wins.
 * Creation date: (17/03/2002 8:18:34 PM)
 * @return int length of the ending found (0, if not found)
 * @param stemmingZone java.lang.StringBuffer text to inspect
 * @param startIndex index of the last character to compare
 * @param theEndingClass char[][] candidate endings, spelled with letter indexes
 */
private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
{
    candidates:
    for (int c = theEndingClass.length - 1; c >= 0; c--)
    {
        final char[] candidate = theEndingClass[c];
        // skip endings that are longer than the text up to startIndex
        if (candidate.length - 1 > startIndex)
        {
            continue;
        }
        // compare right to left, translating letter indexes through the charset
        int zonePos = startIndex;
        for (int k = candidate.length - 1; k >= 0; k--, zonePos--)
        {
            if (stemmingZone.charAt(zonePos) != charset[candidate[k]])
            {
                continue candidates;
            }
        }
        return candidate.length;
    }
    return 0;
}
/**
 * Looks for one of the given endings at the very end of the stemming zone.
 *
 * @param stemmingZone   buffer that is searched; not modified
 * @param theEndingClass endings to look for
 * @return length of the ending found, or 0 if none of the endings matches
 */
private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
    int lastIndex = stemmingZone.length() - 1;
    return findEnding(stemmingZone, lastIndex, theEndingClass);
}
/**
 * Looks for one of the given endings at the end of the stemming zone and
 * cuts it off when found.
 *
 * @param stemmingZone   buffer to search; shortened in place on a match
 * @param theEndingClass endings to look for
 * @return true if an ending was found and removed, false otherwise
 */
private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
    int matchedLength = findEnding(stemmingZone, theEndingClass);
    boolean found = matchedLength != 0;
    if (found)
    {
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - matchedLength);
    }
    return found;
}
/**
 * Looks for one of the given endings at the end of the stemming zone and
 * cuts it off, but only if the ending is directly preceded by one of the
 * given predecessors. The predecessor itself is kept.
 *
 * @param stemmingZone   buffer to search; shortened in place on a match
 * @param theEndingClass endings to look for
 * @param thePredessors  character sequences of which one has to stand
 *                       directly in front of the ending
 * @return true if an ending with a valid predecessor was found and removed,
 *         false otherwise
 */
private boolean findAndRemoveEnding(StringBuffer stemmingZone,
    char[][] theEndingClass, char[][] thePredessors)
{
    int endingLength = findEnding(stemmingZone, theEndingClass);
    if (endingLength == 0)
    {
        // no ending at all
        return false;
    }
    // length of the zone once the ending is cut off; the predecessor has
    // to end on the character right before that position
    int remainingLength = stemmingZone.length() - endingLength;
    if (findEnding(stemmingZone, remainingLength - 1, thePredessors) == 0)
    {
        return false;
    }
    stemmingZone.setLength(remainingLength);
    return true;
}
/**
 * Determines the start positions of the regions RV, R1 and R2 in the given
 * word and stores them in the fields of the same name. A region that turns
 * out to be empty keeps the value 0, as do all regions after it.
 *
 * @param word the word to analyze
 */
private void markPositions(String word)
{
    RV = 0;
    R1 = 0;
    R2 = 0;
    final int length = word.length();
    int pos = 0;

    // RV starts right after the first vowel
    while (pos < length && !isVowel(word.charAt(pos)))
    {
        pos++;
    }
    pos++;
    if (pos >= length)
    {
        return; // RV zone is empty
    }
    RV = pos;

    // R1 starts right after the first non-vowel that follows
    while (pos < length && isVowel(word.charAt(pos)))
    {
        pos++;
    }
    pos++;
    if (pos >= length)
    {
        return; // R1 zone is empty
    }
    R1 = pos;

    // R2: the same "vowel, then non-vowel" search, started from R1
    while (pos < length && !isVowel(word.charAt(pos)))
    {
        pos++;
    }
    pos++;
    if (pos >= length)
    {
        return; // R2 zone is empty
    }
    while (pos < length && isVowel(word.charAt(pos)))
    {
        pos++;
    }
    pos++;
    if (pos >= length)
    {
        return; // R2 zone is empty
    }
    R2 = pos;
}
/**
 * Tells whether the given character is one of the vowels of the current
 * charset.
 *
 * @param letter the character to check
 * @return true if the character equals the charset entry of any constant
 *         listed in <code>vowels</code>, false otherwise
 */
private boolean isVowel(char letter)
{
    int idx = 0;
    while (idx < vowels.length)
    {
        if (charset[vowels[idx]] == letter)
        {
            return true;
        }
        idx++;
    }
    return false;
}
/**
 * Removes a noun ending from the end of the stemming zone.
 *
 * @param stemmingZone buffer to search; shortened in place on a match
 * @return true if a noun ending was found and removed, false otherwise
 */
private boolean noun(StringBuffer stemmingZone)
{
    boolean removed = findAndRemoveEnding(stemmingZone, nounEndings);
    return removed;
}
/**
 * Removes a perfective gerund ending from the end of the stemming zone.
 * Endings of the first group are only removed when preceded by one of the
 * listed predecessors; the second group is tried when the first one fails.
 *
 * @param stemmingZone buffer to search; shortened in place on a match
 * @return true if a perfective gerund ending was found and removed,
 *         false otherwise
 */
private boolean perfectiveGerund(StringBuffer stemmingZone)
{
    if (findAndRemoveEnding(stemmingZone,
            perfectiveGerundEndings1, perfectiveGerund1Predessors))
    {
        return true;
    }
    return findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
}
/**
 * Removes a reflexive ending from the end of the stemming zone.
 *
 * @param stemmingZone buffer to search; shortened in place on a match
 * @return true if a reflexive ending was found and removed, false otherwise
 */
private boolean reflexive(StringBuffer stemmingZone)
{
    boolean removed = findAndRemoveEnding(stemmingZone, reflexiveEndings);
    return removed;
}
/**
 * Removes the last character of the stemming zone if it is the charset's
 * letter for the constant <code>I</code>.
 *
 * @param stemmingZone buffer to check; shortened by one character on a match
 * @return true if the letter was removed, false otherwise
 */
private boolean removeI(StringBuffer stemmingZone)
{
    int lastIndex = stemmingZone.length() - 1;
    if (lastIndex < 0 || stemmingZone.charAt(lastIndex) != charset[I])
    {
        return false;
    }
    stemmingZone.setLength(lastIndex);
    return true;
}
/**
 * Removes the last character of the stemming zone if it is the charset's
 * letter for the constant <code>SOFT</code>.
 *
 * @param stemmingZone buffer to check; shortened by one character on a match
 * @return true if the letter was removed, false otherwise
 */
private boolean removeSoft(StringBuffer stemmingZone)
{
    int lastIndex = stemmingZone.length() - 1;
    if (lastIndex < 0 || stemmingZone.charAt(lastIndex) != charset[SOFT])
    {
        return false;
    }
    stemmingZone.setLength(lastIndex);
    return true;
}
/**
 * Replaces the charset used to translate the letter constants of the
 * ending tables into characters.
 *
 * @param newCharset the charset to use from now on
 */
public void setCharset(char[] newCharset)
{
    this.charset = newCharset;
}
/**
 * Assigns the vowel list and all ending tables used by the stemming steps.
 * Every entry is a sequence of letter constants; findEnding() and isVowel()
 * translate them into characters through the charset.
 * findEnding() walks a table from its last entry to its first and returns
 * the first match, so the order of the entries affects which ending is chosen.
 * NOTE(review): superlativeEndings and derivationalEndings are also
 * initialized with the same values where they are declared, and no caller
 * of this method is visible in this part of the file - confirm whether it
 * is still needed.
 * Creation date: (16/03/2002 11:16:36 PM)
 */
private void setEndings()
{
// letters that isVowel() treats as vowels
vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };
// perfective gerund endings, group 1: only removed after one of
// perfectiveGerund1Predessors (see perfectiveGerund())
perfectiveGerundEndings1 = new char[][] {
{ V }, { V, SH, I }, { V, SH, I, S, SOFT }
};
perfectiveGerund1Predessors = new char[][] { { A }, { IA }
};
// perfective gerund endings, group 2: removed without a predecessor check
perfectiveGerundEndings2 = new char[][] {
{ I, V },
{ Y, V },
{ I, V, SH, I },
{ Y, V, SH, I },
{ I, V, SH, I, S, SOFT },
{ Y, V, SH, I, S, SOFT }
};
// adjective endings (see adjectival())
adjectiveEndings = new char[][] {
{ E, E },
{ I, E },
{ Y, E },
{ O, E },
{ E, I_ },
{ I, I_ },
{ Y, I_ },
{ O, I_ },
{ E, M },
{ I, M },
{ Y, M },
{ O, M },
{ I, X },
{ Y, X },
{ U, IU },
{ IU, IU },
{ A, IA },
{ IA, IA },
{ O, IU },
{ E, IU },
{ I, M, I },
{ Y, M, I },
{ E, G, O },
{ O, G, O },
{ E, M, U },
{ O, M, U }
};
// participle endings, group 1: only removed after one of
// participle1Predessors (see adjectival())
participleEndings1 = new char[][] {
{ SHCH },
{ E, M },
{ N, N },
{ V, SH },
{ IU, SHCH }
};
// participle endings, group 2: removed without a predecessor check
participleEndings2 = new char[][] {
{ I, V, SH },
{ Y, V, SH },
{ U, IU, SHCH }
};
participle1Predessors = new char[][] {
{ A },
{ IA }
};
// reflexive endings (see reflexive())
reflexiveEndings = new char[][] {
{ S, IA },
{ S, SOFT }
};
// verb endings, group 1: only removed after one of verb1Predessors
// (see verb())
verbEndings1 = new char[][] {
{ I_ },
{ L },
{ N },
{ L, O },
{ N, O },
{ E, T },
{ IU, T },
{ L, A },
{ N, A },
{ L, I },
{ E, M },
{ N, Y },
{ E, T, E },
{ I_, T, E },
{ T, SOFT },
{ E, SH, SOFT },
{ N, N, O }
};
// verb endings, group 2: removed without a predecessor check
verbEndings2 = new char[][] {
{ IU },
{ U, IU },
{ E, N },
{ E, I_ },
{ IA, T },
{ U, I_ },
{ I, L },
{ Y, L },
{ I, M },
{ Y, M },
{ I, T },
{ Y, T },
{ I, L, A },
{ Y, L, A },
{ E, N, A },
{ I, T, E },
{ I, L, I },
{ Y, L, I },
{ I, L, O },
{ Y, L, O },
{ E, N, O },
{ U, E, T },
{ U, IU, T },
{ E, N, Y },
{ I, T, SOFT },
{ Y, T, SOFT },
{ I, SH, SOFT },
{ E, I_, T, E },
{ U, I_, T, E }
};
verb1Predessors = new char[][] {
{ A },
{ IA }
};
// noun endings (see noun())
nounEndings = new char[][] {
{ A },
{ IU },
{ I_ },
{ O },
{ U },
{ E },
{ Y },
{ I },
{ SOFT },
{ IA },
{ E, V },
{ O, V },
{ I, E },
{ SOFT, E },
{ IA, X },
{ I, IU },
{ E, I },
{ I, I },
{ E, I_ },
{ O, I_ },
{ E, M },
{ A, M },
{ O, M },
{ A, X },
{ SOFT, IU },
{ I, IA },
{ SOFT, IA },
{ I, I_ },
{ IA, M },
{ IA, M, I },
{ A, M, I },
{ I, E, I_ },
{ I, IA, M },
{ I, E, M },
{ I, IA, X },
{ I, IA, M, I }
};
// superlative endings (see superlative())
superlativeEndings = new char[][] {
{ E, I_, SH },
{ E, I_, SH, E }
};
// derivational endings (see derivational())
derivationalEndings = new char[][] {
{ O, S, T },
{ O, S, T, SOFT }
};
}
/**
 * Finds the stem of the given Russian word. The word is expected to be
 * written with the characters of the stemmer's current charset.
 * Only the RV part of the word is changed; the characters in front of it
 * are copied to the result as they are.
 *
 * @param input the word to stem
 * @return the stem, or the unchanged input if no RV region was detected
 */
public String stem(String input)
{
    markPositions(input);
    if (RV == 0)
    {
        // RV wasn't detected, nothing to stem
        return input;
    }
    // stemming goes on in RV
    StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

    // Step 1: a perfective gerund ending, or else an optional reflexive
    // ending followed by the first of adjectival / verb / noun that matches
    if (!perfectiveGerund(stemmingZone))
    {
        reflexive(stemmingZone);
        if (!adjectival(stemmingZone) && !verb(stemmingZone))
        {
            noun(stemmingZone);
        }
    }

    // Step 2
    removeI(stemmingZone);

    // Step 3
    derivational(stemmingZone);

    // Step 4
    superlative(stemmingZone);
    undoubleN(stemmingZone);
    removeSoft(stemmingZone);

    // put the untouched head of the word back in front of the stemmed zone
    String head = input.substring(0, RV);
    return head + stemmingZone.toString();
}
/**
 * Removes a superlative ending from the end of the stemming zone.
 *
 * @param stemmingZone buffer to search; shortened in place on a match
 * @return true if a superlative ending was found and removed, false otherwise
 */
private boolean superlative(StringBuffer stemmingZone)
{
    boolean removed = findAndRemoveEnding(stemmingZone, superlativeEndings);
    return removed;
}
/**
 * Undoubles N: if the stemming zone ends with two letters N (as given by
 * the charset), the last one is removed.
 *
 * @param stemmingZone buffer to check; shortened by one character on a match
 * @return true if a doubled N was found and reduced to a single N,
 *         false otherwise
 */
private boolean undoubleN(StringBuffer stemmingZone)
{
    int length = stemmingZone.length();
    // the last character is compared first, then the one before it
    boolean endsWithDoubleN =
        length >= 2
            && stemmingZone.charAt(length - 1) == charset[N]
            && stemmingZone.charAt(length - 2) == charset[N];
    if (!endsWithDoubleN)
    {
        return false;
    }
    stemmingZone.setLength(length - 1);
    return true;
}
/**
 * Removes a verb ending from the end of the stemming zone.
 * Endings of the first group are only removed when preceded by one of the
 * listed predecessors; the second group is tried when the first one fails.
 *
 * @param stemmingZone buffer to search; shortened in place on a match
 * @return true if a verb ending was found and removed, false otherwise
 */
private boolean verb(StringBuffer stemmingZone)
{
    if (findAndRemoveEnding(stemmingZone, verbEndings1, verb1Predessors))
    {
        return true;
    }
    return findAndRemoveEnding(stemmingZone, verbEndings2);
}
/**
 * Stems a word with a stemmer created for the given charset. A new stemmer
 * instance is created on every call.
 *
 * @param theWord the word to stem
 * @param charset the charset the word is written in
 * @return the stem of the word
 */
public static String stem(String theWord, char[] charset)
{
    return new RussianStemmer(charset).stem(theWord);
}
}