mirror of https://github.com/apache/lucene.git
- Russian Analyzer, by Boris Okner. Initial checkin.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149840 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e79dfcfaa1
commit
e63750554f
|
@ -0,0 +1,291 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/**
|
||||
* Analyzer for Russian language. Supports an external list of stopwords (words that
|
||||
* will not be indexed at all).
|
||||
* A default set of stopwords is used unless an alternative list is specified.
|
||||
*
|
||||
* @author Boris Okner
|
||||
* @version $Id
|
||||
*/
|
||||
public final class RussianAnalyzer extends Analyzer
|
||||
{
|
||||
// letters
|
||||
private static char A = 0;
|
||||
private static char B = 1;
|
||||
private static char V = 2;
|
||||
private static char G = 3;
|
||||
private static char D = 4;
|
||||
private static char E = 5;
|
||||
private static char ZH = 6;
|
||||
private static char Z = 7;
|
||||
private static char I = 8;
|
||||
private static char I_ = 9;
|
||||
private static char K = 10;
|
||||
private static char L = 11;
|
||||
private static char M = 12;
|
||||
private static char N = 13;
|
||||
private static char O = 14;
|
||||
private static char P = 15;
|
||||
private static char R = 16;
|
||||
private static char S = 17;
|
||||
private static char T = 18;
|
||||
private static char U = 19;
|
||||
private static char F = 20;
|
||||
private static char X = 21;
|
||||
private static char TS = 22;
|
||||
private static char CH = 23;
|
||||
private static char SH = 24;
|
||||
private static char SHCH = 25;
|
||||
private static char HARD = 26;
|
||||
private static char Y = 27;
|
||||
private static char SOFT = 28;
|
||||
private static char AE = 29;
|
||||
private static char IU = 30;
|
||||
private static char IA = 31;
|
||||
|
||||
/**
|
||||
* List of typical Russian stopwords.
|
||||
*/
|
||||
private static char[][] RUSSIAN_STOP_WORDS = {
|
||||
{A},
|
||||
{B, E, Z},
|
||||
{B, O, L, E, E},
|
||||
{B, Y},
|
||||
{B, Y, L},
|
||||
{B, Y, L, A},
|
||||
{B, Y, L, I},
|
||||
{B, Y, L, O},
|
||||
{B, Y, T, SOFT},
|
||||
{V},
|
||||
{V, A, M},
|
||||
{V, A, S},
|
||||
{V, E, S, SOFT},
|
||||
{V, O},
|
||||
{V, O, T},
|
||||
{V, S, E},
|
||||
{V, S, E, G, O},
|
||||
{V, S, E, X},
|
||||
{V, Y},
|
||||
{G, D, E},
|
||||
{D, A},
|
||||
{D, A, ZH, E},
|
||||
{D, L, IA},
|
||||
{D, O},
|
||||
{E, G, O},
|
||||
{E, E},
|
||||
{E, I_,},
|
||||
{E, IU},
|
||||
{E, S, L, I},
|
||||
{E, S, T, SOFT},
|
||||
{E, SHCH, E},
|
||||
{ZH, E},
|
||||
{Z, A},
|
||||
{Z, D, E, S, SOFT},
|
||||
{I},
|
||||
{I, Z},
|
||||
{I, L, I},
|
||||
{I, M},
|
||||
{I, X},
|
||||
{K},
|
||||
{K, A, K},
|
||||
{K, O},
|
||||
{K, O, G, D, A},
|
||||
{K, T, O},
|
||||
{L, I},
|
||||
{L, I, B, O},
|
||||
{M, N, E},
|
||||
{M, O, ZH, E, T},
|
||||
{M, Y},
|
||||
{N, A},
|
||||
{N, A, D, O},
|
||||
{N, A, SH},
|
||||
{N, E},
|
||||
{N, E, G, O},
|
||||
{N, E, E},
|
||||
{N, E, T},
|
||||
{N, I},
|
||||
{N, I, X},
|
||||
{N, O},
|
||||
{N, U},
|
||||
{O},
|
||||
{O, B},
|
||||
{O, D, N, A, K, O},
|
||||
{O, N},
|
||||
{O, N, A},
|
||||
{O, N, I},
|
||||
{O, N, O},
|
||||
{O, T},
|
||||
{O, CH, E, N, SOFT},
|
||||
{P, O},
|
||||
{P, O, D},
|
||||
{P, R, I},
|
||||
{S},
|
||||
{S, O},
|
||||
{T, A, K},
|
||||
{T, A, K, ZH, E},
|
||||
{T, A, K, O, I_},
|
||||
{T, A, M},
|
||||
{T, E},
|
||||
{T, E, M},
|
||||
{T, O},
|
||||
{T, O, G, O},
|
||||
{T, O, ZH, E},
|
||||
{T, O, I_},
|
||||
{T, O, L, SOFT, K, O},
|
||||
{T, O, M},
|
||||
{T, Y},
|
||||
{U},
|
||||
{U, ZH, E},
|
||||
{X, O, T, IA},
|
||||
{CH, E, G, O},
|
||||
{CH, E, I_},
|
||||
{CH, E, M},
|
||||
{CH, T, O},
|
||||
{CH, T, O, B, Y},
|
||||
{CH, SOFT, E},
|
||||
{CH, SOFT, IA},
|
||||
{AE, T, A},
|
||||
{AE, T, I},
|
||||
{AE, T, O},
|
||||
{IA}
|
||||
};
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Hashtable stoptable = new Hashtable();
|
||||
|
||||
/**
|
||||
* Charset for Russian letters.
|
||||
* Represents encoding for 32 lowercase Russian letters.
|
||||
* Predefined charsets can be taken from RussianCharSets class
|
||||
*/
|
||||
private char[] charset;
|
||||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public RussianAnalyzer(char[] charset)
|
||||
{
|
||||
this.charset = charset;
|
||||
stoptable = StopFilter.makeStopTable(makeStopWords(charset));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public RussianAnalyzer(char[] charset, String[] stopwords)
|
||||
{
|
||||
this.charset = charset;
|
||||
stoptable = StopFilter.makeStopTable(stopwords);
|
||||
}
|
||||
|
||||
// Takes russian stop words and translates them to a String array, using
|
||||
// the given charset
|
||||
private static String[] makeStopWords(char[] charset)
|
||||
{
|
||||
String[] res = new String[RUSSIAN_STOP_WORDS.length];
|
||||
for (int i = 0; i < res.length; i++)
|
||||
{
|
||||
char[] theStopWord = RUSSIAN_STOP_WORDS[i];
|
||||
// translate the word,using the charset
|
||||
StringBuffer theWord = new StringBuffer();
|
||||
for (int j = 0; j < theStopWord.length; j++)
|
||||
{
|
||||
theWord.append(charset[theStopWord[j]]);
|
||||
}
|
||||
res[i] = theWord.toString();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public RussianAnalyzer(char[] charset, Hashtable stopwords)
|
||||
{
|
||||
this.charset = charset;
|
||||
stoptable = stopwords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a RussianLetterTokenizer filtered with
|
||||
* RussianLowerCaseFilter, StopFilter, and RussianStemFilter
|
||||
*/
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader)
|
||||
{
|
||||
TokenStream result = new RussianLetterTokenizer(reader, charset);
|
||||
result = new RussianLowerCaseFilter(result, charset);
|
||||
result = new StopFilter(result, stoptable);
|
||||
result = new RussianStemFilter(result, charset);
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,317 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/**
 * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method
 * implementation for Russian characters in Unicode, KOI8 and CP1251.
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to toLowerCase() method for that charset.
 *
 * @author Boris Okner
 * @version $Id$
 */
public class RussianCharsets
{
    // The three tables are final because toLowerCase() recognises a charset by
    // array identity; reassigning one of them would silently disable its branch.

    /** Unicode Russian charset: 32 lowercase letters followed by 32 uppercase letters. */
    public static final char[] UnicodeRussian = {
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    /** KOI8 charset: 32 lowercase letters followed by 32 uppercase letters. */
    public static final char[] KOI8 = {
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
    };

    /** CP1251 charset: 32 lowercase letters followed by 32 uppercase letters. */
    public static final char[] CP1251 = {
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
    };

    /**
     * Converts a character to lower case according to the given charset.
     *
     * The charset is recognised by reference: only the UnicodeRussian, KOI8 and
     * CP1251 arrays of this class select charset-specific handling. For any other
     * array, or for a letter outside the charset's Russian ranges, the result is
     * Character.toLowerCase(letter).
     *
     * @param letter  the character to convert
     * @param charset one of the charset arrays defined in this class
     * @return the lowercase form of the letter in that charset
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                return (char) (letter + 32);
            }
        }

        if (charset == KOI8)
        {
            // In KOI8 the uppercase block (0xe0-0xff) sits above the lowercase one.
            if (letter >= 0xe0 && letter <= 0xff)
            {
                return (char) (letter - 32);
            }
            // Already lowercase. Must return here: Character.toLowerCase would
            // treat 0xc0-0xdf as Latin-1 capitals and change the value.
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                return letter;
            }
        }

        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                return (char) (letter + 32);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                return letter;
            }
        }

        return Character.toLowerCase(letter);
    }
}
|
|
@ -0,0 +1,95 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.CharTokenizer;
|
||||
|
||||
/**
|
||||
* A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
|
||||
* in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
|
||||
* which doesn't know how to detect letters in encodings like CP1252 and KOI8
|
||||
* (well-known problems with 0xD7 and 0xF7 chars)
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
|
||||
public class RussianLetterTokenizer extends CharTokenizer
|
||||
{
|
||||
/** Construct a new LetterTokenizer. */
|
||||
private char[] charset;
|
||||
|
||||
public RussianLetterTokenizer(Reader in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects only characters which satisfy
|
||||
* {@link Character#isLetter(char)}.
|
||||
*/
|
||||
protected boolean isTokenChar(char c)
|
||||
{
|
||||
if (Character.isLetter(c))
|
||||
return true;
|
||||
for (int i = 0; i < charset.length; i++)
|
||||
{
|
||||
if (c == charset[i])
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Normalizes token text to lower case, analyzing given ("russian") charset.
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class RussianLowerCaseFilter extends TokenFilter
|
||||
{
|
||||
char[] charset;
|
||||
|
||||
public RussianLowerCaseFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
input = in;
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
public final Token next() throws java.io.IOException
|
||||
{
|
||||
Token t = input.next();
|
||||
|
||||
if (t == null)
|
||||
return null;
|
||||
|
||||
String txt = t.termText();
|
||||
|
||||
char[] chArray = txt.toCharArray();
|
||||
for (int i = 0; i < chArray.length; i++)
|
||||
{
|
||||
chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
|
||||
}
|
||||
|
||||
String newTxt = new String(chArray);
|
||||
// create new token
|
||||
Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
|
||||
|
||||
return newToken;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/**
|
||||
* A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
|
||||
* The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter ,
|
||||
* because RussianStemFilter only works with lowercase part of any "russian" charset.
|
||||
* @author Boris Okner
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class RussianStemFilter extends TokenFilter
|
||||
{
|
||||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private RussianStemmer stemmer = null;
|
||||
|
||||
public RussianStemFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
stemmer = new RussianStemmer(charset);
|
||||
input = in;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public final Token next() throws IOException
|
||||
{
|
||||
if ((token = input.next()) == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
else
|
||||
{
|
||||
String s = stemmer.stem(token.termText());
|
||||
if (!s.equals(token.termText()))
|
||||
{
|
||||
return new Token(s, token.startOffset(), token.endOffset(),
|
||||
token.type());
|
||||
}
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a alternative/custom RussianStemmer for this filter.
|
||||
*/
|
||||
public void setStemmer(RussianStemmer stemmer)
|
||||
{
|
||||
if (stemmer != null)
|
||||
{
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,857 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
|
||||
* Creation date: (12/02/2002 10:34:15 PM)
|
||||
* @author Boris Okner
|
||||
* @version $Id$
|
||||
*/
|
||||
class RussianStemmer
|
||||
{
|
||||
private char[] charset;
|
||||
|
||||
// positions of RV, R1 and R2 respectively
|
||||
private int RV, R1, R2;
|
||||
|
||||
// letters
|
||||
private static char A = 0;
|
||||
private static char B = 1;
|
||||
private static char V = 2;
|
||||
private static char G = 3;
|
||||
private static char D = 4;
|
||||
private static char E = 5;
|
||||
private static char ZH = 6;
|
||||
private static char Z = 7;
|
||||
private static char I = 8;
|
||||
private static char I_ = 9;
|
||||
private static char K = 10;
|
||||
private static char L = 11;
|
||||
private static char M = 12;
|
||||
private static char N = 13;
|
||||
private static char O = 14;
|
||||
private static char P = 15;
|
||||
private static char R = 16;
|
||||
private static char S = 17;
|
||||
private static char T = 18;
|
||||
private static char U = 19;
|
||||
private static char F = 20;
|
||||
private static char X = 21;
|
||||
private static char TS = 22;
|
||||
private static char CH = 23;
|
||||
private static char SH = 24;
|
||||
private static char SHCH = 25;
|
||||
private static char HARD = 26;
|
||||
private static char Y = 27;
|
||||
private static char SOFT = 28;
|
||||
private static char AE = 29;
|
||||
private static char IU = 30;
|
||||
private static char IA = 31;
|
||||
|
||||
// stem definitions
|
||||
private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
|
||||
|
||||
private static char[][] perfectiveGerundEndings1 = {
|
||||
{ V },
|
||||
{ V, SH, I },
|
||||
{ V, SH, I, S, SOFT }
|
||||
};
|
||||
|
||||
private static char[][] perfectiveGerund1Predessors = {
|
||||
{ A },
|
||||
{ IA }
|
||||
};
|
||||
|
||||
private static char[][] perfectiveGerundEndings2 = { { I, V }, {
|
||||
Y, V }, {
|
||||
I, V, SH, I }, {
|
||||
Y, V, SH, I }, {
|
||||
I, V, SH, I, S, SOFT }, {
|
||||
Y, V, SH, I, S, SOFT }
|
||||
};
|
||||
|
||||
private static char[][] adjectiveEndings = {
|
||||
{ E, E },
|
||||
{ I, E },
|
||||
{ Y, E },
|
||||
{ O, E },
|
||||
{ E, I_ },
|
||||
{ I, I_ },
|
||||
{ Y, I_ },
|
||||
{ O, I_ },
|
||||
{ E, M },
|
||||
{ I, M },
|
||||
{ Y, M },
|
||||
{ O, M },
|
||||
{ I, X },
|
||||
{ Y, X },
|
||||
{ U, IU },
|
||||
{ IU, IU },
|
||||
{ A, IA },
|
||||
{ IA, IA },
|
||||
{ O, IU },
|
||||
{ E, IU },
|
||||
{ I, M, I },
|
||||
{ Y, M, I },
|
||||
{ E, G, O },
|
||||
{ O, G, O },
|
||||
{ E, M, U },
|
||||
{O, M, U }
|
||||
};
|
||||
|
||||
private static char[][] participleEndings1 = {
|
||||
{ SHCH },
|
||||
{ E, M },
|
||||
{ N, N },
|
||||
{ V, SH },
|
||||
{ IU, SHCH }
|
||||
};
|
||||
|
||||
private static char[][] participleEndings2 = {
|
||||
{ I, V, SH },
|
||||
{ Y, V, SH },
|
||||
{ U, IU, SHCH }
|
||||
};
|
||||
|
||||
private static char[][] participle1Predessors = {
|
||||
{ A },
|
||||
{ IA }
|
||||
};
|
||||
|
||||
private static char[][] reflexiveEndings = {
|
||||
{ S, IA },
|
||||
{ S, SOFT }
|
||||
};
|
||||
|
||||
private static char[][] verbEndings1 = {
|
||||
{ I_ },
|
||||
{ L },
|
||||
{ N },
|
||||
{ L, O },
|
||||
{ N, O },
|
||||
{ E, T },
|
||||
{ IU, T },
|
||||
{ L, A },
|
||||
{ N, A },
|
||||
{ L, I },
|
||||
{ E, M },
|
||||
{ N, Y },
|
||||
{ E, T, E },
|
||||
{ I_, T, E },
|
||||
{ T, SOFT },
|
||||
{ E, SH, SOFT },
|
||||
{ N, N, O }
|
||||
};
|
||||
|
||||
private static char[][] verbEndings2 = {
|
||||
{ IU },
|
||||
{ U, IU },
|
||||
{ E, N },
|
||||
{ E, I_ },
|
||||
{ IA, T },
|
||||
{ U, I_ },
|
||||
{ I, L },
|
||||
{ Y, L },
|
||||
{ I, M },
|
||||
{ Y, M },
|
||||
{ I, T },
|
||||
{ Y, T },
|
||||
{ I, L, A },
|
||||
{ Y, L, A },
|
||||
{ E, N, A },
|
||||
{ I, T, E },
|
||||
{ I, L, I },
|
||||
{ Y, L, I },
|
||||
{ I, L, O },
|
||||
{ Y, L, O },
|
||||
{ E, N, O },
|
||||
{ U, E, T },
|
||||
{ U, IU, T },
|
||||
{ E, N, Y },
|
||||
{ I, T, SOFT },
|
||||
{ Y, T, SOFT },
|
||||
{ I, SH, SOFT },
|
||||
{ E, I_, T, E },
|
||||
{ U, I_, T, E }
|
||||
};
|
||||
|
||||
private static char[][] verb1Predessors = {
|
||||
{ A },
|
||||
{ IA }
|
||||
};
|
||||
|
||||
private static char[][] nounEndings = {
|
||||
{ A },
|
||||
{ U },
|
||||
{ I_ },
|
||||
{ O },
|
||||
{ U },
|
||||
{ E },
|
||||
{ Y },
|
||||
{ I },
|
||||
{ SOFT },
|
||||
{ IA },
|
||||
{ E, V },
|
||||
{ O, V },
|
||||
{ I, E },
|
||||
{ SOFT, E },
|
||||
{ IA, X },
|
||||
{ I, IU },
|
||||
{ E, I },
|
||||
{ I, I },
|
||||
{ E, I_ },
|
||||
{ O, I_ },
|
||||
{ E, M },
|
||||
{ A, M },
|
||||
{ O, M },
|
||||
{ A, X },
|
||||
{ SOFT, IU },
|
||||
{ I, IA },
|
||||
{ SOFT, IA },
|
||||
{ I, I_ },
|
||||
{ IA, M },
|
||||
{ IA, M, I },
|
||||
{ A, M, I },
|
||||
{ I, E, I_ },
|
||||
{ I, IA, M },
|
||||
{ I, E, M },
|
||||
{ I, IA, X },
|
||||
{ I, IA, M, I }
|
||||
};
|
||||
|
||||
private static char[][] superlativeEndings = {
|
||||
{ E, I_, SH },
|
||||
{ E, I_, SH, E }
|
||||
};
|
||||
|
||||
private static char[][] derivationalEndings = {
|
||||
{ O, S, T },
|
||||
{ O, S, T, SOFT }
|
||||
};
|
||||
|
||||
/**
|
||||
* RussianStemmer constructor comment.
|
||||
*/
|
||||
public RussianStemmer()
|
||||
{
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* RussianStemmer constructor comment.
|
||||
*/
|
||||
public RussianStemmer(char[] charset)
|
||||
{
|
||||
super();
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjectival ending is an adjective ending,
|
||||
* optionally preceded by participle ending.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean adjectival(StringBuffer stemmingZone)
|
||||
{
|
||||
// look for adjective ending in a stemming zone
|
||||
if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
|
||||
return false;
|
||||
// if adjective ending was found, try for participle ending
|
||||
boolean r =
|
||||
findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
|
||||
||
|
||||
findAndRemoveEnding(stemmingZone, participleEndings2);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Derivational endings
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean derivational(StringBuffer stemmingZone)
|
||||
{
|
||||
int endingLength = findEnding(stemmingZone, derivationalEndings);
|
||||
if (endingLength == 0)
|
||||
// no derivational ending found
|
||||
return false;
|
||||
else
|
||||
{
|
||||
// Ensure that the ending locates in R2
|
||||
if (R2 - RV <= stemmingZone.length() - endingLength)
|
||||
{
|
||||
stemmingZone.setLength(stemmingZone.length() - endingLength);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds ending among given ending class and returns the length of ending found(0, if not found).
|
||||
* Creation date: (17/03/2002 8:18:34 PM)
|
||||
* @return int
|
||||
* @param word java.lang.StringBuffer
|
||||
* @param theEnding char[]
|
||||
*/
|
||||
private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
|
||||
{
|
||||
boolean match = false;
|
||||
for (int i = theEndingClass.length - 1; i >= 0; i--)
|
||||
{
|
||||
char[] theEnding = theEndingClass[i];
|
||||
// check if the ending is bigger than stemming zone
|
||||
if (startIndex < theEnding.length - 1)
|
||||
{
|
||||
match = false;
|
||||
continue;
|
||||
}
|
||||
match = true;
|
||||
int stemmingIndex = startIndex;
|
||||
for (int j = theEnding.length - 1; j >= 0; j--)
|
||||
{
|
||||
if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
|
||||
{
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// check if ending was found
|
||||
if (match)
|
||||
{
|
||||
return theEndingClass[i].length; // cut ending
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
|
||||
{
|
||||
return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the ending among the given class of endings and removes it from stemming zone.
|
||||
* Creation date: (17/03/2002 8:18:34 PM)
|
||||
* @return boolean
|
||||
* @param word java.lang.StringBuffer
|
||||
* @param theEnding char[]
|
||||
*/
|
||||
private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
|
||||
{
|
||||
int endingLength = findEnding(stemmingZone, theEndingClass);
|
||||
if (endingLength == 0)
|
||||
// not found
|
||||
return false;
|
||||
else {
|
||||
stemmingZone.setLength(stemmingZone.length() - endingLength);
|
||||
// cut the ending found
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the ending among the given class of endings, then checks if this ending was
|
||||
* preceded by any of given predessors, and if so, removes it from stemming zone.
|
||||
* Creation date: (17/03/2002 8:18:34 PM)
|
||||
* @return boolean
|
||||
* @param word java.lang.StringBuffer
|
||||
* @param theEnding char[]
|
||||
*/
|
||||
private boolean findAndRemoveEnding(StringBuffer stemmingZone,
|
||||
char[][] theEndingClass, char[][] thePredessors)
|
||||
{
|
||||
int endingLength = findEnding(stemmingZone, theEndingClass);
|
||||
if (endingLength == 0)
|
||||
// not found
|
||||
return false;
|
||||
else
|
||||
{
|
||||
int predessorLength =
|
||||
findEnding(stemmingZone,
|
||||
stemmingZone.length() - endingLength - 1,
|
||||
thePredessors);
|
||||
if (predessorLength == 0)
|
||||
return false;
|
||||
else {
|
||||
stemmingZone.setLength(stemmingZone.length() - endingLength);
|
||||
// cut the ending found
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Marks positions of RV, R1 and R2 in a given word.
|
||||
* Creation date: (16/03/2002 3:40:11 PM)
|
||||
* @return int
|
||||
* @param word java.lang.String
|
||||
*/
|
||||
private void markPositions(String word)
|
||||
{
|
||||
RV = 0;
|
||||
R1 = 0;
|
||||
R2 = 0;
|
||||
int i = 0;
|
||||
// find RV
|
||||
while (word.length() > i && !isVowel(word.charAt(i)))
|
||||
{
|
||||
i++;
|
||||
}
|
||||
if (word.length() - 1 < ++i)
|
||||
return; // RV zone is empty
|
||||
RV = i;
|
||||
// find R1
|
||||
while (word.length() > i && isVowel(word.charAt(i)))
|
||||
{
|
||||
i++;
|
||||
}
|
||||
if (word.length() - 1 < ++i)
|
||||
return; // R1 zone is empty
|
||||
R1 = i;
|
||||
// find R2
|
||||
while (word.length() > i && !isVowel(word.charAt(i)))
|
||||
{
|
||||
i++;
|
||||
}
|
||||
if (word.length() - 1 < ++i)
|
||||
return; // R2 zone is empty
|
||||
while (word.length() > i && isVowel(word.charAt(i)))
|
||||
{
|
||||
i++;
|
||||
}
|
||||
if (word.length() - 1 < ++i)
|
||||
return; // R2 zone is empty
|
||||
R2 = i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if character is a vowel..
|
||||
* Creation date: (16/03/2002 10:47:03 PM)
|
||||
* @return boolean
|
||||
* @param letter char
|
||||
*/
|
||||
private boolean isVowel(char letter)
|
||||
{
|
||||
for (int i = 0; i < vowels.length; i++)
|
||||
{
|
||||
if (letter == charset[vowels[i]])
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Noun endings.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean noun(StringBuffer stemmingZone)
|
||||
{
|
||||
return findAndRemoveEnding(stemmingZone, nounEndings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Perfective gerund endings.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean perfectiveGerund(StringBuffer stemmingZone)
|
||||
{
|
||||
return findAndRemoveEnding(
|
||||
stemmingZone,
|
||||
perfectiveGerundEndings1,
|
||||
perfectiveGerund1Predessors)
|
||||
|| findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reflexive endings.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean reflexive(StringBuffer stemmingZone)
|
||||
{
|
||||
return findAndRemoveEnding(stemmingZone, reflexiveEndings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert the method's description here.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean removeI(StringBuffer stemmingZone)
|
||||
{
|
||||
if (stemmingZone.length() > 0
|
||||
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
|
||||
{
|
||||
stemmingZone.setLength(stemmingZone.length() - 1);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert the method's description here.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean removeSoft(StringBuffer stemmingZone)
|
||||
{
|
||||
if (stemmingZone.length() > 0
|
||||
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
|
||||
{
|
||||
stemmingZone.setLength(stemmingZone.length() - 1);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert the method's description here.
|
||||
* Creation date: (16/03/2002 10:58:42 PM)
|
||||
* @param newCharset char[]
|
||||
*/
|
||||
public void setCharset(char[] newCharset)
|
||||
{
|
||||
charset = newCharset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set ending definition as in Russian stemming algorithm.
|
||||
* Creation date: (16/03/2002 11:16:36 PM)
|
||||
*/
|
||||
private void setEndings()
|
||||
{
|
||||
vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };
|
||||
|
||||
perfectiveGerundEndings1 = new char[][] {
|
||||
{ V }, { V, SH, I }, { V, SH, I, S, SOFT }
|
||||
};
|
||||
|
||||
perfectiveGerund1Predessors = new char[][] { { A }, { IA }
|
||||
};
|
||||
|
||||
perfectiveGerundEndings2 = new char[][] {
|
||||
{ I, V },
|
||||
{ Y, V },
|
||||
{ I, V, SH, I },
|
||||
{ Y, V, SH, I },
|
||||
{ I, V, SH, I, S, SOFT },
|
||||
{ Y, V, SH, I, S, SOFT }
|
||||
};
|
||||
|
||||
adjectiveEndings = new char[][] {
|
||||
{ E, E },
|
||||
{ I, E },
|
||||
{ Y, E },
|
||||
{ O, E },
|
||||
{ E, I_ },
|
||||
{ I, I_ },
|
||||
{ Y, I_ },
|
||||
{ O, I_ },
|
||||
{ E, M },
|
||||
{ I, M },
|
||||
{ Y, M },
|
||||
{ O, M },
|
||||
{ I, X },
|
||||
{ Y, X },
|
||||
{ U, IU },
|
||||
{ IU, IU },
|
||||
{ A, IA },
|
||||
{ IA, IA },
|
||||
{ O, IU },
|
||||
{ E, IU },
|
||||
{ I, M, I },
|
||||
{ Y, M, I },
|
||||
{ E, G, O },
|
||||
{ O, G, O },
|
||||
{ E, M, U },
|
||||
{ O, M, U }
|
||||
};
|
||||
|
||||
participleEndings1 = new char[][] {
|
||||
{ SHCH },
|
||||
{ E, M },
|
||||
{ N, N },
|
||||
{ V, SH },
|
||||
{ IU, SHCH }
|
||||
};
|
||||
|
||||
participleEndings2 = new char[][] {
|
||||
{ I, V, SH },
|
||||
{ Y, V, SH },
|
||||
{ U, IU, SHCH }
|
||||
};
|
||||
|
||||
participle1Predessors = new char[][] {
|
||||
{ A },
|
||||
{ IA }
|
||||
};
|
||||
|
||||
reflexiveEndings = new char[][] {
|
||||
{ S, IA },
|
||||
{ S, SOFT }
|
||||
};
|
||||
|
||||
verbEndings1 = new char[][] {
|
||||
{ I_ },
|
||||
{ L },
|
||||
{ N },
|
||||
{ L, O },
|
||||
{ N, O },
|
||||
{ E, T },
|
||||
{ IU, T },
|
||||
{ L, A },
|
||||
{ N, A },
|
||||
{ L, I },
|
||||
{ E, M },
|
||||
{ N, Y },
|
||||
{ E, T, E },
|
||||
{ I_, T, E },
|
||||
{ T, SOFT },
|
||||
{ E, SH, SOFT },
|
||||
{ N, N, O }
|
||||
};
|
||||
|
||||
verbEndings2 = new char[][] {
|
||||
{ IU },
|
||||
{ U, IU },
|
||||
{ E, N },
|
||||
{ E, I_ },
|
||||
{ IA, T },
|
||||
{ U, I_ },
|
||||
{ I, L },
|
||||
{ Y, L },
|
||||
{ I, M },
|
||||
{ Y, M },
|
||||
{ I, T },
|
||||
{ Y, T },
|
||||
{ I, L, A },
|
||||
{ Y, L, A },
|
||||
{ E, N, A },
|
||||
{ I, T, E },
|
||||
{ I, L, I },
|
||||
{ Y, L, I },
|
||||
{ I, L, O },
|
||||
{ Y, L, O },
|
||||
{ E, N, O },
|
||||
{ U, E, T },
|
||||
{ U, IU, T },
|
||||
{ E, N, Y },
|
||||
{ I, T, SOFT },
|
||||
{ Y, T, SOFT },
|
||||
{ I, SH, SOFT },
|
||||
{ E, I_, T, E },
|
||||
{ U, I_, T, E }
|
||||
};
|
||||
|
||||
verb1Predessors = new char[][] {
|
||||
{ A },
|
||||
{ IA }
|
||||
};
|
||||
|
||||
nounEndings = new char[][] {
|
||||
{ A },
|
||||
{ IU },
|
||||
{ I_ },
|
||||
{ O },
|
||||
{ U },
|
||||
{ E },
|
||||
{ Y },
|
||||
{ I },
|
||||
{ SOFT },
|
||||
{ IA },
|
||||
{ E, V },
|
||||
{ O, V },
|
||||
{ I, E },
|
||||
{ SOFT, E },
|
||||
{ IA, X },
|
||||
{ I, IU },
|
||||
{ E, I },
|
||||
{ I, I },
|
||||
{ E, I_ },
|
||||
{ O, I_ },
|
||||
{ E, M },
|
||||
{ A, M },
|
||||
{ O, M },
|
||||
{ A, X },
|
||||
{ SOFT, IU },
|
||||
{ I, IA },
|
||||
{ SOFT, IA },
|
||||
{ I, I_ },
|
||||
{ IA, M },
|
||||
{ IA, M, I },
|
||||
{ A, M, I },
|
||||
{ I, E, I_ },
|
||||
{ I, IA, M },
|
||||
{ I, E, M },
|
||||
{ I, IA, X },
|
||||
{ I, IA, M, I }
|
||||
};
|
||||
|
||||
superlativeEndings = new char[][] {
|
||||
{ E, I_, SH },
|
||||
{ E, I_, SH, E }
|
||||
};
|
||||
|
||||
derivationalEndings = new char[][] {
|
||||
{ O, S, T },
|
||||
{ O, S, T, SOFT }
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the stem for given Russian word.
|
||||
* Creation date: (16/03/2002 3:36:48 PM)
|
||||
* @return java.lang.String
|
||||
* @param input java.lang.String
|
||||
*/
|
||||
public String stem(String input)
|
||||
{
|
||||
markPositions(input);
|
||||
if (RV == 0)
|
||||
return input; //RV wasn't detected, nothing to stem
|
||||
StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
|
||||
// stemming goes on in RV
|
||||
// Step 1
|
||||
|
||||
if (!perfectiveGerund(stemmingZone))
|
||||
{
|
||||
reflexive(stemmingZone);
|
||||
boolean r =
|
||||
adjectival(stemmingZone)
|
||||
|| verb(stemmingZone)
|
||||
|| noun(stemmingZone);
|
||||
}
|
||||
// Step 2
|
||||
removeI(stemmingZone);
|
||||
// Step 3
|
||||
derivational(stemmingZone);
|
||||
// Step 4
|
||||
superlative(stemmingZone);
|
||||
undoubleN(stemmingZone);
|
||||
removeSoft(stemmingZone);
|
||||
// return result
|
||||
return input.substring(0, RV) + stemmingZone.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Superlative endings.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean superlative(StringBuffer stemmingZone)
|
||||
{
|
||||
return findAndRemoveEnding(stemmingZone, superlativeEndings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Undoubles N.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean undoubleN(StringBuffer stemmingZone)
|
||||
{
|
||||
char[][] doubleN = {
|
||||
{ N, N }
|
||||
};
|
||||
if (findEnding(stemmingZone, doubleN) != 0)
|
||||
{
|
||||
stemmingZone.setLength(stemmingZone.length() - 1);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verb endings.
|
||||
* Creation date: (17/03/2002 12:14:58 AM)
|
||||
* @param stemmingZone java.lang.StringBuffer
|
||||
*/
|
||||
private boolean verb(StringBuffer stemmingZone)
|
||||
{
|
||||
return findAndRemoveEnding(
|
||||
stemmingZone,
|
||||
verbEndings1,
|
||||
verb1Predessors)
|
||||
|| findAndRemoveEnding(stemmingZone, verbEndings2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Static method for stemming with different charsets
|
||||
*/
|
||||
public static String stem(String theWord, char[] charset)
|
||||
{
|
||||
RussianStemmer stemmer = new RussianStemmer();
|
||||
stemmer.setCharset(charset);
|
||||
return stemmer.stem(theWord);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue