copy the Russian and German analyzers plus their test cases to the sandbox

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150998 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Daniel Naber 2004-08-16 20:30:46 +00:00
parent 726ddaeb5a
commit 87bcdf6f25
24 changed files with 2398 additions and 0 deletions

View File

@ -0,0 +1,135 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
/**
 * Analyzer for German language. Supports an external list of stopwords (words that
 * will not be indexed at all) and an external list of exclusions (words that will
 * not be stemmed, but indexed).
 * A default set of stopwords is used unless an alternative list is specified, the
 * exclusion list is empty by default.
 *
 * @author Gerhard Schwarz
 * @version $Id$
 */
public class GermanAnalyzer extends Analyzer {

    /**
     * List of typical german stopwords.
     * Declared static final: the default list is an immutable constant and is
     * shared by all analyzer instances instead of being duplicated per instance.
     */
    private static final String[] GERMAN_STOP_WORDS = {
        "einer", "eine", "eines", "einem", "einen",
        "der", "die", "das", "dass", "daß",
        "du", "er", "sie", "es",
        "was", "wer", "wie", "wir",
        "und", "oder", "ohne", "mit",
        "am", "im", "in", "aus", "auf",
        "ist", "sein", "war", "wird",
        "ihr", "ihre", "ihres",
        "als", "für", "von", "mit",
        "dich", "dir", "mich", "mir",
        "mein", "sein", "kein",
        "durch", "wegen", "wird"
    };

    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Set stopSet = new HashSet();

    /**
     * Contains words that should be indexed but not stemmed.
     */
    private Set exclusionSet = new HashSet();

    /**
     * Builds an analyzer that uses the default German stop words.
     */
    public GermanAnalyzer() {
        stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopwords array of words that will not be indexed
     */
    public GermanAnalyzer(String[] stopwords) {
        stopSet = StopFilter.makeStopSet(stopwords);
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopwords table whose keys are used as the stop words
     */
    public GermanAnalyzer(Hashtable stopwords) {
        stopSet = new HashSet(stopwords.keySet());
    }

    /**
     * Builds an analyzer with the stop words read from the given file
     * (one word per line).
     *
     * @param stopwords file containing the stop word list
     * @throws IOException if the file cannot be read
     */
    public GermanAnalyzer(File stopwords) throws IOException {
        stopSet = WordlistLoader.getWordSet(stopwords);
    }

    /**
     * Builds an exclusionlist from an array of Strings.
     */
    public void setStemExclusionTable(String[] exclusionlist) {
        exclusionSet = StopFilter.makeStopSet(exclusionlist);
    }

    /**
     * Builds an exclusionlist from a Hashtable.
     */
    public void setStemExclusionTable(Hashtable exclusionlist) {
        exclusionSet = new HashSet(exclusionlist.keySet());
    }

    /**
     * Builds an exclusionlist from the words contained in the given file
     * (one word per line).
     *
     * @throws IOException if the file cannot be read
     */
    public void setStemExclusionTable(File exclusionlist) throws IOException {
        exclusionSet = WordlistLoader.getWordSet(exclusionlist);
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @return A TokenStream built from a StandardTokenizer filtered with
     *         StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, stopSet);
        result = new GermanStemFilter(result, exclusionSet);
        return result;
    }
}

View File

@ -0,0 +1,119 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
 * A filter that stems German words. It supports a table of words that should
 * not be stemmed at all. The stemmer used can be changed at runtime after the
 * filter object is created (as long as it is a GermanStemmer).
 *
 * @author Gerhard Schwarz
 * @version $Id$
 */
public final class GermanStemFilter extends TokenFilter
{
    /**
     * The most recently read token from the input stream.
     */
    private Token token = null;
    private GermanStemmer stemmer = null;
    private Set exclusionSet = null;

    public GermanStemFilter( TokenStream in )
    {
        super(in);
        stemmer = new GermanStemmer();
    }

    /**
     * Builds a GermanStemFilter that uses an exclusiontable.
     * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
     */
    public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
    {
        this( in );
        exclusionSet = new HashSet(exclusiontable.keySet());
    }

    /**
     * Builds a GermanStemFilter that uses an exclusiontable.
     */
    public GermanStemFilter( TokenStream in, Set exclusionSet )
    {
        this( in );
        this.exclusionSet = exclusionSet;
    }

    /**
     * @return Returns the next token in the stream, or null at EOS
     */
    public final Token next()
        throws IOException
    {
        token = input.next();
        if ( token == null ) {
            return null;
        }
        String text = token.termText();
        // Terms on the exclusion list pass through unstemmed.
        if ( exclusionSet != null && exclusionSet.contains( text ) ) {
            return token;
        }
        String stemmed = stemmer.stem( text );
        // If stemming changed nothing, reuse the input token instead of
        // allocating a new one.
        if ( stemmed.equals( text ) ) {
            return token;
        }
        return new Token( stemmed, token.startOffset(),
            token.endOffset(), token.type() );
    }

    /**
     * Set a alternative/custom GermanStemmer for this filter.
     */
    public void setStemmer( GermanStemmer stemmer )
    {
        if ( stemmer != null ) {
            this.stemmer = stemmer;
        }
    }

    /**
     * Set an alternative exclusion list for this filter.
     * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
     */
    public void setExclusionTable( Hashtable exclusiontable )
    {
        exclusionSet = new HashSet(exclusiontable.keySet());
    }

    /**
     * Set an alternative exclusion list for this filter.
     */
    public void setExclusionSet( Set exclusionSet )
    {
        this.exclusionSet = exclusionSet;
    }
}

View File

@ -0,0 +1,265 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A stemmer for German words. The algorithm is based on the report
 * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
 * Caumanns (joerg.caumanns@isst.fhg.de).
 *
 * @author Gerhard Schwarz
 * @version $Id$
 */
public class GermanStemmer
{
    /**
     * Buffer for the terms while stemming them.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Amount of characters that are removed with <tt>substitute()</tt> while
     * stemming. strip() adds this back to the buffer length when checking its
     * minimum-length restrictions, so substitutions do not make a term look
     * shorter than it really is.
     */
    private int substCount = 0;

    /**
     * Stemms the given term to an unique <tt>discriminator</tt>.
     *
     * @param term The term that should be stemmed.
     * @return Discriminator for <tt>term</tt>
     */
    protected String stem( String term )
    {
        // Use lowercase for medium stemming.
        term = term.toLowerCase();
        if ( !isStemmable( term ) )
            return term;
        // Reset the StringBuffer.
        sb.delete( 0, sb.length() );
        sb.insert( 0, term );
        // Stemming starts here...
        substitute( sb );
        strip( sb );
        optimize( sb );
        resubstitute( sb );
        removeParticleDenotion( sb );
        return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @return true if, and only if, the given term consists in letters.
     */
    private boolean isStemmable( String term )
    {
        for ( int c = 0; c < term.length(); c++ ) {
            if ( !Character.isLetter( term.charAt( c ) ) )
                return false;
        }
        return true;
    }

    /**
     * suffix stripping (stemming) on the current term. The stripping is reduced
     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
     * from which all regular suffixes are build of. The simplification causes
     * some overstemming, and way more irregular stems, but still provides unique.
     * discriminators in the most of those cases.
     * The algorithm is context free, except of the length restrictions.
     */
    private void strip( StringBuffer buffer )
    {
        boolean doMore = true;
        while ( doMore && buffer.length() > 3 ) {
            if ( ( buffer.length() + substCount > 5 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
            {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            // "t" occurs only as suffix of verbs.
            else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else {
                doMore = false;
            }
        }
    }

    /**
     * Does some optimizations on the term. This optimisations are
     * contextual.
     */
    private void optimize( StringBuffer buffer )
    {
        // Additional step for female plurals of professions and inhabitants.
        if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
            buffer.deleteCharAt( buffer.length() -1 );
            strip( buffer );
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
            buffer.setCharAt( buffer.length() - 1, 'x' );
        }
    }

    /**
     * Removes a particle denotion ("ge") from a term.
     */
    private void removeParticleDenotion( StringBuffer buffer )
    {
        if ( buffer.length() > 4 ) {
            for ( int c = 0; c < buffer.length() - 3; c++ ) {
                if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
                    buffer.delete( c, c + 2 );
                    return;
                }
            }
        }
    }

    /**
     * Do some substitutions for the term to reduce overstemming:
     *
     * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
     *   "ß" is substituted by "ss"
     * - Substitute a second char of a pair of equal characters with
     *   an asterisk: ?? -> ?*
     * - Substitute some common character combinations with a token:
     *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
     */
    private void substitute( StringBuffer buffer )
    {
        substCount = 0;
        for ( int c = 0; c < buffer.length(); c++ ) {
            // Replace the second char of a pair of the equal characters with an asterisk
            if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
                buffer.setCharAt( c, '*' );
            }
            // Substitute Umlauts.
            else if ( buffer.charAt( c ) == 'ä' ) {
                buffer.setCharAt( c, 'a' );
            }
            else if ( buffer.charAt( c ) == 'ö' ) {
                buffer.setCharAt( c, 'o' );
            }
            else if ( buffer.charAt( c ) == 'ü' ) {
                buffer.setCharAt( c, 'u' );
            }
            // Fix bug so that 'ß' at the end of a word is replaced.
            else if ( buffer.charAt( c ) == 'ß' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 's' );
                substCount++;
            }
            // Take care that at least one character is left left side from the current one
            if ( c < buffer.length() - 1 ) {
                // Masking several common character combinations with an token
                if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
                    buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
                {
                    buffer.setCharAt( c, '$' );
                    buffer.delete( c + 1, c + 3 );
                    // Bugfix: this was "substCount =+ 2", which *assigned* 2
                    // instead of adding 2 and so lost any count accumulated by
                    // earlier substitutions in the same term.
                    substCount += 2;
                }
                else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
                    buffer.setCharAt( c, '§' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
                    buffer.setCharAt( c, '%' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
                    buffer.setCharAt( c, '&' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
                    buffer.setCharAt( c, '#' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
                    buffer.setCharAt( c, '!' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
            }
        }
    }

    /**
     * Undoes the changes made by substitute(). That are character pairs and
     * character combinations. Umlauts will remain as their corresponding vowel,
     * as "ß" remains as "ss".
     */
    private void resubstitute( StringBuffer buffer )
    {
        for ( int c = 0; c < buffer.length(); c++ ) {
            if ( buffer.charAt( c ) == '*' ) {
                char x = buffer.charAt( c - 1 );
                buffer.setCharAt( c, x );
            }
            else if ( buffer.charAt( c ) == '$' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
            }
            else if ( buffer.charAt( c ) == '§' ) {
                buffer.setCharAt( c, 'c' );
                buffer.insert( c + 1, 'h' );
            }
            else if ( buffer.charAt( c ) == '%' ) {
                buffer.setCharAt( c, 'e' );
                buffer.insert( c + 1, 'i' );
            }
            else if ( buffer.charAt( c ) == '&' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'e' );
            }
            else if ( buffer.charAt( c ) == '#' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'g' );
            }
            else if ( buffer.charAt( c ) == '!' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 't' );
            }
        }
    }
}

View File

@ -0,0 +1,111 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
/**
 * Loader for text files that represent a list of stopwords.
 *
 * @author Gerhard Schwarz
 * @version $Id$
 *
 * @todo this is not specific to German, it should be moved up
 */
public class WordlistLoader {

    /**
     * Loads a text file and adds every line as an entry to a HashSet (omitting
     * leading and trailing whitespace). Every line of the file should contain only
     * one word. The words need to be in lowercase if you make use of an
     * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
     *
     * @param wordfile File containing the wordlist
     * @return A HashSet with the file's words
     */
    public static HashSet getWordSet(File wordfile) throws IOException {
        HashSet words = new HashSet();
        LineNumberReader reader = null;
        try {
            reader = new LineNumberReader(new FileReader(wordfile));
            // One word per line; surrounding whitespace is insignificant.
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                words.add(line.trim());
            }
        }
        finally {
            // Closing the LineNumberReader also closes the wrapped FileReader.
            if (reader != null)
                reader.close();
        }
        return words;
    }

    /**
     * @param path Path to the wordlist
     * @param wordfile Name of the wordlist
     *
     * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
     */
    public static Hashtable getWordtable(String path, String wordfile) throws IOException {
        return getWordtable(new File(path, wordfile));
    }

    /**
     * @param wordfile Complete path to the wordlist
     *
     * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
     */
    public static Hashtable getWordtable(String wordfile) throws IOException {
        return getWordtable(new File(wordfile));
    }

    /**
     * @param wordfile File object that points to the wordlist
     *
     * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
     */
    public static Hashtable getWordtable(File wordfile) throws IOException {
        return makeWordTable(getWordSet(wordfile));
    }

    /**
     * Builds a wordlist table, using words as both keys and values
     * for backward compatibility.
     *
     * @param wordSet stopword set
     */
    private static Hashtable makeWordTable(HashSet wordSet) {
        Hashtable table = new Hashtable();
        Iterator words = wordSet.iterator();
        while (words.hasNext()) {
            String word = (String) words.next();
            table.put(word, word);
        }
        return table;
    }
}

View File

@ -0,0 +1,5 @@
<html>
<body>
Support for indexing and searching of German text.
</body>
</html>

View File

@ -0,0 +1,259 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
 * Analyzer for Russian language. Supports an external list of stopwords (words that
 * will not be indexed at all).
 * A default set of stopwords is used unless an alternative list is specified.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public final class RussianAnalyzer extends Analyzer
{
    // Letters are encoded as positions (0-31) into a charset table; see
    // RussianCharsets. (Currently unused letters are commented out.)
    private final static char A = 0;
    private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    private final static char D = 4;
    private final static char E = 5;
    private final static char ZH = 6;
    private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    private final static char P = 15;
    private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    /**
     * List of typical Russian stopwords, encoded as charset positions.
     * Declared static final: the default list is a shared constant.
     */
    private static final char[][] RUSSIAN_STOP_WORDS = {
        {A},
        {B, E, Z},
        {B, O, L, E, E},
        {B, Y},
        {B, Y, L},
        {B, Y, L, A},
        {B, Y, L, I},
        {B, Y, L, O},
        {B, Y, T, SOFT},
        {V},
        {V, A, M},
        {V, A, S},
        {V, E, S, SOFT},
        {V, O},
        {V, O, T},
        {V, S, E},
        {V, S, E, G, O},
        {V, S, E, X},
        {V, Y},
        {G, D, E},
        {D, A},
        {D, A, ZH, E},
        {D, L, IA},
        {D, O},
        {E, G, O},
        {E, E},
        {E, I_,},
        {E, IU},
        {E, S, L, I},
        {E, S, T, SOFT},
        {E, SHCH, E},
        {ZH, E},
        {Z, A},
        {Z, D, E, S, SOFT},
        {I},
        {I, Z},
        {I, L, I},
        {I, M},
        {I, X},
        {K},
        {K, A, K},
        {K, O},
        {K, O, G, D, A},
        {K, T, O},
        {L, I},
        {L, I, B, O},
        {M, N, E},
        {M, O, ZH, E, T},
        {M, Y},
        {N, A},
        {N, A, D, O},
        {N, A, SH},
        {N, E},
        {N, E, G, O},
        {N, E, E},
        {N, E, T},
        {N, I},
        {N, I, X},
        {N, O},
        {N, U},
        {O},
        {O, B},
        {O, D, N, A, K, O},
        {O, N},
        {O, N, A},
        {O, N, I},
        {O, N, O},
        {O, T},
        {O, CH, E, N, SOFT},
        {P, O},
        {P, O, D},
        {P, R, I},
        {S},
        {S, O},
        {T, A, K},
        {T, A, K, ZH, E},
        {T, A, K, O, I_},
        {T, A, M},
        {T, E},
        {T, E, M},
        {T, O},
        {T, O, G, O},
        {T, O, ZH, E},
        {T, O, I_},
        {T, O, L, SOFT, K, O},
        {T, O, M},
        {T, Y},
        {U},
        {U, ZH, E},
        {X, O, T, IA},
        {CH, E, G, O},
        {CH, E, I_},
        {CH, E, M},
        {CH, T, O},
        {CH, T, O, B, Y},
        {CH, SOFT, E},
        {CH, SOFT, IA},
        {AE, T, A},
        {AE, T, I},
        {AE, T, O},
        {IA}
    };

    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Set stopSet = new HashSet();

    /**
     * Charset for Russian letters.
     * Represents encoding for 32 lowercase Russian letters.
     * Predefined charsets can be taken from RussianCharsets class.
     * Final: assigned exactly once by every constructor.
     */
    private final char[] charset;

    /**
     * Builds an analyzer using the Unicode charset and the default stop words.
     */
    public RussianAnalyzer() {
        charset = RussianCharsets.UnicodeRussian;
        stopSet = StopFilter.makeStopSet(
            makeStopWords(RussianCharsets.UnicodeRussian));
    }

    /**
     * Builds an analyzer using the given charset and the default stop words.
     */
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    }

    /**
     * Builds an analyzer with the given stop words.
     */
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(stopwords);
    }

    /**
     * Builds an analyzer with the given stop words.
     * @todo create a Set version of this ctor
     */
    public RussianAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
        stopSet = new HashSet(stopwords.keySet());
    }

    /**
     * Takes russian stop words and translates them to a String array, using
     * the given charset.
     */
    private static String[] makeStopWords(char[] charset)
    {
        String[] res = new String[RUSSIAN_STOP_WORDS.length];
        for (int i = 0; i < res.length; i++)
        {
            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
            // translate the word, using the charset
            StringBuffer theWord = new StringBuffer();
            for (int j = 0; j < theStopWord.length; j++)
            {
                theWord.append(charset[theStopWord[j]]);
            }
            res[i] = theWord.toString();
        }
        return res;
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @return A TokenStream built from a RussianLetterTokenizer filtered with
     *         RussianLowerCaseFilter, StopFilter, and RussianStemFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        TokenStream result = new RussianLetterTokenizer(reader, charset);
        result = new RussianLowerCaseFilter(result, charset);
        result = new StopFilter(result, stopSet);
        result = new RussianStemFilter(result, charset);
        return result;
    }
}

View File

@ -0,0 +1,279 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
 * for russian characters in Unicode, KOI8 and CP1251.
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to toLowerCase() method for that charset.
 *
 * (The javadoc previously said "CP1252"; the charset actually defined and
 * handled below is CP1251.)
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset (lowercase letters only).
    // final: toLowerCase() compares charsets by reference, so the array
    // object must never be replaced.
    public static final char[] UnicodeRussian = {
        '\u0430',
        '\u0431',
        '\u0432',
        '\u0433',
        '\u0434',
        '\u0435',
        '\u0436',
        '\u0437',
        '\u0438',
        '\u0439',
        '\u043A',
        '\u043B',
        '\u043C',
        '\u043D',
        '\u043E',
        '\u043F',
        '\u0440',
        '\u0441',
        '\u0442',
        '\u0443',
        '\u0444',
        '\u0445',
        '\u0446',
        '\u0447',
        '\u0448',
        '\u0449',
        '\u044A',
        '\u044B',
        '\u044C',
        '\u044D',
        '\u044E',
        '\u044F',
        // upper case
        '\u0410',
        '\u0411',
        '\u0412',
        '\u0413',
        '\u0414',
        '\u0415',
        '\u0416',
        '\u0417',
        '\u0418',
        '\u0419',
        '\u041A',
        '\u041B',
        '\u041C',
        '\u041D',
        '\u041E',
        '\u041F',
        '\u0420',
        '\u0421',
        '\u0422',
        '\u0423',
        '\u0424',
        '\u0425',
        '\u0426',
        '\u0427',
        '\u0428',
        '\u0429',
        '\u042A',
        '\u042B',
        '\u042C',
        '\u042D',
        '\u042E',
        '\u042F'
    };

    // KOI8 charset
    public static final char[] KOI8 = {
        0xc1,
        0xc2,
        0xd7,
        0xc7,
        0xc4,
        0xc5,
        0xd6,
        0xda,
        0xc9,
        0xca,
        0xcb,
        0xcc,
        0xcd,
        0xce,
        0xcf,
        0xd0,
        0xd2,
        0xd3,
        0xd4,
        0xd5,
        0xc6,
        0xc8,
        0xc3,
        0xde,
        0xdb,
        0xdd,
        0xdf,
        0xd9,
        0xd8,
        0xdc,
        0xc0,
        0xd1,
        // upper case
        0xe1,
        0xe2,
        0xf7,
        0xe7,
        0xe4,
        0xe5,
        0xf6,
        0xfa,
        0xe9,
        0xea,
        0xeb,
        0xec,
        0xed,
        0xee,
        0xef,
        0xf0,
        0xf2,
        0xf3,
        0xf4,
        0xf5,
        0xe6,
        0xe8,
        0xe3,
        0xfe,
        0xfb,
        0xfd,
        0xff,
        0xf9,
        0xf8,
        0xfc,
        0xe0,
        0xf1
    };

    // CP1251 charset (fixed typo: was "eharset")
    public static final char[] CP1251 = {
        0xE0,
        0xE1,
        0xE2,
        0xE3,
        0xE4,
        0xE5,
        0xE6,
        0xE7,
        0xE8,
        0xE9,
        0xEA,
        0xEB,
        0xEC,
        0xED,
        0xEE,
        0xEF,
        0xF0,
        0xF1,
        0xF2,
        0xF3,
        0xF4,
        0xF5,
        0xF6,
        0xF7,
        0xF8,
        0xF9,
        0xFA,
        0xFB,
        0xFC,
        0xFD,
        0xFE,
        0xFF,
        // upper case
        0xC0,
        0xC1,
        0xC2,
        0xC3,
        0xC4,
        0xC5,
        0xC6,
        0xC7,
        0xC8,
        0xC9,
        0xCA,
        0xCB,
        0xCC,
        0xCD,
        0xCE,
        0xCF,
        0xD0,
        0xD1,
        0xD2,
        0xD3,
        0xD4,
        0xD5,
        0xD6,
        0xD7,
        0xD8,
        0xD9,
        0xDA,
        0xDB,
        0xDC,
        0xDD,
        0xDE,
        0xDF
    };

    /**
     * Lowercases a single character according to the given charset
     * (compared by reference against the predefined charsets above).
     * Characters outside the charset's Russian letter ranges fall back to
     * {@link Character#toLowerCase(char)}.
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                return (char) (letter + 32);
            }
        }
        if (charset == KOI8)
        {
            // In KOI8 the lowercase letters occupy 0xc0-0xdf and the
            // uppercase letters 0xe0-0xff, so uppercase maps *down* by 32.
            if (letter >= 0xe0 && letter <= 0xff)
            {
                return (char) (letter - 32);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                return letter;
            }
        }
        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                return (char) (letter + 32);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                return letter;
            }
        }
        return Character.toLowerCase(letter);
    }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
/**
 * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up
 * letters in a given "russian charset". The problem with LetterTokenizer is that it relies on
 * Character.isLetter(), which does not recognize letters in single-byte encodings such as
 * CP1251 and KOI8 (well-known problems with the 0xD7 and 0xF7 chars).
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public class RussianLetterTokenizer extends CharTokenizer
{
    /** Characters that are treated as letters in addition to Character.isLetter(). */
    private char[] charset;

    public RussianLetterTokenizer(Reader in, char[] charset)
    {
        super(in);
        this.charset = charset;
    }

    /**
     * Accepts characters satisfying {@link Character#isLetter(char)} as well
     * as any character contained in the configured charset.
     */
    protected boolean isTokenChar(char c)
    {
        boolean letter = Character.isLetter(c);
        if (!letter)
        {
            for (int idx = 0; idx < charset.length && !letter; idx++)
            {
                letter = (charset[idx] == c);
            }
        }
        return letter;
    }
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Normalizes token text to lower case, analyzing given ("russian") charset.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public final class RussianLowerCaseFilter extends TokenFilter
{
    char[] charset;

    public RussianLowerCaseFilter(TokenStream in, char[] charset)
    {
        super(in);
        this.charset = charset;
    }

    /**
     * @return the next token with its term text lowercased via
     *         {@link RussianCharsets#toLowerCase(char, char[])}, or null at EOS
     */
    public final Token next() throws java.io.IOException
    {
        Token t = input.next();
        if (t == null)
            return null;
        String txt = t.termText();
        char[] chArray = txt.toCharArray();
        for (int i = 0; i < chArray.length; i++)
        {
            chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
        }
        String newTxt = new String(chArray);
        // Create the replacement token. Bugfix: propagate t.type() — the
        // previous implementation dropped the token type, unlike
        // GermanStemFilter/RussianStemFilter which preserve it.
        Token newToken = new Token(newTxt, t.startOffset(), t.endOffset(), t.type());
        return newToken;
    }
}

View File

@ -0,0 +1,77 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
/**
 * A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
 * The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter,
 * because RussianStemFilter only works with lowercase part of any "russian" charset.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public final class RussianStemFilter extends TokenFilter
{
    /**
     * The most recently read token from the input stream.
     */
    private Token token = null;
    private RussianStemmer stemmer = null;

    public RussianStemFilter(TokenStream in, char[] charset)
    {
        super(in);
        stemmer = new RussianStemmer(charset);
    }

    /**
     * @return Returns the next token in the stream, or null at EOS
     */
    public final Token next() throws IOException
    {
        token = input.next();
        if (token == null)
        {
            return null;
        }
        String text = token.termText();
        String stemmed = stemmer.stem(text);
        // Reuse the input token when stemming changed nothing.
        if (stemmed.equals(text))
        {
            return token;
        }
        return new Token(stemmed, token.startOffset(), token.endOffset(),
            token.type());
    }

    /**
     * Set a alternative/custom RussianStemmer for this filter.
     */
    public void setStemmer(RussianStemmer stemmer)
    {
        if (stemmer != null)
        {
            this.stemmer = stemmer;
        }
    }
}

View File

@ -0,0 +1,629 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
*
* @author Boris Okner, b.okner@rogers.com
* @version $Id$
*/
class RussianStemmer
{
private char[] charset;
// positions of RV, R1 and R2 respectively
private int RV, R1, R2;
// letters (currently unused letters are commented out)
private final static char A = 0;
//private final static char B = 1;
private final static char V = 2;
private final static char G = 3;
//private final static char D = 4;
private final static char E = 5;
//private final static char ZH = 6;
//private final static char Z = 7;
private final static char I = 8;
private final static char I_ = 9;
//private final static char K = 10;
private final static char L = 11;
private final static char M = 12;
private final static char N = 13;
private final static char O = 14;
//private final static char P = 15;
//private final static char R = 16;
private final static char S = 17;
private final static char T = 18;
private final static char U = 19;
//private final static char F = 20;
private final static char X = 21;
//private final static char TS = 22;
//private final static char CH = 23;
private final static char SH = 24;
private final static char SHCH = 25;
//private final static char HARD = 26;
private final static char Y = 27;
private final static char SOFT = 28;
private final static char AE = 29;
private final static char IU = 30;
private final static char IA = 31;
// stem definitions
private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
private static char[][] perfectiveGerundEndings1 = {
{ V },
{ V, SH, I },
{ V, SH, I, S, SOFT }
};
private static char[][] perfectiveGerund1Predessors = {
{ A },
{ IA }
};
private static char[][] perfectiveGerundEndings2 = { { I, V }, {
Y, V }, {
I, V, SH, I }, {
Y, V, SH, I }, {
I, V, SH, I, S, SOFT }, {
Y, V, SH, I, S, SOFT }
};
private static char[][] adjectiveEndings = {
{ E, E },
{ I, E },
{ Y, E },
{ O, E },
{ E, I_ },
{ I, I_ },
{ Y, I_ },
{ O, I_ },
{ E, M },
{ I, M },
{ Y, M },
{ O, M },
{ I, X },
{ Y, X },
{ U, IU },
{ IU, IU },
{ A, IA },
{ IA, IA },
{ O, IU },
{ E, IU },
{ I, M, I },
{ Y, M, I },
{ E, G, O },
{ O, G, O },
{ E, M, U },
{O, M, U }
};
private static char[][] participleEndings1 = {
{ SHCH },
{ E, M },
{ N, N },
{ V, SH },
{ IU, SHCH }
};
private static char[][] participleEndings2 = {
{ I, V, SH },
{ Y, V, SH },
{ U, IU, SHCH }
};
private static char[][] participle1Predessors = {
{ A },
{ IA }
};
private static char[][] reflexiveEndings = {
{ S, IA },
{ S, SOFT }
};
private static char[][] verbEndings1 = {
{ I_ },
{ L },
{ N },
{ L, O },
{ N, O },
{ E, T },
{ IU, T },
{ L, A },
{ N, A },
{ L, I },
{ E, M },
{ N, Y },
{ E, T, E },
{ I_, T, E },
{ T, SOFT },
{ E, SH, SOFT },
{ N, N, O }
};
private static char[][] verbEndings2 = {
{ IU },
{ U, IU },
{ E, N },
{ E, I_ },
{ IA, T },
{ U, I_ },
{ I, L },
{ Y, L },
{ I, M },
{ Y, M },
{ I, T },
{ Y, T },
{ I, L, A },
{ Y, L, A },
{ E, N, A },
{ I, T, E },
{ I, L, I },
{ Y, L, I },
{ I, L, O },
{ Y, L, O },
{ E, N, O },
{ U, E, T },
{ U, IU, T },
{ E, N, Y },
{ I, T, SOFT },
{ Y, T, SOFT },
{ I, SH, SOFT },
{ E, I_, T, E },
{ U, I_, T, E }
};
private static char[][] verb1Predessors = {
{ A },
{ IA }
};
private static char[][] nounEndings = {
{ A },
{ U },
{ I_ },
{ O },
{ U },
{ E },
{ Y },
{ I },
{ SOFT },
{ IA },
{ E, V },
{ O, V },
{ I, E },
{ SOFT, E },
{ IA, X },
{ I, IU },
{ E, I },
{ I, I },
{ E, I_ },
{ O, I_ },
{ E, M },
{ A, M },
{ O, M },
{ A, X },
{ SOFT, IU },
{ I, IA },
{ SOFT, IA },
{ I, I_ },
{ IA, M },
{ IA, M, I },
{ A, M, I },
{ I, E, I_ },
{ I, IA, M },
{ I, E, M },
{ I, IA, X },
{ I, IA, M, I }
};
private static char[][] superlativeEndings = {
{ E, I_, SH },
{ E, I_, SH, E }
};
private static char[][] derivationalEndings = {
{ O, S, T },
{ O, S, T, SOFT }
};
/**
* RussianStemmer constructor comment.
*/
public RussianStemmer()
{
super();
}
/**
* RussianStemmer constructor comment.
*/
public RussianStemmer(char[] charset)
{
super();
this.charset = charset;
}
/**
* Adjectival ending is an adjective ending,
* optionally preceded by participle ending.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean adjectival(StringBuffer stemmingZone)
{
// look for adjective ending in a stemming zone
if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
return false;
// if adjective ending was found, try for participle ending.
// variable r is unused, we are just interested in the side effect of
// findAndRemoveEnding():
boolean r =
findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
||
findAndRemoveEnding(stemmingZone, participleEndings2);
return true;
}
/**
* Derivational endings
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean derivational(StringBuffer stemmingZone)
{
int endingLength = findEnding(stemmingZone, derivationalEndings);
if (endingLength == 0)
// no derivational ending found
return false;
else
{
// Ensure that the ending locates in R2
if (R2 - RV <= stemmingZone.length() - endingLength)
{
stemmingZone.setLength(stemmingZone.length() - endingLength);
return true;
}
else
{
return false;
}
}
}
/**
* Finds ending among given ending class and returns the length of ending found(0, if not found).
* Creation date: (17/03/2002 8:18:34 PM)
*/
private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
{
boolean match = false;
for (int i = theEndingClass.length - 1; i >= 0; i--)
{
char[] theEnding = theEndingClass[i];
// check if the ending is bigger than stemming zone
if (startIndex < theEnding.length - 1)
{
match = false;
continue;
}
match = true;
int stemmingIndex = startIndex;
for (int j = theEnding.length - 1; j >= 0; j--)
{
if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
{
match = false;
break;
}
}
// check if ending was found
if (match)
{
return theEndingClass[i].length; // cut ending
}
}
return 0;
}
private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
}
/**
* Finds the ending among the given class of endings and removes it from stemming zone.
* Creation date: (17/03/2002 8:18:34 PM)
*/
private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
int endingLength = findEnding(stemmingZone, theEndingClass);
if (endingLength == 0)
// not found
return false;
else {
stemmingZone.setLength(stemmingZone.length() - endingLength);
// cut the ending found
return true;
}
}
/**
* Finds the ending among the given class of endings, then checks if this ending was
* preceded by any of given predessors, and if so, removes it from stemming zone.
* Creation date: (17/03/2002 8:18:34 PM)
*/
private boolean findAndRemoveEnding(StringBuffer stemmingZone,
char[][] theEndingClass, char[][] thePredessors)
{
int endingLength = findEnding(stemmingZone, theEndingClass);
if (endingLength == 0)
// not found
return false;
else
{
int predessorLength =
findEnding(stemmingZone,
stemmingZone.length() - endingLength - 1,
thePredessors);
if (predessorLength == 0)
return false;
else {
stemmingZone.setLength(stemmingZone.length() - endingLength);
// cut the ending found
return true;
}
}
}
/**
* Marks positions of RV, R1 and R2 in a given word.
* Creation date: (16/03/2002 3:40:11 PM)
*/
private void markPositions(String word)
{
RV = 0;
R1 = 0;
R2 = 0;
int i = 0;
// find RV
while (word.length() > i && !isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // RV zone is empty
RV = i;
// find R1
while (word.length() > i && isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R1 zone is empty
R1 = i;
// find R2
while (word.length() > i && !isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R2 zone is empty
while (word.length() > i && isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R2 zone is empty
R2 = i;
}
/**
* Checks if character is a vowel..
* Creation date: (16/03/2002 10:47:03 PM)
* @return boolean
* @param letter char
*/
private boolean isVowel(char letter)
{
for (int i = 0; i < vowels.length; i++)
{
if (letter == charset[vowels[i]])
return true;
}
return false;
}
/**
* Noun endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean noun(StringBuffer stemmingZone)
{
return findAndRemoveEnding(stemmingZone, nounEndings);
}
/**
* Perfective gerund endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean perfectiveGerund(StringBuffer stemmingZone)
{
return findAndRemoveEnding(
stemmingZone,
perfectiveGerundEndings1,
perfectiveGerund1Predessors)
|| findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
}
/**
* Reflexive endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean reflexive(StringBuffer stemmingZone)
{
return findAndRemoveEnding(stemmingZone, reflexiveEndings);
}
/**
* Insert the method's description here.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean removeI(StringBuffer stemmingZone)
{
if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
}
/**
* Insert the method's description here.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean removeSoft(StringBuffer stemmingZone)
{
if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
}
/**
* Insert the method's description here.
* Creation date: (16/03/2002 10:58:42 PM)
* @param newCharset char[]
*/
public void setCharset(char[] newCharset)
{
charset = newCharset;
}
/**
* Finds the stem for given Russian word.
* Creation date: (16/03/2002 3:36:48 PM)
* @return java.lang.String
* @param input java.lang.String
*/
public String stem(String input)
{
markPositions(input);
if (RV == 0)
return input; //RV wasn't detected, nothing to stem
StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
// stemming goes on in RV
// Step 1
if (!perfectiveGerund(stemmingZone))
{
reflexive(stemmingZone);
// variable r is unused, we are just interested in the flow that gets
// created by logical expression: apply adjectival(); if that fails,
// apply verb() etc
boolean r =
adjectival(stemmingZone)
|| verb(stemmingZone)
|| noun(stemmingZone);
}
// Step 2
removeI(stemmingZone);
// Step 3
derivational(stemmingZone);
// Step 4
superlative(stemmingZone);
undoubleN(stemmingZone);
removeSoft(stemmingZone);
// return result
return input.substring(0, RV) + stemmingZone.toString();
}
/**
* Superlative endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean superlative(StringBuffer stemmingZone)
{
return findAndRemoveEnding(stemmingZone, superlativeEndings);
}
/**
* Undoubles N.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean undoubleN(StringBuffer stemmingZone)
{
char[][] doubleN = {
{ N, N }
};
if (findEnding(stemmingZone, doubleN) != 0)
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
}
/**
* Verb endings.
* Creation date: (17/03/2002 12:14:58 AM)
* @param stemmingZone java.lang.StringBuffer
*/
private boolean verb(StringBuffer stemmingZone)
{
return findAndRemoveEnding(
stemmingZone,
verbEndings1,
verb1Predessors)
|| findAndRemoveEnding(stemmingZone, verbEndings2);
}
/**
* Static method for stemming with different charsets
*/
public static String stem(String theWord, char[] charset)
{
RussianStemmer stemmer = new RussianStemmer();
stemmer.setCharset(charset);
return stemmer.stem(theWord);
}
}

View File

@ -0,0 +1,5 @@
<html>
<body>
Support for indexing and searching Russian text.
</body>
</html>

View File

@ -0,0 +1,78 @@
package org.apache.lucene.analysis.de;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Test the German stemmer. The stemming algorithm is known to work less
 * than perfectly, as it doesn't use any word lists with exceptions. We
 * also check some of the cases where the algorithm is wrong.
 *
 * @author Daniel Naber
 */
public class TestGermanStemFilter extends TestCase {

  /**
   * Reads "input;expectedStem" pairs from the external data file and checks
   * each one against the stemmer. Lines starting with '#' and empty lines
   * are ignored.
   */
  public void testStemming() {
    // read test cases from external file:
    File dataDir = new File(System.getProperty("dataDir", "./bin"));
    File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
    BufferedReader breader = null;
    try {
      // the data file is ISO-8859-1 encoded (German umlauts):
      breader = new BufferedReader(new InputStreamReader(
          new FileInputStream(testFile), "iso-8859-1"));
      String line;
      while ((line = breader.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("#") || line.equals(""))
          continue; // ignore comments and empty lines
        String[] parts = line.split(";");
        //System.out.println(parts[0] + " -- " + parts[1]);
        check(parts[0], parts[1]);
      }
    } catch (IOException e) {
      // report the actual problem instead of a bare fail():
      fail("unexpected IOException: " + e.getMessage());
    } finally {
      // close even when an assertion fails mid-loop; closing the
      // BufferedReader also closes the wrapped reader and stream.
      if (breader != null) {
        try {
          breader.close();
        } catch (IOException e) {
          // best effort — nothing sensible to do on close failure in a test
        }
      }
    }
  }

  /**
   * Tokenizes the input with StandardTokenizer + GermanStemFilter and
   * asserts that the first token equals the expected stem.
   */
  private void check(final String input, final String expected) throws IOException {
    StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
    GermanStemFilter filter = new GermanStemFilter(tokenStream);
    try {
      Token t = filter.next();
      if (t == null)
        fail("no token found for input: " + input);
      assertEquals(expected, t.termText());
    } finally {
      // release the tokenizer even when the assertion fails
      filter.close();
    }
  }
}

View File

@ -0,0 +1,48 @@
# German special characters are replaced:
häufig;haufig
# here the stemmer works okay, it maps related words to the same stem:
abschließen;abschliess
abschließender;abschliess
abschließendes;abschliess
abschließenden;abschliess
Tisch;tisch
Tische;tisch
Tischen;tisch
Haus;hau
Hauses;hau
Häuser;hau
Häusern;hau
# here's a case where overstemming occurs, i.e. a word is
# mapped to the same stem as unrelated words:
hauen;hau
# here's a case where understemming occurs, i.e. two related words
# are not mapped to the same stem. This is the case with basically
# all irregular forms:
Drama;drama
Dramen;dram
# replace "ß" with 'ss':
Ausmaß;ausmass
# fake words to test if suffixes are cut off:
xxxxxe;xxxxx
xxxxxs;xxxxx
xxxxxn;xxxxx
xxxxxt;xxxxx
xxxxxem;xxxxx
xxxxxer;xxxxx
xxxxxnd;xxxxx
# the suffixes are also removed when combined:
xxxxxetende;xxxxx
# words that are shorter than four characters are not changed:
xxe;xxe
# -em and -er are not removed from words shorter than five characters:
xxem;xxem
xxer;xxer
# -nd is not removed from words shorter than six characters:
xxxnd;xxxnd

View File

@ -0,0 +1,170 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import java.io.*;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
/**
 * Test case for RussianAnalyzer.
 * Runs the same token-by-token comparison for each supported charset
 * (Unicode, KOI8, CP1251).
 *
 * @author Boris Okner
 * @version $Id$
 */
public class TestRussianAnalyzer extends TestCase
{
    /** Base directory containing the test data files. */
    private File dataDir;

    protected void setUp() throws Exception
    {
        dataDir = new File(System.getProperty("dataDir", "./bin"));
    }

    public void testUnicode() throws IOException
    {
        checkAnalyzer("Unicode", RussianCharsets.UnicodeRussian, "Unicode", "all",
            "/org/apache/lucene/analysis/ru/testUnicode.txt",
            "/org/apache/lucene/analysis/ru/resUnicode.htm");
    }

    public void testKOI8() throws IOException
    {
        // KOI8 bytes are read through iso-8859-1 so each byte maps to one char
        checkAnalyzer("KOI8", RussianCharsets.KOI8, "iso-8859-1", "all",
            "/org/apache/lucene/analysis/ru/testKOI8.txt",
            "/org/apache/lucene/analysis/ru/resKOI8.htm");
    }

    public void test1251() throws IOException
    {
        checkAnalyzer("1251", RussianCharsets.CP1251, "iso-8859-1", "",
            "/org/apache/lucene/analysis/ru/test1251.txt",
            "/org/apache/lucene/analysis/ru/res1251.htm");
    }

    /**
     * Tokenizes wordsFile with the RussianAnalyzer and compares the result,
     * token by token, against sampleFile tokenized with a plain
     * RussianLetterTokenizer.
     *
     * @param label assertion label identifying the charset under test
     * @param charset charset mapping table (see RussianCharsets)
     * @param encoding Java encoding name used to read both files
     * @param fieldName field name passed to Analyzer.tokenStream()
     * @param wordsFile input text, relative to dataDir
     * @param sampleFile expected tokens, relative to dataDir
     */
    private void checkAnalyzer(String label, char[] charset, String encoding,
        String fieldName, String wordsFile, String sampleFile)
        throws IOException
    {
        InputStreamReader inWords =
            new InputStreamReader(
                new FileInputStream(new File(dataDir, wordsFile)), encoding);
        InputStreamReader sampleReader =
            new InputStreamReader(
                new FileInputStream(new File(dataDir, sampleFile)), encoding);
        try
        {
            RussianAnalyzer ra = new RussianAnalyzer(charset);
            TokenStream in = ra.tokenStream(fieldName, inWords);
            RussianLetterTokenizer sample =
                new RussianLetterTokenizer(sampleReader, charset);
            for (;;)
            {
                Token token = in.next();
                if (token == null)
                {
                    break;
                }
                Token sampleToken = sample.next();
                assertEquals(
                    label,
                    token.termText(),
                    sampleToken == null
                        ? null
                        : sampleToken.termText());
            }
        }
        finally
        {
            // close the readers even when an assertion fails mid-comparison
            inWords.close();
            sampleReader.close();
        }
    }
}

View File

@ -0,0 +1,94 @@
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import junit.framework.TestCase;
/**
 * Tests RussianStemmer against word/stem pairs read from external
 * Unicode-encoded data files (line i of the stems file is the expected
 * stem of line i of the words file).
 */
public class TestRussianStem extends TestCase
{
    // words and their expected stems, loaded pairwise in setUp()
    private ArrayList words = new ArrayList();
    private ArrayList stems = new ArrayList();

    public TestRussianStem(String name)
    {
        super(name);
    }

    /**
     * Loads the word and expected-stem lists from the data files.
     *
     * @see TestCase#setUp()
     */
    protected void setUp() throws Exception
    {
        super.setUp();
        //System.out.println(new java.util.Date());
        File dataDir = new File(System.getProperty("dataDir", "./bin"));
        readLines(new File(dataDir, "/org/apache/lucene/analysis/ru/wordsUnicode.txt"), words);
        readLines(new File(dataDir, "/org/apache/lucene/analysis/ru/stemsUnicode.txt"), stems);
    }

    /**
     * Reads every line of the given Unicode-encoded file into the target
     * list, closing the reader even if reading fails.
     */
    private static void readLines(File file, ArrayList target) throws IOException
    {
        BufferedReader reader =
            new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "Unicode"));
        try
        {
            String str;
            while ((str = reader.readLine()) != null)
            {
                target.add(str);
            }
        }
        finally
        {
            reader.close();
        }
    }

    /**
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception
    {
        super.tearDown();
    }

    /**
     * Stems every word with the Unicode charset table and compares it
     * against the expected stem at the same index.
     */
    public void testStem()
    {
        for (int i = 0; i < words.size(); i++)
        {
            //if ( (i % 100) == 0 ) System.err.println(i);
            String realStem =
                RussianStemmer.stem(
                    (String) words.get(i),
                    RussianCharsets.UnicodeRussian);
            assertEquals("unicode", stems.get(i), realStem);
        }
    }
}

View File

@ -0,0 +1 @@
[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]

View File

@ -0,0 +1 @@
[淄庞註[由蘛[芴潘砸贤燎紊晕][芪乓荾[赏盘][幸拍釉磷膛蝅[铀林][忠琶][囊抛蝅[徘尚註[谖廖][纫廖][粤饰][遮薦[艘涨][邢幼演盼][子阉][滓磐盼][咨韵薦[幸晌][酉耛[蜗譣[耘任咸锨][恿蚞[呐蘛[伊铀屹琢][邢粤盼][谖廖][幸胖蝅[着薦[窍紫襗[蜗譣[晌葡彝撩][釉廖献][南釉招蝅[凵蚁薦[艘涨][邢特谙琢耘蘛[耘萞[犹辙羃[酉谖廖][下菖釉譣[窍韵譣[紫有疑窝][紫有咸刳献羃

View File

@ -0,0 +1,2 @@
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.

View File

@ -0,0 +1,2 @@
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.