mirror of https://github.com/apache/lucene.git
copy the Russian and German analyzers plus their test cases to the sandbox
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150998 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
726ddaeb5a
commit
87bcdf6f25
sandbox/contributions/analyzers/src
java/org/apache/lucene/analysis
de
ru
test/org/apache/lucene/analysis
|
@ -0,0 +1,135 @@
|
|||
package org.apache.lucene.analysis.de;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Analyzer for German language. Supports an external list of stopwords (words that
|
||||
* will not be indexed at all) and an external list of exclusions (word that will
|
||||
* not be stemmed, but indexed).
|
||||
* A default set of stopwords is used unless an alternative list is specified, the
|
||||
* exclusion list is empty by default.
|
||||
*
|
||||
* @author Gerhard Schwarz
|
||||
* @version $Id$
|
||||
*/
|
||||
public class GermanAnalyzer extends Analyzer {
|
||||
/**
|
||||
* List of typical german stopwords.
|
||||
*/
|
||||
private String[] GERMAN_STOP_WORDS = {
|
||||
"einer", "eine", "eines", "einem", "einen",
|
||||
"der", "die", "das", "dass", "daß",
|
||||
"du", "er", "sie", "es",
|
||||
"was", "wer", "wie", "wir",
|
||||
"und", "oder", "ohne", "mit",
|
||||
"am", "im", "in", "aus", "auf",
|
||||
"ist", "sein", "war", "wird",
|
||||
"ihr", "ihre", "ihres",
|
||||
"als", "für", "von", "mit",
|
||||
"dich", "dir", "mich", "mir",
|
||||
"mein", "sein", "kein",
|
||||
"durch", "wegen", "wird"
|
||||
};
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Set stopSet = new HashSet();
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Set exclusionSet = new HashSet();
|
||||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public GermanAnalyzer() {
|
||||
stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public GermanAnalyzer(String[] stopwords) {
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public GermanAnalyzer(Hashtable stopwords) {
|
||||
stopSet = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public GermanAnalyzer(File stopwords) throws IOException {
|
||||
stopSet = WordlistLoader.getWordSet(stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
*/
|
||||
public void setStemExclusionTable(String[] exclusionlist) {
|
||||
exclusionSet = StopFilter.makeStopSet(exclusionlist);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from a Hashtable.
|
||||
*/
|
||||
public void setStemExclusionTable(Hashtable exclusionlist) {
|
||||
exclusionSet = new HashSet(exclusionlist.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) throws IOException {
|
||||
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(reader);
|
||||
result = new StandardFilter(result);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new StopFilter(result, stopSet);
|
||||
result = new GermanStemFilter(result, exclusionSet);
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
package org.apache.lucene.analysis.de;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* A filter that stems German words. It supports a table of words that should
|
||||
* not be stemmed at all. The stemmer used can be changed at runtime after the
|
||||
* filter object is created (as long as it is a GermanStemmer).
|
||||
*
|
||||
* @author Gerhard Schwarz
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class GermanStemFilter extends TokenFilter
|
||||
{
|
||||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private GermanStemmer stemmer = null;
|
||||
private Set exclusionSet = null;
|
||||
|
||||
public GermanStemFilter( TokenStream in )
|
||||
{
|
||||
super(in);
|
||||
stemmer = new GermanStemmer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a GermanStemFilter that uses an exclusiontable.
|
||||
* @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
|
||||
*/
|
||||
public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
|
||||
{
|
||||
this( in );
|
||||
exclusionSet = new HashSet(exclusiontable.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a GermanStemFilter that uses an exclusiontable.
|
||||
*/
|
||||
public GermanStemFilter( TokenStream in, Set exclusionSet )
|
||||
{
|
||||
this( in );
|
||||
this.exclusionSet = exclusionSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public final Token next()
|
||||
throws IOException
|
||||
{
|
||||
if ( ( token = input.next() ) == null ) {
|
||||
return null;
|
||||
}
|
||||
// Check the exclusiontable
|
||||
else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
|
||||
return token;
|
||||
}
|
||||
else {
|
||||
String s = stemmer.stem( token.termText() );
|
||||
// If not stemmed, dont waste the time creating a new token
|
||||
if ( !s.equals( token.termText() ) ) {
|
||||
return new Token( s, token.startOffset(),
|
||||
token.endOffset(), token.type() );
|
||||
}
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a alternative/custom GermanStemmer for this filter.
|
||||
*/
|
||||
public void setStemmer( GermanStemmer stemmer )
|
||||
{
|
||||
if ( stemmer != null ) {
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set an alternative exclusion list for this filter.
|
||||
* @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
|
||||
*/
|
||||
public void setExclusionTable( Hashtable exclusiontable )
|
||||
{
|
||||
exclusionSet = new HashSet(exclusiontable.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Set an alternative exclusion list for this filter.
|
||||
*/
|
||||
public void setExclusionSet( Set exclusionSet )
|
||||
{
|
||||
this.exclusionSet = exclusionSet;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,265 @@
|
|||
package org.apache.lucene.analysis.de;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A stemmer for German words. The algorithm is based on the report
|
||||
* "A Fast and Simple Stemming Algorithm for German Words" by Jörg
|
||||
* Caumanns (joerg.caumanns@isst.fhg.de).
|
||||
*
|
||||
* @author Gerhard Schwarz
|
||||
* @version $Id$
|
||||
*/
|
||||
public class GermanStemmer
{
    /**
     * Buffer for the terms while stemming them.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
     * Used by <tt>strip()</tt> to reason about the original word length.
     */
    private int substCount = 0;

    /**
     * Stems the given term to a unique <tt>discriminator</tt>.
     * Algorithm based on "A Fast and Simple Stemming Algorithm for German Words"
     * by Joerg Caumanns.
     *
     * @param term The term that should be stemmed.
     * @return Discriminator for <tt>term</tt>
     */
    protected String stem( String term )
    {
        // Use lowercase for medium stemming.
        // NOTE(review): uses the default locale; behaviour can differ for words
        // containing 'I' under e.g. a Turkish locale — confirm if relevant.
        term = term.toLowerCase();
        if ( !isStemmable( term ) )
            return term;
        // Reset the StringBuffer.
        sb.delete( 0, sb.length() );
        sb.insert( 0, term );
        // Stemming starts here...
        substitute( sb );
        strip( sb );
        optimize( sb );
        resubstitute( sb );
        removeParticleDenotion( sb );
        return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @return true if, and only if, the given term consists in letters.
     */
    private boolean isStemmable( String term )
    {
        for ( int c = 0; c < term.length(); c++ ) {
            if ( !Character.isLetter( term.charAt( c ) ) )
                return false;
        }
        return true;
    }

    /**
     * Suffix stripping (stemming) on the current term. The stripping is reduced
     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
     * from which all regular suffixes are built. The simplification causes
     * some overstemming, and way more irregular stems, but still provides
     * unique discriminators in most cases.
     * The algorithm is context free, except for the length restrictions
     * (which are computed against the pre-substitution length via substCount).
     */
    private void strip( StringBuffer buffer )
    {
        boolean doMore = true;
        while ( doMore && buffer.length() > 3 ) {
            if ( ( buffer.length() + substCount > 5 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
            {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            // "t" occurs only as suffix of verbs.
            else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else {
                doMore = false;
            }
        }
    }

    /**
     * Does some optimizations on the term. These optimisations are contextual.
     */
    private void optimize( StringBuffer buffer )
    {
        // Additional step for female plurals of professions and inhabitants.
        if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
            buffer.deleteCharAt( buffer.length() - 1 );
            strip( buffer );
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
            buffer.setCharAt( buffer.length() - 1, 'x' );
        }
    }

    /**
     * Removes a particle denotion ("ge") from a term
     * (only the first "gege" occurrence is reduced).
     */
    private void removeParticleDenotion( StringBuffer buffer )
    {
        if ( buffer.length() > 4 ) {
            for ( int c = 0; c < buffer.length() - 3; c++ ) {
                if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
                    buffer.delete( c, c + 2 );
                    return;
                }
            }
        }
    }

    /**
     * Do some substitutions for the term to reduce overstemming:
     *
     * - Substitute Umlauts with their corresponding vowel: a/o/u umlauts -> aou,
     *   sharp s ("\u00df") is substituted by "ss"
     * - Substitute a second char of a pair of equal characters with
     *   an asterisk: ?? -> ?*
     * - Substitute some common character combinations with a token:
     *   sch/ch/ei/ie/ig/st -> $/\u00a7/%/&amp;/#/!
     */
    private void substitute( StringBuffer buffer )
    {
        substCount = 0;
        for ( int c = 0; c < buffer.length(); c++ ) {
            // Replace the second char of a pair of the equal characters with an asterisk
            if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
                buffer.setCharAt( c, '*' );
            }
            // Substitute Umlauts.
            else if ( buffer.charAt( c ) == '\u00e4' ) {        // a-umlaut
                buffer.setCharAt( c, 'a' );
            }
            else if ( buffer.charAt( c ) == '\u00f6' ) {        // o-umlaut
                buffer.setCharAt( c, 'o' );
            }
            else if ( buffer.charAt( c ) == '\u00fc' ) {        // u-umlaut
                buffer.setCharAt( c, 'u' );
            }
            // Sharp s -> "ss"; handled here so it also works at the end of a word.
            else if ( buffer.charAt( c ) == '\u00df' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 's' );
                substCount++;
            }
            // Take care that at least one character is left left side from the current one
            if ( c < buffer.length() - 1 ) {
                // Masking several common character combinations with an token
                if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
                    buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
                {
                    buffer.setCharAt( c, '$' );
                    buffer.delete( c + 1, c + 3 );
                    // BUGFIX: was "substCount =+ 2" (assigns +2), which threw away
                    // any count accumulated so far and skewed the length checks in
                    // strip(); "+=" accumulates correctly.
                    substCount += 2;
                }
                else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
                    buffer.setCharAt( c, '\u00a7' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
                    buffer.setCharAt( c, '%' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
                    buffer.setCharAt( c, '&' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
                    buffer.setCharAt( c, '#' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
                    buffer.setCharAt( c, '!' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
            }
        }
    }

    /**
     * Undoes the changes made by substitute(). That are character pairs and
     * character combinations. Umlauts will remain as their corresponding vowel,
     * as sharp s remains as "ss".
     */
    private void resubstitute( StringBuffer buffer )
    {
        for ( int c = 0; c < buffer.length(); c++ ) {
            if ( buffer.charAt( c ) == '*' ) {
                char x = buffer.charAt( c - 1 );
                buffer.setCharAt( c, x );
            }
            else if ( buffer.charAt( c ) == '$' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
            }
            else if ( buffer.charAt( c ) == '\u00a7' ) {
                buffer.setCharAt( c, 'c' );
                buffer.insert( c + 1, 'h' );
            }
            else if ( buffer.charAt( c ) == '%' ) {
                buffer.setCharAt( c, 'e' );
                buffer.insert( c + 1, 'i' );
            }
            else if ( buffer.charAt( c ) == '&' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'e' );
            }
            else if ( buffer.charAt( c ) == '#' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'g' );
            }
            else if ( buffer.charAt( c ) == '!' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 't' );
            }
        }
    }

}
|
|
@ -0,0 +1,111 @@
|
|||
package org.apache.lucene.analysis.de;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.LineNumberReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Loader for text files that represent a list of stopwords.
|
||||
*
|
||||
* @author Gerhard Schwarz
|
||||
* @version $Id$
|
||||
*
|
||||
* @todo this is not specific to German, it should be moved up
|
||||
*/
|
||||
public class WordlistLoader {

  /**
   * Loads a text file and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   */
  public static HashSet getWordSet(File wordfile) throws IOException {
    HashSet words = new HashSet();
    FileReader reader = null;
    LineNumberReader lines = null;
    try {
      reader = new FileReader(wordfile);
      lines = new LineNumberReader(reader);
      for (String line = lines.readLine(); line != null; line = lines.readLine()) {
        words.add(line.trim());
      }
    }
    finally {
      // Close the wrapper first, then the underlying reader.
      if (lines != null)
        lines.close();
      if (reader != null)
        reader.close();
    }
    return words;
  }

  /**
   * @param path Path to the wordlist
   * @param wordfile Name of the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
   */
  public static Hashtable getWordtable(String path, String wordfile) throws IOException {
    return getWordtable(new File(path, wordfile));
  }

  /**
   * @param wordfile Complete path to the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
   */
  public static Hashtable getWordtable(String wordfile) throws IOException {
    return getWordtable(new File(wordfile));
  }

  /**
   * @param wordfile File object that points to the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} getWordSet(File)} instead
   */
  public static Hashtable getWordtable(File wordfile) throws IOException {
    return makeWordTable((HashSet) getWordSet(wordfile));
  }

  /**
   * Builds a wordlist table, using words as both keys and values
   * for backward compatibility.
   *
   * @param wordSet stopword set
   */
  private static Hashtable makeWordTable(HashSet wordSet) {
    Hashtable table = new Hashtable();
    Iterator iter = wordSet.iterator();
    while (iter.hasNext()) {
      String word = (String) iter.next();
      table.put(word, word);
    }
    return table;
  }
}
|
|
@ -0,0 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
Support for indexing and searching of German text.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,259 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* Analyzer for Russian language. Supports an external list of stopwords (words that
|
||||
* will not be indexed at all).
|
||||
* A default set of stopwords is used unless an alternative list is specified.
|
||||
*
|
||||
* @author Boris Okner, b.okner@rogers.com
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class RussianAnalyzer extends Analyzer
{
    // letters (currently unused letters are commented out)
    // NOTE: these "char" constants are not characters but 0-based indexes into
    // a 32-entry lowercase charset array (see the charset field below); the
    // names are transliterations of the Russian letters they stand for.
    private final static char A = 0;
    private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    private final static char D = 4;
    private final static char E = 5;
    private final static char ZH = 6;
    private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    private final static char P = 15;
    private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    /**
     * List of typical Russian stopwords.
     */
    // Stored charset-independently as sequences of letter indexes; translated
    // to concrete strings by makeStopWords(char[]) using the active charset.
    private static char[][] RUSSIAN_STOP_WORDS = {
        {A},
        {B, E, Z},
        {B, O, L, E, E},
        {B, Y},
        {B, Y, L},
        {B, Y, L, A},
        {B, Y, L, I},
        {B, Y, L, O},
        {B, Y, T, SOFT},
        {V},
        {V, A, M},
        {V, A, S},
        {V, E, S, SOFT},
        {V, O},
        {V, O, T},
        {V, S, E},
        {V, S, E, G, O},
        {V, S, E, X},
        {V, Y},
        {G, D, E},
        {D, A},
        {D, A, ZH, E},
        {D, L, IA},
        {D, O},
        {E, G, O},
        {E, E},
        {E, I_,},
        {E, IU},
        {E, S, L, I},
        {E, S, T, SOFT},
        {E, SHCH, E},
        {ZH, E},
        {Z, A},
        {Z, D, E, S, SOFT},
        {I},
        {I, Z},
        {I, L, I},
        {I, M},
        {I, X},
        {K},
        {K, A, K},
        {K, O},
        {K, O, G, D, A},
        {K, T, O},
        {L, I},
        {L, I, B, O},
        {M, N, E},
        {M, O, ZH, E, T},
        {M, Y},
        {N, A},
        {N, A, D, O},
        {N, A, SH},
        {N, E},
        {N, E, G, O},
        {N, E, E},
        {N, E, T},
        {N, I},
        {N, I, X},
        {N, O},
        {N, U},
        {O},
        {O, B},
        {O, D, N, A, K, O},
        {O, N},
        {O, N, A},
        {O, N, I},
        {O, N, O},
        {O, T},
        {O, CH, E, N, SOFT},
        {P, O},
        {P, O, D},
        {P, R, I},
        {S},
        {S, O},
        {T, A, K},
        {T, A, K, ZH, E},
        {T, A, K, O, I_},
        {T, A, M},
        {T, E},
        {T, E, M},
        {T, O},
        {T, O, G, O},
        {T, O, ZH, E},
        {T, O, I_},
        {T, O, L, SOFT, K, O},
        {T, O, M},
        {T, Y},
        {U},
        {U, ZH, E},
        {X, O, T, IA},
        {CH, E, G, O},
        {CH, E, I_},
        {CH, E, M},
        {CH, T, O},
        {CH, T, O, B, Y},
        {CH, SOFT, E},
        {CH, SOFT, IA},
        {AE, T, A},
        {AE, T, I},
        {AE, T, O},
        {IA}
    };

    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Set stopSet = new HashSet();

    /**
     * Charset for Russian letters.
     * Represents encoding for 32 lowercase Russian letters.
     * Predefined charsets can be taken from RussianCharSets class
     */
    private char[] charset;


    // Default constructor: Unicode charset with the default stopword list.
    public RussianAnalyzer() {
        charset = RussianCharsets.UnicodeRussian;
        stopSet = StopFilter.makeStopSet(
            makeStopWords(RussianCharsets.UnicodeRussian));
    }

    /**
     * Builds an analyzer.
     */
    // Uses the default stopword list, translated into the supplied charset.
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    }

    /**
     * Builds an analyzer with the given stop words.
     */
    // Here the caller supplies already-encoded stop words; the charset is only
    // used for tokenizing/lowercasing in tokenStream().
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(stopwords);
    }

    // Takes russian stop words and translates them to a String array, using
    // the given charset
    private static String[] makeStopWords(char[] charset)
    {
        String[] res = new String[RUSSIAN_STOP_WORDS.length];
        for (int i = 0; i < res.length; i++)
        {
            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
            // translate the word, using the charset
            // (each entry of theStopWord is an index into the charset array)
            StringBuffer theWord = new StringBuffer();
            for (int j = 0; j < theStopWord.length; j++)
            {
                theWord.append(charset[theStopWord[j]]);
            }
            res[i] = theWord.toString();
        }
        return res;
    }

    /**
     * Builds an analyzer with the given stop words.
     * @todo create a Set version of this ctor
     */
    // Keys of the table are the stop words; values are ignored.
    public RussianAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
        stopSet = new HashSet(stopwords.keySet());
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @return A TokenStream build from a RussianLetterTokenizer filtered with
     *         RussianLowerCaseFilter, StopFilter, and RussianStemFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        TokenStream result = new RussianLetterTokenizer(reader, charset);
        result = new RussianLowerCaseFilter(result, charset);
        result = new StopFilter(result, stopSet);
        result = new RussianStemFilter(result, charset);
        return result;
    }
}
|
|
@ -0,0 +1,279 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
 * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
 * for Russian characters in Unicode, KOI8 and CP1251.
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to toLowerCase() method for that charset.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset (lowercase letters only)
    public static char[] UnicodeRussian = {
        // lower case: U+0430 ('а') .. U+044F ('я')
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case: U+0410 ('А') .. U+042F ('Я')
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    // KOI8 charset (KOI8-R byte values; alphabetical order of the
    // corresponding Cyrillic letters, not numerical order of the bytes)
    public static char[] KOI8 = {
        // lower case
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
    };

    // CP1251 charset (bytes are already in alphabetical order)
    public static char[] CP1251 = {
        // lower case: 0xE0 .. 0xFF
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case: 0xC0 .. 0xDF
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
    };

    /**
     * Lower-cases a single character according to the given charset.
     * For the three known charsets only a range check and a fixed offset of 32
     * between the upper- and lowercase blocks is needed; any character outside
     * the charset's Russian range (and any unknown charset) falls back to
     * {@link Character#toLowerCase(char)}.
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                // already lowercase
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                return (char) (letter + 32);
            }
        }

        if (charset == KOI8)
        {
            // NOTE: in KOI8 the uppercase block (0xe0-0xff) is ABOVE the
            // lowercase block (0xc0-0xdf), so uppercase maps down by 32
            if (letter >= 0xe0 && letter <= 0xff)
            {
                return (char) (letter - 32);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                // already lowercase
                return letter;
            }

        }

        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                return (char) (letter + 32);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                // already lowercase
                return letter;
            }

        }

        // not a Russian letter of the selected charset (or unknown charset):
        // delegate to the standard Unicode-aware lowering
        return Character.toLowerCase(letter);
    }
}
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.CharTokenizer;
|
||||
|
||||
/**
 * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
 * in a given "russian charset". The problem with LetterTokenizer is that it uses the Character.isLetter() method,
 * which doesn't know how to detect letters in single-byte encodings like CP1251 and KOI8
 * (well-known problems with the 0xD7 and 0xF7 chars, which map to the non-letter
 * codepoints U+00D7 '×' and U+00F7 '÷' when treated as Latin-1).
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */

public class RussianLetterTokenizer extends CharTokenizer
{
    // extra characters (beyond Character.isLetter) that count as token chars
    private char[] charset;

    /** Construct a new RussianLetterTokenizer reading from the given Reader,
     *  accepting the characters of the given charset as letters. */
    public RussianLetterTokenizer(Reader in, char[] charset)
    {
        super(in);
        this.charset = charset;
    }

    /**
     * Collects characters which satisfy
     * {@link Character#isLetter(char)} or which are members of the
     * configured charset.
     */
    protected boolean isTokenChar(char c)
    {
        if (Character.isLetter(c))
            return true;
        // linear scan is fine: charsets hold at most 64 characters
        for (int i = 0; i < charset.length; i++)
        {
            if (c == charset[i])
                return true;
        }
        return false;
    }
}
|
|
@ -0,0 +1,60 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Normalizes token text to lower case, analyzing given ("russian") charset.
|
||||
*
|
||||
* @author Boris Okner, b.okner@rogers.com
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class RussianLowerCaseFilter extends TokenFilter
|
||||
{
|
||||
char[] charset;
|
||||
|
||||
public RussianLowerCaseFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
public final Token next() throws java.io.IOException
|
||||
{
|
||||
Token t = input.next();
|
||||
|
||||
if (t == null)
|
||||
return null;
|
||||
|
||||
String txt = t.termText();
|
||||
|
||||
char[] chArray = txt.toCharArray();
|
||||
for (int i = 0; i < chArray.length; i++)
|
||||
{
|
||||
chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
|
||||
}
|
||||
|
||||
String newTxt = new String(chArray);
|
||||
// create new token
|
||||
Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
|
||||
|
||||
return newToken;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
|
||||
* The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter ,
|
||||
* because RussianStemFilter only works with lowercase part of any "russian" charset.
|
||||
*
|
||||
* @author Boris Okner, b.okner@rogers.com
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class RussianStemFilter extends TokenFilter
|
||||
{
|
||||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private RussianStemmer stemmer = null;
|
||||
|
||||
public RussianStemFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
stemmer = new RussianStemmer(charset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public final Token next() throws IOException
|
||||
{
|
||||
if ((token = input.next()) == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
else
|
||||
{
|
||||
String s = stemmer.stem(token.termText());
|
||||
if (!s.equals(token.termText()))
|
||||
{
|
||||
return new Token(s, token.startOffset(), token.endOffset(),
|
||||
token.type());
|
||||
}
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a alternative/custom RussianStemmer for this filter.
|
||||
*/
|
||||
public void setStemmer(RussianStemmer stemmer)
|
||||
{
|
||||
if (stemmer != null)
|
||||
{
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,629 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 *
 * Letters are represented symbolically as indices (constants A..IA below) into a
 * 64-element charset array (see RussianCharsets), so the same algorithm works for
 * Unicode, KOI8 and CP1251 input.
 *
 * NOTE: instances are stateful (markPositions mutates the RV/R1/R2 fields during
 * stem()), so a single instance is not safe for concurrent use.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
class RussianStemmer
{
    // maps the symbolic letter constants below to concrete characters
    private char[] charset;

    // positions of RV, R1 and R2 respectively
    private int RV, R1, R2;

    // letters (currently unused letters are commented out)
    private final static char A = 0;
    //private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    //private final static char D = 4;
    private final static char E = 5;
    //private final static char ZH = 6;
    //private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    //private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    //private final static char P = 15;
    //private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    //private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    // stem definitions
    private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    // endings of class 1 are removed only when preceded by one of these
    private static char[][] perfectiveGerund1Predessors = {
        { A },
        { IA }
    };

    private static char[][] perfectiveGerundEndings2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    private static char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    // endings of class 1 are removed only when preceded by one of these
    private static char[][] participle1Predessors = {
        { A },
        { IA }
    };

    private static char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    // endings of class 1 are removed only when preceded by one of these
    private static char[][] verb1Predessors = {
        { A },
        { IA }
    };

    private static char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        { U },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    /**
     * Constructs a stemmer with no charset; setCharset() must be called
     * before stem() (used by the static stem(String, char[]) helper).
     */
    public RussianStemmer()
    {
        super();
    }

    /**
     * Constructs a stemmer for the given charset.
     */
    public RussianStemmer(char[] charset)
    {
        super();
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
            return false;
        // if adjective ending was found, try for participle ending.
        // variable r is unused, we are just interested in the side effect of
        // findAndRemoveEnding():
        boolean r =
            findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
            ||
            findAndRemoveEnding(stemmingZone, participleEndings2);
        return true;
    }

    /**
     * Derivational endings
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
            // no derivational ending found
            return false;
        else
        {
            // Ensure that the ending locates in R2
            // (stemmingZone starts at RV, so R2 relative to the zone is R2 - RV)
            if (R2 - RV <= stemmingZone.length() - endingLength)
            {
                stemmingZone.setLength(stemmingZone.length() - endingLength);
                return true;
            }
            else
            {
                return false;
            }
        }
    }

    /**
     * Finds ending among given ending class and returns the length of ending found(0, if not found).
     * Matching is done backwards from startIndex; charset[] translates the
     * symbolic letters of the ending tables into concrete characters.
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
    {
        boolean match = false;
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // check if the ending is bigger than stemming zone
            if (startIndex < theEnding.length - 1)
            {
                match = false;
                continue;
            }
            match = true;
            int stemmingIndex = startIndex;
            // compare the ending right-to-left against the zone
            for (int j = theEnding.length - 1; j >= 0; j--)
            {
                if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
                {
                    match = false;
                    break;
                }
            }
            // check if ending was found
            if (match)
            {
                return theEndingClass[i].length; // cut ending
            }
        }
        return 0;
    }

    // convenience overload: match against the very end of the stemming zone
    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        else {
            stemmingZone.setLength(stemmingZone.length() - endingLength);
            // cut the ending found
            return true;
        }
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predessors, and if so, removes it from stemming zone.
     * Note: only the ending itself is removed; the predecessor letters stay.
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
        char[][] theEndingClass, char[][] thePredessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        else
        {
            int predessorLength =
                findEnding(stemmingZone,
                    stemmingZone.length() - endingLength - 1,
                    thePredessors);
            if (predessorLength == 0)
                return false;
            else {
                stemmingZone.setLength(stemmingZone.length() - endingLength);
                // cut the ending found
                return true;
            }
        }

    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     * RV: position after the first vowel; R1: after the first
     * vowel-run following RV; R2: after the next non-vowel/vowel
     * alternation. A field left at 0 means that zone is empty.
     * Creation date: (16/03/2002 3:40:11 PM)
     */
    private void markPositions(String word)
    {
        RV = 0;
        R1 = 0;
        R2 = 0;
        int i = 0;
        // find RV
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // RV zone is empty
        RV = i;
        // find R1
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R1 zone is empty
        R1 = i;
        // find R2
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        R2 = i;
    }

    /**
     * Checks if character is a vowel (according to the current charset).
     * Creation date: (16/03/2002 10:47:03 PM)
     * @return boolean
     * @param letter char
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
                return true;
        }
        return false;
    }

    /**
     * Noun endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes a trailing letter I from the stemming zone, if present.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    }

    /**
     * Removes a trailing soft sign from the stemming zone, if present.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    }

    /**
     * Replaces the charset used to map symbolic letters to characters.
     * Creation date: (16/03/2002 10:58:42 PM)
     * @param newCharset char[]
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Finds the stem for given Russian word.
     * The word is expected to be lowercase in the configured charset.
     * Creation date: (16/03/2002 3:36:48 PM)
     * @return java.lang.String
     * @param input java.lang.String
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
            return input; //RV wasn't detected, nothing to stem
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
        // stemming goes on in RV
        // Step 1
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            // variable r is unused, we are just interested in the flow that gets
            // created by logical expression: apply adjectival(); if that fails,
            // apply verb() etc
            boolean r =
                adjectival(stemmingZone)
                || verb(stemmingZone)
                || noun(stemmingZone);
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result: the untouched prefix plus the stemmed zone
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N: a trailing "NN" is reduced to a single "N".
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        char[][] doubleN = {
            { N, N }
        };
        if (findEnding(stemmingZone, doubleN) != 0)
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    }

    /**
     * Verb endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            verbEndings1,
            verb1Predessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets
     */
    public static String stem(String theWord, char[] charset)
    {
        RussianStemmer stemmer = new RussianStemmer();
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
}
|
|
@ -0,0 +1,5 @@
|
|||
<html>
|
||||
<body>
|
||||
Support for indexing and searching Russian text.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,78 @@
|
|||
package org.apache.lucene.analysis.de;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
/**
|
||||
* Test the German stemmer. The stemming algorithm is known to work less
|
||||
* than perfect, as it doesn't use any word lists with exceptions. We
|
||||
* also check some of the cases where the algorithm is wrong.
|
||||
*
|
||||
* @author Daniel Naber
|
||||
*/
|
||||
public class TestGermanStemFilter extends TestCase {
|
||||
|
||||
public void testStemming() {
|
||||
try {
|
||||
// read test cases from external file:
|
||||
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
|
||||
FileInputStream fis = new FileInputStream(testFile);
|
||||
InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1");
|
||||
BufferedReader breader = new BufferedReader(isr);
|
||||
while(true) {
|
||||
String line = breader.readLine();
|
||||
if (line == null)
|
||||
break;
|
||||
line = line.trim();
|
||||
if (line.startsWith("#") || line.equals(""))
|
||||
continue; // ignore comments and empty lines
|
||||
String[] parts = line.split(";");
|
||||
//System.out.println(parts[0] + " -- " + parts[1]);
|
||||
check(parts[0], parts[1]);
|
||||
}
|
||||
breader.close();
|
||||
isr.close();
|
||||
fis.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
|
||||
GermanStemFilter filter = new GermanStemFilter(tokenStream);
|
||||
Token t = filter.next();
|
||||
if (t == null)
|
||||
fail();
|
||||
assertEquals(expected, t.termText());
|
||||
filter.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
# German special characters are replaced:
|
||||
häufig;haufig
|
||||
|
||||
# here the stemmer works okay, it maps related words to the same stem:
|
||||
abschließen;abschliess
|
||||
abschließender;abschliess
|
||||
abschließendes;abschliess
|
||||
abschließenden;abschliess
|
||||
|
||||
Tisch;tisch
|
||||
Tische;tisch
|
||||
Tischen;tisch
|
||||
|
||||
Haus;hau
|
||||
Hauses;hau
|
||||
Häuser;hau
|
||||
Häusern;hau
|
||||
# here's a case where overstemming occurs, i.e. a word is
|
||||
# mapped to the same stem as unrelated words:
|
||||
hauen;hau
|
||||
|
||||
# here's a case where understemming occurs, i.e. two related words
|
||||
# are not mapped to the same stem. This is the case with basically
|
||||
# all irregular forms:
|
||||
Drama;drama
|
||||
Dramen;dram
|
||||
|
||||
# replace "ß" with 'ss':
|
||||
Ausmaß;ausmass
|
||||
|
||||
# fake words to test if suffixes are cut off:
|
||||
xxxxxe;xxxxx
|
||||
xxxxxs;xxxxx
|
||||
xxxxxn;xxxxx
|
||||
xxxxxt;xxxxx
|
||||
xxxxxem;xxxxx
|
||||
xxxxxer;xxxxx
|
||||
xxxxxnd;xxxxx
|
||||
# the suffixes are also removed when combined:
|
||||
xxxxxetende;xxxxx
|
||||
|
||||
# words that are shorter than four characters are not changed:
|
||||
xxe;xxe
|
||||
# -em and -er are not removed from words shorter than five characters:
|
||||
xxem;xxem
|
||||
xxer;xxer
|
||||
# -nd is not removed from words shorter than six characters:
|
||||
xxxnd;xxxnd
|
|
@ -0,0 +1,170 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
/**
|
||||
* Test case for RussianAnalyzer.
|
||||
*
|
||||
* @author Boris Okner
|
||||
* @version $Id$
|
||||
*/
|
||||
|
||||
public class TestRussianAnalyzer extends TestCase
|
||||
{
|
||||
private InputStreamReader inWords;
|
||||
|
||||
private InputStreamReader sampleUnicode;
|
||||
|
||||
private Reader inWordsKOI8;
|
||||
|
||||
private Reader sampleKOI8;
|
||||
|
||||
private Reader inWords1251;
|
||||
|
||||
private Reader sample1251;
|
||||
|
||||
private File dataDir;
|
||||
|
||||
protected void setUp() throws Exception
|
||||
{
|
||||
dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
}
|
||||
|
||||
public void testUnicode() throws IOException
|
||||
{
|
||||
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
|
||||
inWords =
|
||||
new InputStreamReader(
|
||||
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUnicode.txt")),
|
||||
"Unicode");
|
||||
|
||||
sampleUnicode =
|
||||
new InputStreamReader(
|
||||
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUnicode.htm")),
|
||||
"Unicode");
|
||||
|
||||
TokenStream in = ra.tokenStream("all", inWords);
|
||||
|
||||
RussianLetterTokenizer sample =
|
||||
new RussianLetterTokenizer(
|
||||
sampleUnicode,
|
||||
RussianCharsets.UnicodeRussian);
|
||||
|
||||
for (;;)
|
||||
{
|
||||
Token token = in.next();
|
||||
|
||||
if (token == null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
Token sampleToken = sample.next();
|
||||
assertEquals(
|
||||
"Unicode",
|
||||
token.termText(),
|
||||
sampleToken == null
|
||||
? null
|
||||
: sampleToken.termText());
|
||||
}
|
||||
|
||||
inWords.close();
|
||||
sampleUnicode.close();
|
||||
}
|
||||
|
||||
public void testKOI8() throws IOException
|
||||
{
|
||||
//System.out.println(new java.util.Date());
|
||||
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
|
||||
// KOI8
|
||||
inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
|
||||
|
||||
sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
|
||||
|
||||
TokenStream in = ra.tokenStream("all", inWordsKOI8);
|
||||
RussianLetterTokenizer sample =
|
||||
new RussianLetterTokenizer(
|
||||
sampleKOI8,
|
||||
RussianCharsets.KOI8);
|
||||
|
||||
for (;;)
|
||||
{
|
||||
Token token = in.next();
|
||||
|
||||
if (token == null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
Token sampleToken = sample.next();
|
||||
assertEquals(
|
||||
"KOI8",
|
||||
token.termText(),
|
||||
sampleToken == null
|
||||
? null
|
||||
: sampleToken.termText());
|
||||
|
||||
}
|
||||
|
||||
inWordsKOI8.close();
|
||||
sampleKOI8.close();
|
||||
}
|
||||
|
||||
public void test1251() throws IOException
|
||||
{
|
||||
// 1251
|
||||
inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
|
||||
|
||||
sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
|
||||
|
||||
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
|
||||
TokenStream in = ra.tokenStream("", inWords1251);
|
||||
RussianLetterTokenizer sample =
|
||||
new RussianLetterTokenizer(
|
||||
sample1251,
|
||||
RussianCharsets.CP1251);
|
||||
|
||||
for (;;)
|
||||
{
|
||||
Token token = in.next();
|
||||
|
||||
if (token == null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
Token sampleToken = sample.next();
|
||||
assertEquals(
|
||||
"1251",
|
||||
token.termText(),
|
||||
sampleToken == null
|
||||
? null
|
||||
: sampleToken.termText());
|
||||
|
||||
}
|
||||
|
||||
inWords1251.close();
|
||||
sample1251.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class TestRussianStem extends TestCase
|
||||
{
|
||||
private ArrayList words = new ArrayList();
|
||||
private ArrayList stems = new ArrayList();
|
||||
|
||||
public TestRussianStem(String name)
|
||||
{
|
||||
super(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see TestCase#setUp()
|
||||
*/
|
||||
protected void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
//System.out.println(new java.util.Date());
|
||||
String str;
|
||||
|
||||
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
|
||||
// open and read words into an array list
|
||||
BufferedReader inWords =
|
||||
new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/wordsUnicode.txt")),
|
||||
"Unicode"));
|
||||
while ((str = inWords.readLine()) != null)
|
||||
{
|
||||
words.add(str);
|
||||
}
|
||||
inWords.close();
|
||||
|
||||
// open and read stems into an array list
|
||||
BufferedReader inStems =
|
||||
new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/stemsUnicode.txt")),
|
||||
"Unicode"));
|
||||
while ((str = inStems.readLine()) != null)
|
||||
{
|
||||
stems.add(str);
|
||||
}
|
||||
inStems.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* @see TestCase#tearDown()
|
||||
*/
|
||||
protected void tearDown() throws Exception
|
||||
{
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
public void testStem()
|
||||
{
|
||||
for (int i = 0; i < words.size(); i++)
|
||||
{
|
||||
//if ( (i % 100) == 0 ) System.err.println(i);
|
||||
String realStem =
|
||||
RussianStemmer.stem(
|
||||
(String) words.get(i),
|
||||
RussianCharsets.UnicodeRussian);
|
||||
assertEquals("unicode", stems.get(i), realStem);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]
|
|
@ -0,0 +1 @@
|
|||
[淄庞註[由蘛[芴潘砸贤燎紊晕][芪乓荾[赏盘][幸拍釉磷膛蝅[铀林][忠琶][囊抛蝅[徘尚註[谖廖][纫廖][粤饰][遮薦[艘涨][邢幼演盼][子阉][滓磐盼][咨韵薦[幸晌][酉耛[蜗譣[耘任咸锨][恿蚞[呐蘛[伊铀屹琢][邢粤盼][谖廖][幸胖蝅[着薦[窍紫襗[蜗譣[晌葡彝撩][釉廖献][南釉招蝅[凵蚁薦[艘涨][邢特谙琢耘蘛[耘萞[犹辙羃[酉谖廖][下菖釉譣[窍韵譣[紫有疑窝][紫有咸刳献羃
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,2 @@
|
|||
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
|
||||
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.
|
|
@ -0,0 +1,2 @@
|
|||
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
|
||||
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue