mirror of https://github.com/apache/lucene.git
copy the Russian and German analyzers plus their test cases to the sandbox
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150998 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
726ddaeb5a
commit
87bcdf6f25
|
@ -0,0 +1,135 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for German language. Supports an external list of stopwords (words that
|
||||||
|
* will not be indexed at all) and an external list of exclusions (word that will
|
||||||
|
* not be stemmed, but indexed).
|
||||||
|
* A default set of stopwords is used unless an alternative list is specified, the
|
||||||
|
* exclusion list is empty by default.
|
||||||
|
*
|
||||||
|
* @author Gerhard Schwarz
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class GermanAnalyzer extends Analyzer {
|
||||||
|
/**
|
||||||
|
* List of typical german stopwords.
|
||||||
|
*/
|
||||||
|
private String[] GERMAN_STOP_WORDS = {
|
||||||
|
"einer", "eine", "eines", "einem", "einen",
|
||||||
|
"der", "die", "das", "dass", "daß",
|
||||||
|
"du", "er", "sie", "es",
|
||||||
|
"was", "wer", "wie", "wir",
|
||||||
|
"und", "oder", "ohne", "mit",
|
||||||
|
"am", "im", "in", "aus", "auf",
|
||||||
|
"ist", "sein", "war", "wird",
|
||||||
|
"ihr", "ihre", "ihres",
|
||||||
|
"als", "für", "von", "mit",
|
||||||
|
"dich", "dir", "mich", "mir",
|
||||||
|
"mein", "sein", "kein",
|
||||||
|
"durch", "wegen", "wird"
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Contains the stopwords used with the StopFilter.
|
||||||
|
*/
|
||||||
|
private Set stopSet = new HashSet();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Contains words that should be indexed but not stemmed.
|
||||||
|
*/
|
||||||
|
private Set exclusionSet = new HashSet();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer.
|
||||||
|
*/
|
||||||
|
public GermanAnalyzer() {
|
||||||
|
stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public GermanAnalyzer(String[] stopwords) {
|
||||||
|
stopSet = StopFilter.makeStopSet(stopwords);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public GermanAnalyzer(Hashtable stopwords) {
|
||||||
|
stopSet = new HashSet(stopwords.keySet());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public GermanAnalyzer(File stopwords) throws IOException {
|
||||||
|
stopSet = WordlistLoader.getWordSet(stopwords);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an exclusionlist from an array of Strings.
|
||||||
|
*/
|
||||||
|
public void setStemExclusionTable(String[] exclusionlist) {
|
||||||
|
exclusionSet = StopFilter.makeStopSet(exclusionlist);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an exclusionlist from a Hashtable.
|
||||||
|
*/
|
||||||
|
public void setStemExclusionTable(Hashtable exclusionlist) {
|
||||||
|
exclusionSet = new HashSet(exclusionlist.keySet());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an exclusionlist from the words contained in the given file.
|
||||||
|
*/
|
||||||
|
public void setStemExclusionTable(File exclusionlist) throws IOException {
|
||||||
|
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||||
|
*
|
||||||
|
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||||
|
* StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
|
||||||
|
*/
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
TokenStream result = new StandardTokenizer(reader);
|
||||||
|
result = new StandardFilter(result);
|
||||||
|
result = new LowerCaseFilter(result);
|
||||||
|
result = new StopFilter(result, stopSet);
|
||||||
|
result = new GermanStemFilter(result, exclusionSet);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,119 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A filter that stems German words. It supports a table of words that should
|
||||||
|
* not be stemmed at all. The stemmer used can be changed at runtime after the
|
||||||
|
* filter object is created (as long as it is a GermanStemmer).
|
||||||
|
*
|
||||||
|
* @author Gerhard Schwarz
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public final class GermanStemFilter extends TokenFilter
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* The actual token in the input stream.
|
||||||
|
*/
|
||||||
|
private Token token = null;
|
||||||
|
private GermanStemmer stemmer = null;
|
||||||
|
private Set exclusionSet = null;
|
||||||
|
|
||||||
|
public GermanStemFilter( TokenStream in )
|
||||||
|
{
|
||||||
|
super(in);
|
||||||
|
stemmer = new GermanStemmer();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds a GermanStemFilter that uses an exclusiontable.
|
||||||
|
* @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
|
||||||
|
*/
|
||||||
|
public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
|
||||||
|
{
|
||||||
|
this( in );
|
||||||
|
exclusionSet = new HashSet(exclusiontable.keySet());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds a GermanStemFilter that uses an exclusiontable.
|
||||||
|
*/
|
||||||
|
public GermanStemFilter( TokenStream in, Set exclusionSet )
|
||||||
|
{
|
||||||
|
this( in );
|
||||||
|
this.exclusionSet = exclusionSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Returns the next token in the stream, or null at EOS
|
||||||
|
*/
|
||||||
|
public final Token next()
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
if ( ( token = input.next() ) == null ) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// Check the exclusiontable
|
||||||
|
else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
String s = stemmer.stem( token.termText() );
|
||||||
|
// If not stemmed, dont waste the time creating a new token
|
||||||
|
if ( !s.equals( token.termText() ) ) {
|
||||||
|
return new Token( s, token.startOffset(),
|
||||||
|
token.endOffset(), token.type() );
|
||||||
|
}
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set a alternative/custom GermanStemmer for this filter.
|
||||||
|
*/
|
||||||
|
public void setStemmer( GermanStemmer stemmer )
|
||||||
|
{
|
||||||
|
if ( stemmer != null ) {
|
||||||
|
this.stemmer = stemmer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set an alternative exclusion list for this filter.
|
||||||
|
* @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
|
||||||
|
*/
|
||||||
|
public void setExclusionTable( Hashtable exclusiontable )
|
||||||
|
{
|
||||||
|
exclusionSet = new HashSet(exclusiontable.keySet());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set an alternative exclusion list for this filter.
|
||||||
|
*/
|
||||||
|
public void setExclusionSet( Set exclusionSet )
|
||||||
|
{
|
||||||
|
this.exclusionSet = exclusionSet;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,265 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A stemmer for German words. The algorithm is based on the report
|
||||||
|
* "A Fast and Simple Stemming Algorithm for German Words" by Jörg
|
||||||
|
* Caumanns (joerg.caumanns@isst.fhg.de).
|
||||||
|
*
|
||||||
|
* @author Gerhard Schwarz
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class GermanStemmer
{
    /**
     * Buffer for the terms while stemming them. Reused across calls, which
     * makes a single GermanStemmer instance NOT thread-safe.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Amount of characters that are removed with <tt>substitute()</tt> while stemming.
     * Used by strip() to compensate length checks for removed characters.
     */
    private int substCount = 0;

    /**
     * Stemms the given term to an unique <tt>discriminator</tt>.
     *
     * @param term The term that should be stemmed.
     * @return Discriminator for <tt>term</tt>; the term itself (lowercased)
     *         if it is not stemmable.
     */
    protected String stem( String term )
    {
        // Use lowercase for medium stemming.
        term = term.toLowerCase();
        if ( !isStemmable( term ) )
            return term;
        // Reset the StringBuffer.
        sb.delete( 0, sb.length() );
        sb.insert( 0, term );
        // Stemming starts here...
        substitute( sb );
        strip( sb );
        optimize( sb );
        resubstitute( sb );
        removeParticleDenotion( sb );
        return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @return true if, and only if, the given term consists in letters.
     */
    private boolean isStemmable( String term )
    {
        for ( int c = 0; c < term.length(); c++ ) {
            if ( !Character.isLetter( term.charAt( c ) ) )
                return false;
        }
        return true;
    }

    /**
     * suffix stripping (stemming) on the current term. The stripping is reduced
     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
     * from which all regular suffixes are build of. The simplification causes
     * some overstemming, and way more irregular stems, but still provides unique.
     * discriminators in the most of those cases.
     * The algorithm is context free, except of the length restrictions.
     */
    private void strip( StringBuffer buffer )
    {
        boolean doMore = true;
        while ( doMore && buffer.length() > 3 ) {
            if ( ( buffer.length() + substCount > 5 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
            {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( ( buffer.length() + substCount > 4 ) &&
                buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
                buffer.delete( buffer.length() - 2, buffer.length() );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            // "t" occurs only as suffix of verbs.
            else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
                buffer.deleteCharAt( buffer.length() - 1 );
            }
            else {
                doMore = false;
            }
        }
    }

    /**
     * Does some optimizations on the term. This optimisations are
     * contextual.
     */
    private void optimize( StringBuffer buffer )
    {
        // Additional step for female plurals of professions and inhabitants.
        if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
            buffer.deleteCharAt( buffer.length() -1 );
            strip( buffer );
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
            buffer.setCharAt( buffer.length() - 1, 'x' );
        }
    }

    /**
     * Removes a particle denotion ("ge") from a term.
     */
    private void removeParticleDenotion( StringBuffer buffer )
    {
        if ( buffer.length() > 4 ) {
            for ( int c = 0; c < buffer.length() - 3; c++ ) {
                if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
                    buffer.delete( c, c + 2 );
                    return;
                }
            }
        }
    }

    /**
     * Do some substitutions for the term to reduce overstemming:
     *
     * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
     *   "ß" is substituted by "ss"
     * - Substitute a second char of a pair of equal characters with
     *   an asterisk: ?? -> ?*
     * - Substitute some common character combinations with a token:
     *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
     */
    private void substitute( StringBuffer buffer )
    {
        substCount = 0;
        for ( int c = 0; c < buffer.length(); c++ ) {
            // Replace the second char of a pair of the equal characters with an asterisk
            if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
                buffer.setCharAt( c, '*' );
            }
            // Substitute Umlauts.
            else if ( buffer.charAt( c ) == 'ä' ) {
                buffer.setCharAt( c, 'a' );
            }
            else if ( buffer.charAt( c ) == 'ö' ) {
                buffer.setCharAt( c, 'o' );
            }
            else if ( buffer.charAt( c ) == 'ü' ) {
                buffer.setCharAt( c, 'u' );
            }
            // Fix bug so that 'ß' at the end of a word is replaced.
            else if ( buffer.charAt( c ) == 'ß' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 's' );
                substCount++;
            }
            // Take care that at least one character is left left side from the current one
            if ( c < buffer.length() - 1 ) {
                // Masking several common character combinations with an token
                if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
                    buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
                {
                    buffer.setCharAt( c, '$' );
                    buffer.delete( c + 1, c + 3 );
                    // BUG FIX: was "substCount =+ 2", which assigns +2 and
                    // discards previously counted substitutions; "+=" accumulates.
                    substCount += 2;
                }
                else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
                    buffer.setCharAt( c, '§' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
                    buffer.setCharAt( c, '%' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
                    buffer.setCharAt( c, '&' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
                    buffer.setCharAt( c, '#' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
                else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
                    buffer.setCharAt( c, '!' );
                    buffer.deleteCharAt( c + 1 );
                    substCount++;
                }
            }
        }
    }

    /**
     * Undoes the changes made by substitute(). That are character pairs and
     * character combinations. Umlauts will remain as their corresponding vowel,
     * as "ß" remains as "ss".
     */
    private void resubstitute( StringBuffer buffer )
    {
        for ( int c = 0; c < buffer.length(); c++ ) {
            if ( buffer.charAt( c ) == '*' ) {
                char x = buffer.charAt( c - 1 );
                buffer.setCharAt( c, x );
            }
            else if ( buffer.charAt( c ) == '$' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
            }
            else if ( buffer.charAt( c ) == '§' ) {
                buffer.setCharAt( c, 'c' );
                buffer.insert( c + 1, 'h' );
            }
            else if ( buffer.charAt( c ) == '%' ) {
                buffer.setCharAt( c, 'e' );
                buffer.insert( c + 1, 'i' );
            }
            else if ( buffer.charAt( c ) == '&' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'e' );
            }
            else if ( buffer.charAt( c ) == '#' ) {
                buffer.setCharAt( c, 'i' );
                buffer.insert( c + 1, 'g' );
            }
            else if ( buffer.charAt( c ) == '!' ) {
                buffer.setCharAt( c, 's' );
                buffer.insert( c + 1, 't' );
            }
        }
    }

}
|
|
@ -0,0 +1,111 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.LineNumberReader;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loader for text files that represent a list of stopwords.
|
||||||
|
*
|
||||||
|
* @author Gerhard Schwarz
|
||||||
|
* @version $Id$
|
||||||
|
*
|
||||||
|
* @todo this is not specific to German, it should be moved up
|
||||||
|
*/
|
||||||
|
public class WordlistLoader {

  /**
   * Loads a text file and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   * @throws IOException if the file cannot be read
   */
  public static HashSet getWordSet(File wordfile) throws IOException {
    HashSet result = new HashSet();
    FileReader freader = null;
    LineNumberReader lnr = null;
    try {
      // NOTE(review): FileReader uses the platform default charset;
      // callers should ensure the wordlist is encoded accordingly.
      freader = new FileReader(wordfile);
      lnr = new LineNumberReader(freader);
      String word = null;
      while ((word = lnr.readLine()) != null) {
        result.add(word.trim());
      }
    }
    finally {
      // Closing lnr also closes the wrapped freader; the second close is a
      // harmless no-op kept for the case where the FileReader was opened but
      // the LineNumberReader constructor was never reached.
      if (lnr != null)
        lnr.close();
      if (freader != null)
        freader.close();
    }
    return result;
  }

  /**
   * Loads a wordlist located at path/wordfile.
   *
   * @param path Path to the wordlist
   * @param wordfile Name of the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} instead
   */
  public static Hashtable getWordtable(String path, String wordfile) throws IOException {
    return getWordtable(new File(path, wordfile));
  }

  /**
   * Loads a wordlist by file name.
   *
   * @param wordfile Complete path to the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} instead
   */
  public static Hashtable getWordtable(String wordfile) throws IOException {
    return getWordtable(new File(wordfile));
  }

  /**
   * Loads a wordlist as a Hashtable.
   *
   * @param wordfile File object that points to the wordlist
   *
   * @deprecated Use {@link #getWordSet(File)} instead
   */
  public static Hashtable getWordtable(File wordfile) throws IOException {
    // getWordSet already returns a HashSet; the original redundant cast was removed.
    HashSet wordSet = getWordSet(wordfile);
    Hashtable result = makeWordTable(wordSet);
    return result;
  }

  /**
   * Builds a wordlist table, using words as both keys and values
   * for backward compatibility.
   *
   * @param wordSet stopword set
   */
  private static Hashtable makeWordTable(HashSet wordSet) {
    Hashtable table = new Hashtable();
    for (Iterator iter = wordSet.iterator(); iter.hasNext();) {
      String word = (String)iter.next();
      table.put(word, word);
    }
    return table;
  }
}
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
Support for indexing and searching of German text.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,259 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.Hashtable;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Russian language. Supports an external list of stopwords (words that
|
||||||
|
* will not be indexed at all).
|
||||||
|
* A default set of stopwords is used unless an alternative list is specified.
|
||||||
|
*
|
||||||
|
* @author Boris Okner, b.okner@rogers.com
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public final class RussianAnalyzer extends Analyzer
|
||||||
|
{
|
||||||
|
// letters (currently unused letters are commented out)
|
||||||
|
private final static char A = 0;
|
||||||
|
private final static char B = 1;
|
||||||
|
private final static char V = 2;
|
||||||
|
private final static char G = 3;
|
||||||
|
private final static char D = 4;
|
||||||
|
private final static char E = 5;
|
||||||
|
private final static char ZH = 6;
|
||||||
|
private final static char Z = 7;
|
||||||
|
private final static char I = 8;
|
||||||
|
private final static char I_ = 9;
|
||||||
|
private final static char K = 10;
|
||||||
|
private final static char L = 11;
|
||||||
|
private final static char M = 12;
|
||||||
|
private final static char N = 13;
|
||||||
|
private final static char O = 14;
|
||||||
|
private final static char P = 15;
|
||||||
|
private final static char R = 16;
|
||||||
|
private final static char S = 17;
|
||||||
|
private final static char T = 18;
|
||||||
|
private final static char U = 19;
|
||||||
|
//private final static char F = 20;
|
||||||
|
private final static char X = 21;
|
||||||
|
//private final static char TS = 22;
|
||||||
|
private final static char CH = 23;
|
||||||
|
private final static char SH = 24;
|
||||||
|
private final static char SHCH = 25;
|
||||||
|
//private final static char HARD = 26;
|
||||||
|
private final static char Y = 27;
|
||||||
|
private final static char SOFT = 28;
|
||||||
|
private final static char AE = 29;
|
||||||
|
private final static char IU = 30;
|
||||||
|
private final static char IA = 31;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List of typical Russian stopwords.
|
||||||
|
*/
|
||||||
|
private static char[][] RUSSIAN_STOP_WORDS = {
|
||||||
|
{A},
|
||||||
|
{B, E, Z},
|
||||||
|
{B, O, L, E, E},
|
||||||
|
{B, Y},
|
||||||
|
{B, Y, L},
|
||||||
|
{B, Y, L, A},
|
||||||
|
{B, Y, L, I},
|
||||||
|
{B, Y, L, O},
|
||||||
|
{B, Y, T, SOFT},
|
||||||
|
{V},
|
||||||
|
{V, A, M},
|
||||||
|
{V, A, S},
|
||||||
|
{V, E, S, SOFT},
|
||||||
|
{V, O},
|
||||||
|
{V, O, T},
|
||||||
|
{V, S, E},
|
||||||
|
{V, S, E, G, O},
|
||||||
|
{V, S, E, X},
|
||||||
|
{V, Y},
|
||||||
|
{G, D, E},
|
||||||
|
{D, A},
|
||||||
|
{D, A, ZH, E},
|
||||||
|
{D, L, IA},
|
||||||
|
{D, O},
|
||||||
|
{E, G, O},
|
||||||
|
{E, E},
|
||||||
|
{E, I_,},
|
||||||
|
{E, IU},
|
||||||
|
{E, S, L, I},
|
||||||
|
{E, S, T, SOFT},
|
||||||
|
{E, SHCH, E},
|
||||||
|
{ZH, E},
|
||||||
|
{Z, A},
|
||||||
|
{Z, D, E, S, SOFT},
|
||||||
|
{I},
|
||||||
|
{I, Z},
|
||||||
|
{I, L, I},
|
||||||
|
{I, M},
|
||||||
|
{I, X},
|
||||||
|
{K},
|
||||||
|
{K, A, K},
|
||||||
|
{K, O},
|
||||||
|
{K, O, G, D, A},
|
||||||
|
{K, T, O},
|
||||||
|
{L, I},
|
||||||
|
{L, I, B, O},
|
||||||
|
{M, N, E},
|
||||||
|
{M, O, ZH, E, T},
|
||||||
|
{M, Y},
|
||||||
|
{N, A},
|
||||||
|
{N, A, D, O},
|
||||||
|
{N, A, SH},
|
||||||
|
{N, E},
|
||||||
|
{N, E, G, O},
|
||||||
|
{N, E, E},
|
||||||
|
{N, E, T},
|
||||||
|
{N, I},
|
||||||
|
{N, I, X},
|
||||||
|
{N, O},
|
||||||
|
{N, U},
|
||||||
|
{O},
|
||||||
|
{O, B},
|
||||||
|
{O, D, N, A, K, O},
|
||||||
|
{O, N},
|
||||||
|
{O, N, A},
|
||||||
|
{O, N, I},
|
||||||
|
{O, N, O},
|
||||||
|
{O, T},
|
||||||
|
{O, CH, E, N, SOFT},
|
||||||
|
{P, O},
|
||||||
|
{P, O, D},
|
||||||
|
{P, R, I},
|
||||||
|
{S},
|
||||||
|
{S, O},
|
||||||
|
{T, A, K},
|
||||||
|
{T, A, K, ZH, E},
|
||||||
|
{T, A, K, O, I_},
|
||||||
|
{T, A, M},
|
||||||
|
{T, E},
|
||||||
|
{T, E, M},
|
||||||
|
{T, O},
|
||||||
|
{T, O, G, O},
|
||||||
|
{T, O, ZH, E},
|
||||||
|
{T, O, I_},
|
||||||
|
{T, O, L, SOFT, K, O},
|
||||||
|
{T, O, M},
|
||||||
|
{T, Y},
|
||||||
|
{U},
|
||||||
|
{U, ZH, E},
|
||||||
|
{X, O, T, IA},
|
||||||
|
{CH, E, G, O},
|
||||||
|
{CH, E, I_},
|
||||||
|
{CH, E, M},
|
||||||
|
{CH, T, O},
|
||||||
|
{CH, T, O, B, Y},
|
||||||
|
{CH, SOFT, E},
|
||||||
|
{CH, SOFT, IA},
|
||||||
|
{AE, T, A},
|
||||||
|
{AE, T, I},
|
||||||
|
{AE, T, O},
|
||||||
|
{IA}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Contains the stopwords used with the StopFilter.
|
||||||
|
*/
|
||||||
|
private Set stopSet = new HashSet();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Charset for Russian letters.
|
||||||
|
* Represents encoding for 32 lowercase Russian letters.
|
||||||
|
* Predefined charsets can be taken from RussianCharSets class
|
||||||
|
*/
|
||||||
|
private char[] charset;
|
||||||
|
|
||||||
|
|
||||||
|
public RussianAnalyzer() {
|
||||||
|
charset = RussianCharsets.UnicodeRussian;
|
||||||
|
stopSet = StopFilter.makeStopSet(
|
||||||
|
makeStopWords(RussianCharsets.UnicodeRussian));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer.
|
||||||
|
*/
|
||||||
|
public RussianAnalyzer(char[] charset)
|
||||||
|
{
|
||||||
|
this.charset = charset;
|
||||||
|
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public RussianAnalyzer(char[] charset, String[] stopwords)
|
||||||
|
{
|
||||||
|
this.charset = charset;
|
||||||
|
stopSet = StopFilter.makeStopSet(stopwords);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Takes russian stop words and translates them to a String array, using
|
||||||
|
// the given charset
|
||||||
|
private static String[] makeStopWords(char[] charset)
|
||||||
|
{
|
||||||
|
String[] res = new String[RUSSIAN_STOP_WORDS.length];
|
||||||
|
for (int i = 0; i < res.length; i++)
|
||||||
|
{
|
||||||
|
char[] theStopWord = RUSSIAN_STOP_WORDS[i];
|
||||||
|
// translate the word, using the charset
|
||||||
|
StringBuffer theWord = new StringBuffer();
|
||||||
|
for (int j = 0; j < theStopWord.length; j++)
|
||||||
|
{
|
||||||
|
theWord.append(charset[theStopWord[j]]);
|
||||||
|
}
|
||||||
|
res[i] = theWord.toString();
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Builds an analyzer for the given charset with the given stop words.
 * @todo create a Set version of this ctor
 */
public RussianAnalyzer(char[] charset, Hashtable stopwords) {
    this.charset = charset;
    this.stopSet = new HashSet(stopwords.keySet());
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||||
|
*
|
||||||
|
* @return A TokenStream build from a RussianLetterTokenizer filtered with
|
||||||
|
* RussianLowerCaseFilter, StopFilter, and RussianStemFilter
|
||||||
|
*/
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||||
|
{
|
||||||
|
TokenStream result = new RussianLetterTokenizer(reader, charset);
|
||||||
|
result = new RussianLowerCaseFilter(result, charset);
|
||||||
|
result = new StopFilter(result, stopSet);
|
||||||
|
result = new RussianStemFilter(result, charset);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,279 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * RussianCharsets class contains encoding schemes (charsets) and toLowerCase() method implementation
 * for russian characters in Unicode, KOI8 and CP1251.
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to toLowerCase() method for that charset.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset (lowercase letters only)
    public static char[] UnicodeRussian = {
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
    };

    // KOI8 charset
    public static char[] KOI8 = {
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
        0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
        0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
        0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
        0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
    };

    // CP1251 charset
    public static char[] CP1251 = {
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
        0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
        0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
        0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
    };

    /**
     * Lower-cases a single letter for the given charset.
     * Uppercase letters from the charset's Russian range are mapped to their
     * lowercase counterparts; lowercase Russian letters are returned as-is;
     * anything else falls back to Character.toLowerCase().
     */
    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F')
            {
                return letter;
            }
            if (letter >= '\u0410' && letter <= '\u042F')
            {
                return (char) (letter + 32);
            }
        }

        if (charset == KOI8)
        {
            // KOI8 stores uppercase 32 above lowercase
            if (letter >= 0xe0 && letter <= 0xff)
            {
                return (char) (letter - 32);
            }
            if (letter >= 0xc0 && letter <= 0xdf)
            {
                return letter;
            }

        }

        if (charset == CP1251)
        {
            // CP1251 stores uppercase 32 below lowercase
            if (letter >= 0xC0 && letter <= 0xDF)
            {
                return (char) (letter + 32);
            }
            if (letter >= 0xE0 && letter <= 0xFF)
            {
                return letter;
            }

        }

        return Character.toLowerCase(letter);
    }
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import org.apache.lucene.analysis.CharTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
|
||||||
|
* in a given "russian charset". The problem with LeterTokenizer is that it uses Character.isLetter() method,
|
||||||
|
* which doesn't know how to detect letters in encodings like CP1252 and KOI8
|
||||||
|
* (well-known problems with 0xD7 and 0xF7 chars)
|
||||||
|
*
|
||||||
|
* @author Boris Okner, b.okner@rogers.com
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class RussianLetterTokenizer extends CharTokenizer
|
||||||
|
{
|
||||||
|
/** Construct a new LetterTokenizer. */
|
||||||
|
private char[] charset;
|
||||||
|
|
||||||
|
public RussianLetterTokenizer(Reader in, char[] charset)
|
||||||
|
{
|
||||||
|
super(in);
|
||||||
|
this.charset = charset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Collects only characters which satisfy
|
||||||
|
* {@link Character#isLetter(char)}.
|
||||||
|
*/
|
||||||
|
protected boolean isTokenChar(char c)
|
||||||
|
{
|
||||||
|
if (Character.isLetter(c))
|
||||||
|
return true;
|
||||||
|
for (int i = 0; i < charset.length; i++)
|
||||||
|
{
|
||||||
|
if (c == charset[i])
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,60 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizes token text to lower case, analyzing given ("russian") charset.
|
||||||
|
*
|
||||||
|
* @author Boris Okner, b.okner@rogers.com
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public final class RussianLowerCaseFilter extends TokenFilter
|
||||||
|
{
|
||||||
|
char[] charset;
|
||||||
|
|
||||||
|
public RussianLowerCaseFilter(TokenStream in, char[] charset)
|
||||||
|
{
|
||||||
|
super(in);
|
||||||
|
this.charset = charset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final Token next() throws java.io.IOException
|
||||||
|
{
|
||||||
|
Token t = input.next();
|
||||||
|
|
||||||
|
if (t == null)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
String txt = t.termText();
|
||||||
|
|
||||||
|
char[] chArray = txt.toCharArray();
|
||||||
|
for (int i = 0; i < chArray.length; i++)
|
||||||
|
{
|
||||||
|
chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
|
||||||
|
}
|
||||||
|
|
||||||
|
String newTxt = new String(chArray);
|
||||||
|
// create new token
|
||||||
|
Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
|
||||||
|
|
||||||
|
return newToken;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,77 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A filter that stems Russian words. The implementation was inspired by GermanStemFilter.
|
||||||
|
* The input should be filtered by RussianLowerCaseFilter before passing it to RussianStemFilter ,
|
||||||
|
* because RussianStemFilter only works with lowercase part of any "russian" charset.
|
||||||
|
*
|
||||||
|
* @author Boris Okner, b.okner@rogers.com
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public final class RussianStemFilter extends TokenFilter
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* The actual token in the input stream.
|
||||||
|
*/
|
||||||
|
private Token token = null;
|
||||||
|
private RussianStemmer stemmer = null;
|
||||||
|
|
||||||
|
public RussianStemFilter(TokenStream in, char[] charset)
|
||||||
|
{
|
||||||
|
super(in);
|
||||||
|
stemmer = new RussianStemmer(charset);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Returns the next token in the stream, or null at EOS
|
||||||
|
*/
|
||||||
|
public final Token next() throws IOException
|
||||||
|
{
|
||||||
|
if ((token = input.next()) == null)
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
String s = stemmer.stem(token.termText());
|
||||||
|
if (!s.equals(token.termText()))
|
||||||
|
{
|
||||||
|
return new Token(s, token.startOffset(), token.endOffset(),
|
||||||
|
token.type());
|
||||||
|
}
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set a alternative/custom RussianStemmer for this filter.
|
||||||
|
*/
|
||||||
|
public void setStemmer(RussianStemmer stemmer)
|
||||||
|
{
|
||||||
|
if (stemmer != null)
|
||||||
|
{
|
||||||
|
this.stemmer = stemmer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,629 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
|
||||||
|
*
|
||||||
|
* @author Boris Okner, b.okner@rogers.com
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
class RussianStemmer
|
||||||
|
{
|
||||||
|
// letter-code -> character table for the active encoding (see RussianCharsets)
private char[] charset;

// positions of RV, R1 and R2 respectively; set by markPositions(),
// 0 means the region is empty
private int RV, R1, R2;
|
||||||
|
|
||||||
|
// letters (currently unused letters are commented out)
|
||||||
|
private final static char A = 0;
|
||||||
|
//private final static char B = 1;
|
||||||
|
private final static char V = 2;
|
||||||
|
private final static char G = 3;
|
||||||
|
//private final static char D = 4;
|
||||||
|
private final static char E = 5;
|
||||||
|
//private final static char ZH = 6;
|
||||||
|
//private final static char Z = 7;
|
||||||
|
private final static char I = 8;
|
||||||
|
private final static char I_ = 9;
|
||||||
|
//private final static char K = 10;
|
||||||
|
private final static char L = 11;
|
||||||
|
private final static char M = 12;
|
||||||
|
private final static char N = 13;
|
||||||
|
private final static char O = 14;
|
||||||
|
//private final static char P = 15;
|
||||||
|
//private final static char R = 16;
|
||||||
|
private final static char S = 17;
|
||||||
|
private final static char T = 18;
|
||||||
|
private final static char U = 19;
|
||||||
|
//private final static char F = 20;
|
||||||
|
private final static char X = 21;
|
||||||
|
//private final static char TS = 22;
|
||||||
|
//private final static char CH = 23;
|
||||||
|
private final static char SH = 24;
|
||||||
|
private final static char SHCH = 25;
|
||||||
|
//private final static char HARD = 26;
|
||||||
|
private final static char Y = 27;
|
||||||
|
private final static char SOFT = 28;
|
||||||
|
private final static char AE = 29;
|
||||||
|
private final static char IU = 30;
|
||||||
|
private final static char IA = 31;
|
||||||
|
|
||||||
|
// stem definitions
|
||||||
|
private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
|
||||||
|
|
||||||
|
private static char[][] perfectiveGerundEndings1 = {
|
||||||
|
{ V },
|
||||||
|
{ V, SH, I },
|
||||||
|
{ V, SH, I, S, SOFT }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] perfectiveGerund1Predessors = {
|
||||||
|
{ A },
|
||||||
|
{ IA }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] perfectiveGerundEndings2 = { { I, V }, {
|
||||||
|
Y, V }, {
|
||||||
|
I, V, SH, I }, {
|
||||||
|
Y, V, SH, I }, {
|
||||||
|
I, V, SH, I, S, SOFT }, {
|
||||||
|
Y, V, SH, I, S, SOFT }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] adjectiveEndings = {
|
||||||
|
{ E, E },
|
||||||
|
{ I, E },
|
||||||
|
{ Y, E },
|
||||||
|
{ O, E },
|
||||||
|
{ E, I_ },
|
||||||
|
{ I, I_ },
|
||||||
|
{ Y, I_ },
|
||||||
|
{ O, I_ },
|
||||||
|
{ E, M },
|
||||||
|
{ I, M },
|
||||||
|
{ Y, M },
|
||||||
|
{ O, M },
|
||||||
|
{ I, X },
|
||||||
|
{ Y, X },
|
||||||
|
{ U, IU },
|
||||||
|
{ IU, IU },
|
||||||
|
{ A, IA },
|
||||||
|
{ IA, IA },
|
||||||
|
{ O, IU },
|
||||||
|
{ E, IU },
|
||||||
|
{ I, M, I },
|
||||||
|
{ Y, M, I },
|
||||||
|
{ E, G, O },
|
||||||
|
{ O, G, O },
|
||||||
|
{ E, M, U },
|
||||||
|
{O, M, U }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] participleEndings1 = {
|
||||||
|
{ SHCH },
|
||||||
|
{ E, M },
|
||||||
|
{ N, N },
|
||||||
|
{ V, SH },
|
||||||
|
{ IU, SHCH }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] participleEndings2 = {
|
||||||
|
{ I, V, SH },
|
||||||
|
{ Y, V, SH },
|
||||||
|
{ U, IU, SHCH }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] participle1Predessors = {
|
||||||
|
{ A },
|
||||||
|
{ IA }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] reflexiveEndings = {
|
||||||
|
{ S, IA },
|
||||||
|
{ S, SOFT }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] verbEndings1 = {
|
||||||
|
{ I_ },
|
||||||
|
{ L },
|
||||||
|
{ N },
|
||||||
|
{ L, O },
|
||||||
|
{ N, O },
|
||||||
|
{ E, T },
|
||||||
|
{ IU, T },
|
||||||
|
{ L, A },
|
||||||
|
{ N, A },
|
||||||
|
{ L, I },
|
||||||
|
{ E, M },
|
||||||
|
{ N, Y },
|
||||||
|
{ E, T, E },
|
||||||
|
{ I_, T, E },
|
||||||
|
{ T, SOFT },
|
||||||
|
{ E, SH, SOFT },
|
||||||
|
{ N, N, O }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] verbEndings2 = {
|
||||||
|
{ IU },
|
||||||
|
{ U, IU },
|
||||||
|
{ E, N },
|
||||||
|
{ E, I_ },
|
||||||
|
{ IA, T },
|
||||||
|
{ U, I_ },
|
||||||
|
{ I, L },
|
||||||
|
{ Y, L },
|
||||||
|
{ I, M },
|
||||||
|
{ Y, M },
|
||||||
|
{ I, T },
|
||||||
|
{ Y, T },
|
||||||
|
{ I, L, A },
|
||||||
|
{ Y, L, A },
|
||||||
|
{ E, N, A },
|
||||||
|
{ I, T, E },
|
||||||
|
{ I, L, I },
|
||||||
|
{ Y, L, I },
|
||||||
|
{ I, L, O },
|
||||||
|
{ Y, L, O },
|
||||||
|
{ E, N, O },
|
||||||
|
{ U, E, T },
|
||||||
|
{ U, IU, T },
|
||||||
|
{ E, N, Y },
|
||||||
|
{ I, T, SOFT },
|
||||||
|
{ Y, T, SOFT },
|
||||||
|
{ I, SH, SOFT },
|
||||||
|
{ E, I_, T, E },
|
||||||
|
{ U, I_, T, E }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] verb1Predessors = {
|
||||||
|
{ A },
|
||||||
|
{ IA }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] nounEndings = {
|
||||||
|
{ A },
|
||||||
|
{ U },
|
||||||
|
{ I_ },
|
||||||
|
{ O },
|
||||||
|
{ U },
|
||||||
|
{ E },
|
||||||
|
{ Y },
|
||||||
|
{ I },
|
||||||
|
{ SOFT },
|
||||||
|
{ IA },
|
||||||
|
{ E, V },
|
||||||
|
{ O, V },
|
||||||
|
{ I, E },
|
||||||
|
{ SOFT, E },
|
||||||
|
{ IA, X },
|
||||||
|
{ I, IU },
|
||||||
|
{ E, I },
|
||||||
|
{ I, I },
|
||||||
|
{ E, I_ },
|
||||||
|
{ O, I_ },
|
||||||
|
{ E, M },
|
||||||
|
{ A, M },
|
||||||
|
{ O, M },
|
||||||
|
{ A, X },
|
||||||
|
{ SOFT, IU },
|
||||||
|
{ I, IA },
|
||||||
|
{ SOFT, IA },
|
||||||
|
{ I, I_ },
|
||||||
|
{ IA, M },
|
||||||
|
{ IA, M, I },
|
||||||
|
{ A, M, I },
|
||||||
|
{ I, E, I_ },
|
||||||
|
{ I, IA, M },
|
||||||
|
{ I, E, M },
|
||||||
|
{ I, IA, X },
|
||||||
|
{ I, IA, M, I }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] superlativeEndings = {
|
||||||
|
{ E, I_, SH },
|
||||||
|
{ E, I_, SH, E }
|
||||||
|
};
|
||||||
|
|
||||||
|
private static char[][] derivationalEndings = {
|
||||||
|
{ O, S, T },
|
||||||
|
{ O, S, T, SOFT }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Builds a stemmer with no charset; setCharset() must be called
 * before stem() is used.
 */
public RussianStemmer()
{
    super();
}
|
||||||
|
|
||||||
|
/**
 * Builds a stemmer for the given charset (see RussianCharsets).
 */
public RussianStemmer(char[] charset)
{
    super();
    this.charset = charset;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjectival ending is an adjective ending,
|
||||||
|
* optionally preceded by participle ending.
|
||||||
|
* Creation date: (17/03/2002 12:14:58 AM)
|
||||||
|
* @param stemmingZone java.lang.StringBuffer
|
||||||
|
*/
|
||||||
|
private boolean adjectival(StringBuffer stemmingZone)
|
||||||
|
{
|
||||||
|
// look for adjective ending in a stemming zone
|
||||||
|
if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
|
||||||
|
return false;
|
||||||
|
// if adjective ending was found, try for participle ending.
|
||||||
|
// variable r is unused, we are just interested in the side effect of
|
||||||
|
// findAndRemoveEnding():
|
||||||
|
boolean r =
|
||||||
|
findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
|
||||||
|
||
|
||||||
|
findAndRemoveEnding(stemmingZone, participleEndings2);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Removes a derivational ending, but only when it lies inside the R2 region.
 * Creation date: (17/03/2002 12:14:58 AM)
 * @param stemmingZone java.lang.StringBuffer
 * @return true if a derivational ending was removed
 */
private boolean derivational(StringBuffer stemmingZone)
{
    int endingLength = findEnding(stemmingZone, derivationalEndings);
    if (endingLength == 0)
        // no derivational ending found
        return false;
    else
    {
        // Ensure that the ending locates in R2.
        // The stemming zone starts at RV, so the ending's absolute start is
        // RV + length() - endingLength; requiring that to be >= R2 is
        // equivalent to the comparison below.
        if (R2 - RV <= stemmingZone.length() - endingLength)
        {
            stemmingZone.setLength(stemmingZone.length() - endingLength);
            return true;
        }
        else
        {
            return false;
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Finds ending among given ending class and returns the length of ending found(0, if not found).
 * Endings are matched right-to-left against the stemming zone, with the last
 * matched character at startIndex. Tables list shorter endings first, so the
 * backwards iteration tries longer endings before shorter ones.
 * Creation date: (17/03/2002 8:18:34 PM)
 */
private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
{
    boolean match = false;
    for (int i = theEndingClass.length - 1; i >= 0; i--)
    {
        char[] theEnding = theEndingClass[i];
        // check if the ending is bigger than stemming zone
        if (startIndex < theEnding.length - 1)
        {
            match = false;
            continue;
        }
        match = true;
        int stemmingIndex = startIndex;
        // compare letter codes (translated through the charset) right-to-left
        for (int j = theEnding.length - 1; j >= 0; j--)
        {
            if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
            {
                match = false;
                break;
            }
        }
        // check if ending was found
        if (match)
        {
            return theEndingClass[i].length; // cut ending
        }
    }
    return 0;
}
|
||||||
|
|
||||||
|
/**
 * Convenience overload: searches for an ending at the very end of the
 * stemming zone.
 */
private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
    return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds the ending among the given class of endings and removes it from stemming zone.
|
||||||
|
* Creation date: (17/03/2002 8:18:34 PM)
|
||||||
|
*/
|
||||||
|
private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
|
||||||
|
{
|
||||||
|
int endingLength = findEnding(stemmingZone, theEndingClass);
|
||||||
|
if (endingLength == 0)
|
||||||
|
// not found
|
||||||
|
return false;
|
||||||
|
else {
|
||||||
|
stemmingZone.setLength(stemmingZone.length() - endingLength);
|
||||||
|
// cut the ending found
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Finds the ending among the given class of endings, then checks if this ending was
 * preceded by any of given predessors, and if so, removes it from stemming zone.
 * Note that only the ending itself is removed; the predecessor letters stay.
 * Creation date: (17/03/2002 8:18:34 PM)
 */
private boolean findAndRemoveEnding(StringBuffer stemmingZone,
    char[][] theEndingClass, char[][] thePredessors)
{
    int endingLength = findEnding(stemmingZone, theEndingClass);
    if (endingLength == 0)
        // not found
        return false;
    else
    {
        // look for a predecessor immediately before the matched ending
        int predessorLength =
            findEnding(stemmingZone,
                stemmingZone.length() - endingLength - 1,
                thePredessors);
        if (predessorLength == 0)
            return false;
        else {
            stemmingZone.setLength(stemmingZone.length() - endingLength);
            // cut the ending found
            return true;
        }
    }

}
|
||||||
|
|
||||||
|
/**
 * Marks positions of RV, R1 and R2 in a given word.
 * Regions that turn out empty keep their 0 value. Note that the ++i inside
 * each boundary check both tests and advances past the character just
 * scanned, so every region starts immediately after its delimiting letter.
 * Creation date: (16/03/2002 3:40:11 PM)
 */
private void markPositions(String word)
{
    RV = 0;
    R1 = 0;
    R2 = 0;
    int i = 0;
    // find RV: skip leading consonants; RV starts right after the first vowel
    while (word.length() > i && !isVowel(word.charAt(i)))
    {
        i++;
    }
    if (word.length() - 1 < ++i)
        return; // RV zone is empty
    RV = i;
    // find R1: skip the vowel run; R1 starts after the following consonant
    while (word.length() > i && isVowel(word.charAt(i)))
    {
        i++;
    }
    if (word.length() - 1 < ++i)
        return; // R1 zone is empty
    R1 = i;
    // find R2: repeat the consonant/vowel scan within R1
    while (word.length() > i && !isVowel(word.charAt(i)))
    {
        i++;
    }
    if (word.length() - 1 < ++i)
        return; // R2 zone is empty
    while (word.length() > i && isVowel(word.charAt(i)))
    {
        i++;
    }
    if (word.length() - 1 < ++i)
        return; // R2 zone is empty
    R2 = i;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if character is a vowel..
|
||||||
|
* Creation date: (16/03/2002 10:47:03 PM)
|
||||||
|
* @return boolean
|
||||||
|
* @param letter char
|
||||||
|
*/
|
||||||
|
private boolean isVowel(char letter)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < vowels.length; i++)
|
||||||
|
{
|
||||||
|
if (letter == charset[vowels[i]])
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Removes a noun ending, if present.
 * Creation date: (17/03/2002 12:14:58 AM)
 * @param stemmingZone java.lang.StringBuffer
 * @return true if a noun ending was removed
 */
private boolean noun(StringBuffer stemmingZone)
{
    return findAndRemoveEnding(stemmingZone, nounEndings);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perfective gerund endings.
|
||||||
|
* Creation date: (17/03/2002 12:14:58 AM)
|
||||||
|
* @param stemmingZone java.lang.StringBuffer
|
||||||
|
*/
|
||||||
|
private boolean perfectiveGerund(StringBuffer stemmingZone)
|
||||||
|
{
|
||||||
|
return findAndRemoveEnding(
|
||||||
|
stemmingZone,
|
||||||
|
perfectiveGerundEndings1,
|
||||||
|
perfectiveGerund1Predessors)
|
||||||
|
|| findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Removes a reflexive ending, if present.
 * Creation date: (17/03/2002 12:14:58 AM)
 * @param stemmingZone java.lang.StringBuffer
 * @return true if a reflexive ending was removed
 */
private boolean reflexive(StringBuffer stemmingZone)
{
    return findAndRemoveEnding(stemmingZone, reflexiveEndings);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Insert the method's description here.
|
||||||
|
* Creation date: (17/03/2002 12:14:58 AM)
|
||||||
|
* @param stemmingZone java.lang.StringBuffer
|
||||||
|
*/
|
||||||
|
private boolean removeI(StringBuffer stemmingZone)
|
||||||
|
{
|
||||||
|
if (stemmingZone.length() > 0
|
||||||
|
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
|
||||||
|
{
|
||||||
|
stemmingZone.setLength(stemmingZone.length() - 1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Insert the method's description here.
|
||||||
|
* Creation date: (17/03/2002 12:14:58 AM)
|
||||||
|
* @param stemmingZone java.lang.StringBuffer
|
||||||
|
*/
|
||||||
|
private boolean removeSoft(StringBuffer stemmingZone)
|
||||||
|
{
|
||||||
|
if (stemmingZone.length() > 0
|
||||||
|
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
|
||||||
|
{
|
||||||
|
stemmingZone.setLength(stemmingZone.length() - 1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Replaces the charset used to translate letter codes.
 * Creation date: (16/03/2002 10:58:42 PM)
 * @param newCharset char[]
 */
public void setCharset(char[] newCharset)
{
    charset = newCharset;
}
|
||||||
|
|
||||||
|
/**
 * Finds the stem for given Russian word.
 * The input is expected to be lower-cased in the active charset already
 * (see RussianStemFilter).
 * Creation date: (16/03/2002 3:36:48 PM)
 * @return java.lang.String
 * @param input java.lang.String
 */
public String stem(String input)
{
    markPositions(input);
    if (RV == 0)
        return input; //RV wasn't detected, nothing to stem
    StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
    // stemming goes on in RV
    // Step 1
    if (!perfectiveGerund(stemmingZone))
    {
        reflexive(stemmingZone);
        // variable r is unused, we are just interested in the flow that gets
        // created by logical expression: apply adjectival(); if that fails,
        // apply verb() etc
        boolean r =
            adjectival(stemmingZone)
            || verb(stemmingZone)
            || noun(stemmingZone);
    }
    // Step 2
    removeI(stemmingZone);
    // Step 3
    derivational(stemmingZone);
    // Step 4
    superlative(stemmingZone);
    undoubleN(stemmingZone);
    removeSoft(stemmingZone);
    // return result: the untouched prefix plus the stemmed zone
    return input.substring(0, RV) + stemmingZone.toString();
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Superlative endings.
|
||||||
|
* Creation date: (17/03/2002 12:14:58 AM)
|
||||||
|
* @param stemmingZone java.lang.StringBuffer
|
||||||
|
*/
|
||||||
|
private boolean superlative(StringBuffer stemmingZone)
|
||||||
|
{
|
||||||
|
return findAndRemoveEnding(stemmingZone, superlativeEndings);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Undoubles N.
|
||||||
|
* Creation date: (17/03/2002 12:14:58 AM)
|
||||||
|
* @param stemmingZone java.lang.StringBuffer
|
||||||
|
*/
|
||||||
|
private boolean undoubleN(StringBuffer stemmingZone)
|
||||||
|
{
|
||||||
|
char[][] doubleN = {
|
||||||
|
{ N, N }
|
||||||
|
};
|
||||||
|
if (findEnding(stemmingZone, doubleN) != 0)
|
||||||
|
{
|
||||||
|
stemmingZone.setLength(stemmingZone.length() - 1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verb endings.
|
||||||
|
* Creation date: (17/03/2002 12:14:58 AM)
|
||||||
|
* @param stemmingZone java.lang.StringBuffer
|
||||||
|
*/
|
||||||
|
private boolean verb(StringBuffer stemmingZone)
|
||||||
|
{
|
||||||
|
return findAndRemoveEnding(
|
||||||
|
stemmingZone,
|
||||||
|
verbEndings1,
|
||||||
|
verb1Predessors)
|
||||||
|
|| findAndRemoveEnding(stemmingZone, verbEndings2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Static method for stemming with different charsets
|
||||||
|
*/
|
||||||
|
public static String stem(String theWord, char[] charset)
|
||||||
|
{
|
||||||
|
RussianStemmer stemmer = new RussianStemmer();
|
||||||
|
stemmer.setCharset(charset);
|
||||||
|
return stemmer.stem(theWord);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,5 @@
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
Support for indexing and searching Russian text.
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,78 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the German stemmer. The stemming algorithm is known to work less
|
||||||
|
* than perfect, as it doesn't use any word lists with exceptions. We
|
||||||
|
* also check some of the cases where the algorithm is wrong.
|
||||||
|
*
|
||||||
|
* @author Daniel Naber
|
||||||
|
*/
|
||||||
|
public class TestGermanStemFilter extends TestCase {
|
||||||
|
|
||||||
|
public void testStemming() {
|
||||||
|
try {
|
||||||
|
// read test cases from external file:
|
||||||
|
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||||
|
File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
|
||||||
|
FileInputStream fis = new FileInputStream(testFile);
|
||||||
|
InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1");
|
||||||
|
BufferedReader breader = new BufferedReader(isr);
|
||||||
|
while(true) {
|
||||||
|
String line = breader.readLine();
|
||||||
|
if (line == null)
|
||||||
|
break;
|
||||||
|
line = line.trim();
|
||||||
|
if (line.startsWith("#") || line.equals(""))
|
||||||
|
continue; // ignore comments and empty lines
|
||||||
|
String[] parts = line.split(";");
|
||||||
|
//System.out.println(parts[0] + " -- " + parts[1]);
|
||||||
|
check(parts[0], parts[1]);
|
||||||
|
}
|
||||||
|
breader.close();
|
||||||
|
isr.close();
|
||||||
|
fis.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
fail();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void check(final String input, final String expected) throws IOException {
|
||||||
|
StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
|
||||||
|
GermanStemFilter filter = new GermanStemFilter(tokenStream);
|
||||||
|
Token t = filter.next();
|
||||||
|
if (t == null)
|
||||||
|
fail();
|
||||||
|
assertEquals(expected, t.termText());
|
||||||
|
filter.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
# German special characters are replaced:
|
||||||
|
häufig;haufig
|
||||||
|
|
||||||
|
# here the stemmer works okay, it maps related words to the same stem:
|
||||||
|
abschließen;abschliess
|
||||||
|
abschließender;abschliess
|
||||||
|
abschließendes;abschliess
|
||||||
|
abschließenden;abschliess
|
||||||
|
|
||||||
|
Tisch;tisch
|
||||||
|
Tische;tisch
|
||||||
|
Tischen;tisch
|
||||||
|
|
||||||
|
Haus;hau
|
||||||
|
Hauses;hau
|
||||||
|
Häuser;hau
|
||||||
|
Häusern;hau
|
||||||
|
# here's a case where overstemming occurs, i.e. a word is
|
||||||
|
# mapped to the same stem as unrelated words:
|
||||||
|
hauen;hau
|
||||||
|
|
||||||
|
# here's a case where understemming occurs, i.e. two related words
|
||||||
|
# are not mapped to the same stem. This is the case with basically
|
||||||
|
# all irregular forms:
|
||||||
|
Drama;drama
|
||||||
|
Dramen;dram
|
||||||
|
|
||||||
|
# replace "ß" with 'ss':
|
||||||
|
Ausmaß;ausmass
|
||||||
|
|
||||||
|
# fake words to test if suffixes are cut off:
|
||||||
|
xxxxxe;xxxxx
|
||||||
|
xxxxxs;xxxxx
|
||||||
|
xxxxxn;xxxxx
|
||||||
|
xxxxxt;xxxxx
|
||||||
|
xxxxxem;xxxxx
|
||||||
|
xxxxxer;xxxxx
|
||||||
|
xxxxxnd;xxxxx
|
||||||
|
# the suffixes are also removed when combined:
|
||||||
|
xxxxxetende;xxxxx
|
||||||
|
|
||||||
|
# words that are shorter than four characters are not changed:
|
||||||
|
xxe;xxe
|
||||||
|
# -em and -er are not removed from words shorter than five characters:
|
||||||
|
xxem;xxem
|
||||||
|
xxer;xxer
|
||||||
|
# -nd is not removed from words shorter than six characters:
|
||||||
|
xxxnd;xxxnd
|
|
@ -0,0 +1,170 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test case for RussianAnalyzer.
|
||||||
|
*
|
||||||
|
* @author Boris Okner
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TestRussianAnalyzer extends TestCase
|
||||||
|
{
|
||||||
|
private InputStreamReader inWords;
|
||||||
|
|
||||||
|
private InputStreamReader sampleUnicode;
|
||||||
|
|
||||||
|
private Reader inWordsKOI8;
|
||||||
|
|
||||||
|
private Reader sampleKOI8;
|
||||||
|
|
||||||
|
private Reader inWords1251;
|
||||||
|
|
||||||
|
private Reader sample1251;
|
||||||
|
|
||||||
|
private File dataDir;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception
|
||||||
|
{
|
||||||
|
dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testUnicode() throws IOException
|
||||||
|
{
|
||||||
|
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
|
||||||
|
inWords =
|
||||||
|
new InputStreamReader(
|
||||||
|
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUnicode.txt")),
|
||||||
|
"Unicode");
|
||||||
|
|
||||||
|
sampleUnicode =
|
||||||
|
new InputStreamReader(
|
||||||
|
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUnicode.htm")),
|
||||||
|
"Unicode");
|
||||||
|
|
||||||
|
TokenStream in = ra.tokenStream("all", inWords);
|
||||||
|
|
||||||
|
RussianLetterTokenizer sample =
|
||||||
|
new RussianLetterTokenizer(
|
||||||
|
sampleUnicode,
|
||||||
|
RussianCharsets.UnicodeRussian);
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
Token token = in.next();
|
||||||
|
|
||||||
|
if (token == null)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Token sampleToken = sample.next();
|
||||||
|
assertEquals(
|
||||||
|
"Unicode",
|
||||||
|
token.termText(),
|
||||||
|
sampleToken == null
|
||||||
|
? null
|
||||||
|
: sampleToken.termText());
|
||||||
|
}
|
||||||
|
|
||||||
|
inWords.close();
|
||||||
|
sampleUnicode.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testKOI8() throws IOException
|
||||||
|
{
|
||||||
|
//System.out.println(new java.util.Date());
|
||||||
|
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
|
||||||
|
// KOI8
|
||||||
|
inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
|
||||||
|
|
||||||
|
sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
|
||||||
|
|
||||||
|
TokenStream in = ra.tokenStream("all", inWordsKOI8);
|
||||||
|
RussianLetterTokenizer sample =
|
||||||
|
new RussianLetterTokenizer(
|
||||||
|
sampleKOI8,
|
||||||
|
RussianCharsets.KOI8);
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
Token token = in.next();
|
||||||
|
|
||||||
|
if (token == null)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Token sampleToken = sample.next();
|
||||||
|
assertEquals(
|
||||||
|
"KOI8",
|
||||||
|
token.termText(),
|
||||||
|
sampleToken == null
|
||||||
|
? null
|
||||||
|
: sampleToken.termText());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
inWordsKOI8.close();
|
||||||
|
sampleKOI8.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test1251() throws IOException
|
||||||
|
{
|
||||||
|
// 1251
|
||||||
|
inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
|
||||||
|
|
||||||
|
sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
|
||||||
|
|
||||||
|
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
|
||||||
|
TokenStream in = ra.tokenStream("", inWords1251);
|
||||||
|
RussianLetterTokenizer sample =
|
||||||
|
new RussianLetterTokenizer(
|
||||||
|
sample1251,
|
||||||
|
RussianCharsets.CP1251);
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
Token token = in.next();
|
||||||
|
|
||||||
|
if (token == null)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Token sampleToken = sample.next();
|
||||||
|
assertEquals(
|
||||||
|
"1251",
|
||||||
|
token.termText(),
|
||||||
|
sampleToken == null
|
||||||
|
? null
|
||||||
|
: sampleToken.termText());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
inWords1251.close();
|
||||||
|
sample1251.close();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,94 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public class TestRussianStem extends TestCase
|
||||||
|
{
|
||||||
|
private ArrayList words = new ArrayList();
|
||||||
|
private ArrayList stems = new ArrayList();
|
||||||
|
|
||||||
|
public TestRussianStem(String name)
|
||||||
|
{
|
||||||
|
super(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see TestCase#setUp()
|
||||||
|
*/
|
||||||
|
protected void setUp() throws Exception
|
||||||
|
{
|
||||||
|
super.setUp();
|
||||||
|
//System.out.println(new java.util.Date());
|
||||||
|
String str;
|
||||||
|
|
||||||
|
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||||
|
|
||||||
|
// open and read words into an array list
|
||||||
|
BufferedReader inWords =
|
||||||
|
new BufferedReader(
|
||||||
|
new InputStreamReader(
|
||||||
|
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/wordsUnicode.txt")),
|
||||||
|
"Unicode"));
|
||||||
|
while ((str = inWords.readLine()) != null)
|
||||||
|
{
|
||||||
|
words.add(str);
|
||||||
|
}
|
||||||
|
inWords.close();
|
||||||
|
|
||||||
|
// open and read stems into an array list
|
||||||
|
BufferedReader inStems =
|
||||||
|
new BufferedReader(
|
||||||
|
new InputStreamReader(
|
||||||
|
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/stemsUnicode.txt")),
|
||||||
|
"Unicode"));
|
||||||
|
while ((str = inStems.readLine()) != null)
|
||||||
|
{
|
||||||
|
stems.add(str);
|
||||||
|
}
|
||||||
|
inStems.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see TestCase#tearDown()
|
||||||
|
*/
|
||||||
|
protected void tearDown() throws Exception
|
||||||
|
{
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStem()
|
||||||
|
{
|
||||||
|
for (int i = 0; i < words.size(); i++)
|
||||||
|
{
|
||||||
|
//if ( (i % 100) == 0 ) System.err.println(i);
|
||||||
|
String realStem =
|
||||||
|
RussianStemmer.stem(
|
||||||
|
(String) words.get(i),
|
||||||
|
RussianCharsets.UnicodeRussian);
|
||||||
|
assertEquals("unicode", stems.get(i), realStem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1 @@
|
||||||
|
[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]
|
|
@ -0,0 +1 @@
|
||||||
|
[淄庞註[由蘛[芴潘砸贤燎紊晕][芪乓荾[赏盘][幸拍釉磷膛蝅[铀林][忠琶][囊抛蝅[徘尚註[谖廖][纫廖][粤饰][遮薦[艘涨][邢幼演盼][子阉][滓磐盼][咨韵薦[幸晌][酉耛[蜗譣[耘任咸锨][恿蚞[呐蘛[伊铀屹琢][邢粤盼][谖廖][幸胖蝅[着薦[窍紫襗[蜗譣[晌葡彝撩][釉廖献][南釉招蝅[凵蚁薦[艘涨][邢特谙琢耘蘛[耘萞[犹辙羃[酉谖廖][下菖釉譣[窍韵譣[紫有疑窝][紫有咸刳献羃
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,2 @@
|
||||||
|
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
|
||||||
|
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.
|
|
@ -0,0 +1,2 @@
|
||||||
|
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
|
||||||
|
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue