LUCENE-1936: Remove deprecated charset support from Greek and Russian analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@820756 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-10-01 19:20:09 +00:00
parent c1f5e753d7
commit dd9c1b0101
16 changed files with 168 additions and 1422 deletions

View File

@ -6,6 +6,10 @@ Changes in runtime behavior
API Changes API Changes
* LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms
text exactly the same as LowerCaseFilter. Please use LowerCaseFilter
instead, which has the same functionality. (Robert Muir)
Bug fixes Bug fixes
* LUCENE-1781: Fixed various issues with the lat/lng bounding box * LUCENE-1781: Fixed various issues with the lat/lng bounding box

View File

@ -39,111 +39,19 @@ import java.util.Set;
*/ */
public final class GreekAnalyzer extends Analyzer public final class GreekAnalyzer extends Analyzer
{ {
// the letters are indexes to the charset array (see GreekCharsets.java)
private static char A = 6;
private static char B = 7;
private static char G = 8;
private static char D = 9;
private static char E = 10;
private static char Z = 11;
private static char H = 12;
private static char TH = 13;
private static char I = 14;
private static char K = 15;
private static char L = 16;
private static char M = 17;
private static char N = 18;
private static char KS = 19;
private static char O = 20;
private static char P = 21;
private static char R = 22;
private static char S = 24; // skip final sigma
private static char T = 25;
private static char Y = 26;
private static char F = 27;
private static char X = 28;
private static char PS = 29;
private static char W = 30;
/** /**
* List of typical Greek stopwords. * List of typical Greek stopwords.
*/ */
private static char[][] GREEK_STOP_WORDS = { private static final String[] GREEK_STOP_WORDS = {
{O}, "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και",
{H}, "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
{T, O}, "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
{O, I}, "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
{T, A}, "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
{T, O, Y}, "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
{T, H, S}, "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
{T, W, N}, "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
{T, O, N}, "ισωσ", "οσο", "οτι"
{T, H, N},
{K, A, I},
{K, I},
{K},
{E, I, M, A, I},
{E, I, S, A, I},
{E, I, N, A, I},
{E, I, M, A, S, T, E},
{E, I, S, T, E},
{S, T, O},
{S, T, O, N},
{S, T, H},
{S, T, H, N},
{M, A},
{A, L, L, A},
{A, P, O},
{G, I, A},
{P, R, O, S},
{M, E},
{S, E},
{W, S},
{P, A, R, A},
{A, N, T, I},
{K, A, T, A},
{M, E, T, A},
{TH, A},
{N, A},
{D, E},
{D, E, N},
{M, H},
{M, H, N},
{E, P, I},
{E, N, W},
{E, A, N},
{A, N},
{T, O, T, E},
{P, O, Y},
{P, W, S},
{P, O, I, O, S},
{P, O, I, A},
{P, O, I, O},
{P, O, I, O, I},
{P, O, I, E, S},
{P, O, I, W, N},
{P, O, I, O, Y, S},
{A, Y, T, O, S},
{A, Y, T, H},
{A, Y, T, O},
{A, Y, T, O, I},
{A, Y, T, W, N},
{A, Y, T, O, Y, S},
{A, Y, T, E, S},
{A, Y, T, A},
{E, K, E, I, N, O, S},
{E, K, E, I, N, H},
{E, K, E, I, N, O},
{E, K, E, I, N, O, I},
{E, K, E, I, N, E, S},
{E, K, E, I, N, A},
{E, K, E, I, N, W, N},
{E, K, E, I, N, O, Y, S},
{O, P, W, S},
{O, M, W, S},
{I, S, W, S},
{O, S, O},
{O, T, I}
}; };
/** /**
@ -151,28 +59,8 @@ public final class GreekAnalyzer extends Analyzer
*/ */
private Set stopSet = new HashSet(); private Set stopSet = new HashSet();
/**
* Charset for Greek letters.
* Represents encoding for 24 lowercase Greek letters.
* Predefined charsets can be taken from {@link GreekCharsets} class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
public GreekAnalyzer() { public GreekAnalyzer() {
charset = GreekCharsets.UnicodeGreek; this(GREEK_STOP_WORDS);
stopSet = StopFilter.makeStopSet(
makeStopWords(GreekCharsets.UnicodeGreek));
}
/**
* Builds an analyzer.
* @deprecated Use {@link #GreekAnalyzer()} instead.
*/
public GreekAnalyzer(char[] charset)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
} }
/** /**
@ -181,58 +69,16 @@ public final class GreekAnalyzer extends Analyzer
*/ */
public GreekAnalyzer(String [] stopwords) public GreekAnalyzer(String [] stopwords)
{ {
charset = GreekCharsets.UnicodeGreek; super();
stopSet = StopFilter.makeStopSet(stopwords); stopSet = StopFilter.makeStopSet(stopwords);
} }
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(String[])} instead.
*/
public GreekAnalyzer(char[] charset, String[] stopwords)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(stopwords);
}
/**
* Takes greek stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset)
{
String[] res = new String[GREEK_STOP_WORDS.length];
for (int i = 0; i < res.length; i++)
{
char[] theStopWord = GREEK_STOP_WORDS[i];
// translate the word,using the charset
StringBuffer theWord = new StringBuffer();
for (int j = 0; j < theStopWord.length; j++)
{
theWord.append(charset[theStopWord[j]]);
}
res[i] = theWord.toString();
}
return res;
}
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(Map)} instead.
*/
public GreekAnalyzer(char[] charset, Map stopwords)
{
this.charset = charset;
stopSet = new HashSet(stopwords.keySet());
}
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
*/ */
public GreekAnalyzer(Map stopwords) public GreekAnalyzer(Map stopwords)
{ {
charset = GreekCharsets.UnicodeGreek; super();
stopSet = new HashSet(stopwords.keySet()); stopSet = new HashSet(stopwords.keySet());
} }
@ -245,7 +91,7 @@ public final class GreekAnalyzer extends Analyzer
public TokenStream tokenStream(String fieldName, Reader reader) public TokenStream tokenStream(String fieldName, Reader reader)
{ {
TokenStream result = new StandardTokenizer(reader); TokenStream result = new StandardTokenizer(reader);
result = new GreekLowerCaseFilter(result, charset); result = new GreekLowerCaseFilter(result);
result = new StopFilter(result, stopSet); result = new StopFilter(result, stopSet);
return result; return result;
} }
@ -268,7 +114,7 @@ public final class GreekAnalyzer extends Analyzer
if (streams == null) { if (streams == null) {
streams = new SavedStreams(); streams = new SavedStreams();
streams.source = new StandardTokenizer(reader); streams.source = new StandardTokenizer(reader);
streams.result = new GreekLowerCaseFilter(streams.source, charset); streams.result = new GreekLowerCaseFilter(streams.source);
streams.result = new StopFilter(streams.result, stopSet); streams.result = new StopFilter(streams.result, stopSet);
setPreviousTokenStream(streams); setPreviousTokenStream(streams);
} else { } else {

View File

@ -1,482 +0,0 @@
package org.apache.lucene.analysis.el;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
* for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
* <p>
* Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68) characters,
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
* the definition of a new charset as well as the required logic in the toLowerCase() method.
* </p>
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
public class GreekCharsets
{
  /*
   * Layout shared by all three tables below: positions 0-35 hold the lower
   * case letters (accented forms included), positions 36-68 the upper case
   * letters. The tables are compared by identity in toLowerCase(), so
   * callers must pass one of these exact array instances.
   */

  // Unicode Greek charset
  public static char[] UnicodeGreek = {
    // lower case
    '\u0390', '\u03AC', '\u03AD', '\u03AE', '\u03AF', '\u03B0',
    '\u03B1', '\u03B2', '\u03B3', '\u03B4', '\u03B5', '\u03B6', '\u03B7', '\u03B8',
    '\u03B9', '\u03BA', '\u03BB', '\u03BC', '\u03BD', '\u03BE', '\u03BF', '\u03C0',
    '\u03C1', '\u03C2', '\u03C3', '\u03C4', '\u03C5', '\u03C6', '\u03C7', '\u03C8',
    '\u03C9', '\u03CA', '\u03CB', '\u03CC', '\u03CD', '\u03CE',
    // upper case
    '\u0386', '\u0388', '\u0389', '\u038A', '\u038C', '\u038E', '\u038F',
    '\u0391', '\u0392', '\u0393', '\u0394', '\u0395', '\u0396', '\u0397', '\u0398',
    '\u0399', '\u039A', '\u039B', '\u039C', '\u039D', '\u039E', '\u039F', '\u03A0',
    '\u03A1', '\u03A3', '\u03A4', '\u03A5', '\u03A6', '\u03A7', '\u03A8', '\u03A9',
    '\u03AA', '\u03AB'
  };

  // ISO-8859-7 charset (ELOT-928)
  public static char[] ISO = {
    // lower case
    0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
    0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,
    0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
    // upper case
    0xb6, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf,
    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
    0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
    0xd1, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
    0xda, 0xdb
  };

  // CP1253 charset (differs from ISO only in capital alpha with tonos: 0xa2)
  public static char[] CP1253 = {
    // lower case
    0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
    0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,
    0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
    // upper case
    0xa2, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf,
    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
    0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
    0xd1, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
    0xda, 0xdb
  };

  /**
   * Folds a Greek letter to its lower case, unaccented base form in the
   * encoding identified by <code>charset</code>. Small final sigma is folded
   * to small sigma.
   *
   * @param letter the character to fold, expressed in the given encoding
   * @param charset one of {@link #UnicodeGreek}, {@link #ISO} or
   *        {@link #CP1253}; the table is recognised by reference identity
   * @return the folded character in the same encoding; for any other table,
   *         or for a character without a Greek mapping, the result of
   *         {@link Character#toLowerCase(char)}
   */
  public static char toLowerCase(char letter, char[] charset)
  {
    if (charset == UnicodeGreek) {
      return unicodeToLowerCase(letter);
    }
    if (charset == ISO) {
      return singleByteToLowerCase(letter, (char) 0xb6);
    }
    if (charset == CP1253) {
      return singleByteToLowerCase(letter, (char) 0xa2);
    }
    return Character.toLowerCase(letter);
  }

  /** Folds a character of the Unicode Greek block (see {@link #toLowerCase}). */
  private static char unicodeToLowerCase(char letter)
  {
    // Lower case, not accented: only final sigma needs to change.
    if (letter >= '\u03B1' && letter <= '\u03C9') {
      return letter == '\u03C2' ? '\u03C3' : letter;
    }
    // Upper case, not accented. The range also covers the reserved slot
    // U+03A2, which therefore maps to small final sigma; kept as-is because
    // existing indexes were built with this behaviour.
    if (letter >= '\u0391' && letter <= '\u03A9') {
      return (char) (letter + 32);
    }
    switch (letter) {
      case '\u03AC': // small alpha with tonos
      case '\u0386': // capital alpha with tonos
        return '\u03B1';
      case '\u03AD': // small epsilon with tonos
      case '\u0388': // capital epsilon with tonos
        return '\u03B5';
      case '\u03AE': // small eta with tonos
      case '\u0389': // capital eta with tonos
        return '\u03B7';
      case '\u03AF': // small iota with tonos
      case '\u03CA': // small iota with dialytika
      case '\u0390': // small iota with dialytika and tonos
      case '\u038A': // capital iota with tonos
      case '\u03AA': // capital iota with dialytika
        return '\u03B9';
      case '\u03CD': // small upsilon with tonos
      case '\u03CB': // small upsilon with dialytika
      case '\u03B0': // small upsilon with dialytika and tonos
      case '\u038E': // capital upsilon with tonos
      case '\u03AB': // capital upsilon with dialytika
        return '\u03C5';
      case '\u03CC': // small omicron with tonos
      case '\u038C': // capital omicron with tonos
        return '\u03BF';
      case '\u03CE': // small omega with tonos
      case '\u038F': // capital omega with tonos
        return '\u03C9';
      default:
        return Character.toLowerCase(letter);
    }
  }

  /**
   * Folds a character of ISO-8859-7 or CP1253. The two encodings share every
   * Greek position except capital alpha with tonos, which the caller supplies.
   *
   * @param letter the character to fold
   * @param capitalAlphaTonos the encoding's code for capital alpha with tonos
   *        (0xb6 in ISO-8859-7, 0xa2 in CP1253)
   */
  private static char singleByteToLowerCase(char letter, char capitalAlphaTonos)
  {
    // Lower case, not accented: only final sigma (0xf2) needs to change.
    if (letter >= 0xe1 && letter <= 0xf9) {
      return letter == 0xf2 ? (char) 0xf3 : letter;
    }
    // Upper case, not accented (the unassigned slot 0xd2 is shifted as well,
    // matching the historical behaviour).
    if (letter >= 0xc1 && letter <= 0xd9) {
      return (char) (letter + 32);
    }
    if (letter == capitalAlphaTonos) {
      return 0xe1;
    }
    switch (letter) {
      case 0xdc: // small alpha with tonos
        return 0xe1;
      case 0xdd: // small epsilon with tonos
      case 0xb8: // capital epsilon with tonos
        return 0xe5;
      case 0xde: // small eta with tonos
      case 0xb9: // capital eta with tonos
        return 0xe7;
      case 0xdf: // small iota with tonos
      case 0xfa: // small iota with dialytika
      case 0xc0: // small iota with dialytika and tonos
      case 0xba: // capital iota with tonos
      case 0xda: // capital iota with dialytika
        // Must stay in the single-byte encoding: 0xe9 is small iota here.
        // (Previously the small accented forms returned the Unicode code
        // point U+03B9, which is not a valid character of these charsets.)
        return 0xe9;
      case 0xfd: // small upsilon with tonos
      case 0xfb: // small upsilon with dialytika
      case 0xe0: // small upsilon with dialytika and tonos
      case 0xbe: // capital upsilon with tonos
      case 0xdb: // capital upsilon with dialytika
        return 0xf5;
      case 0xfc: // small omicron with tonos
      case 0xbc: // capital omicron with tonos
        return 0xef;
      case 0xfe: // small omega with tonos
      case 0xbf: // capital omega with tonos
        return 0xf9;
      default:
        return Character.toLowerCase(letter);
    }
  }
}

View File

@ -23,44 +23,93 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** /**
* Normalizes token text to lower case, analyzing given ("greek") charset. * Normalizes token text to lower case, removes some Greek diacritics,
* and standardizes final sigma to sigma.
* *
*/ */
public final class GreekLowerCaseFilter extends TokenFilter public final class GreekLowerCaseFilter extends TokenFilter
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset;
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
*/
public GreekLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
termAtt = addAttribute(TermAttribute.class);
}
public GreekLowerCaseFilter(TokenStream in) public GreekLowerCaseFilter(TokenStream in)
{ {
this(in, GreekCharsets.UnicodeGreek); super(in);
termAtt = addAttribute(TermAttribute.class);
} }
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
if (input.incrementToken()) { if (input.incrementToken()) {
char[] chArray = termAtt.termBuffer(); char[] chArray = termAtt.termBuffer();
int chLen = termAtt.termLength(); int chLen = termAtt.termLength();
// TODO: iterate codepoints to support supp. characters
for (int i = 0; i < chLen; i++) for (int i = 0; i < chLen; i++)
{ {
chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset); chArray[i] = (char) lowerCase(chArray[i]);
} }
return true; return true;
} else { } else {
return false; return false;
} }
} }
/**
 * Maps one code point to the form this filter emits: lower case, with the
 * Greek tonos/dialytika marks listed below stripped and small final sigma
 * rewritten as small sigma. Anything not listed is handed to
 * {@link Character#toLowerCase(int)}.
 *
 * @param codepoint the code point to normalize
 * @return the normalized code point
 */
private int lowerCase(int codepoint) {
  final int normalized;
  switch (codepoint) {
    /* Sigma has two lower case shapes, U+03C2 (word-final) and U+03C3
     * (everywhere else); both are emitted as U+03C3.
     */
    case '\u03C2': /* small final sigma */
      normalized = '\u03C3'; /* small sigma */
      break;

    /* Accented letters, either case, collapse to the bare small letter. */
    case '\u0386': /* capital alpha with tonos */
    case '\u03AC': /* small alpha with tonos */
      normalized = '\u03B1'; /* small alpha */
      break;

    case '\u0388': /* capital epsilon with tonos */
    case '\u03AD': /* small epsilon with tonos */
      normalized = '\u03B5'; /* small epsilon */
      break;

    case '\u0389': /* capital eta with tonos */
    case '\u03AE': /* small eta with tonos */
      normalized = '\u03B7'; /* small eta */
      break;

    case '\u038A': /* capital iota with tonos */
    case '\u03AA': /* capital iota with dialytika */
    case '\u03AF': /* small iota with tonos */
    case '\u03CA': /* small iota with dialytika */
    case '\u0390': /* small iota with dialytika and tonos */
      normalized = '\u03B9'; /* small iota */
      break;

    case '\u038E': /* capital upsilon with tonos */
    case '\u03AB': /* capital upsilon with dialytika */
    case '\u03CD': /* small upsilon with tonos */
    case '\u03CB': /* small upsilon with dialytika */
    case '\u03B0': /* small upsilon with dialytika and tonos */
      normalized = '\u03C5'; /* small upsilon */
      break;

    case '\u038C': /* capital omicron with tonos */
    case '\u03CC': /* small omicron with tonos */
      normalized = '\u03BF'; /* small omicron */
      break;

    case '\u038F': /* capital omega with tonos */
    case '\u03CE': /* small omega with tonos */
      normalized = '\u03C9'; /* small omega */
      break;

    /* Carried over from the previous implementation so that terms in
     * existing indexes still match.
     */
    case '\u03A2': /* reserved */
      normalized = '\u03C2'; /* small final sigma */
      break;

    default:
      normalized = Character.toLowerCase(codepoint);
      break;
  }
  return normalized;
}
} }

View File

@ -24,6 +24,7 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
@ -40,145 +41,20 @@ import org.apache.lucene.analysis.Tokenizer;
*/ */
public final class RussianAnalyzer extends Analyzer public final class RussianAnalyzer extends Analyzer
{ {
// letters (currently unused letters are commented out)
private final static char A = 0;
private final static char B = 1;
private final static char V = 2;
private final static char G = 3;
private final static char D = 4;
private final static char E = 5;
private final static char ZH = 6;
private final static char Z = 7;
private final static char I = 8;
private final static char I_ = 9;
private final static char K = 10;
private final static char L = 11;
private final static char M = 12;
private final static char N = 13;
private final static char O = 14;
private final static char P = 15;
private final static char R = 16;
private final static char S = 17;
private final static char T = 18;
private final static char U = 19;
//private final static char F = 20;
private final static char X = 21;
//private final static char TS = 22;
private final static char CH = 23;
private final static char SH = 24;
private final static char SHCH = 25;
//private final static char HARD = 26;
private final static char Y = 27;
private final static char SOFT = 28;
private final static char AE = 29;
private final static char IU = 30;
private final static char IA = 31;
/** /**
* List of typical Russian stopwords. * List of typical Russian stopwords.
*/ */
private static char[][] RUSSIAN_STOP_WORDS = { private static final String[] RUSSIAN_STOP_WORDS = {
{A}, "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
{B, E, Z}, "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
{B, O, L, E, E}, "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
{B, Y}, "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
{B, Y, L}, "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо",
{B, Y, L, A}, "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об",
{B, Y, L, I}, "однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при",
{B, Y, L, O}, "с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того",
{B, Y, T, SOFT}, "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
{V}, "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
{V, A, M},
{V, A, S},
{V, E, S, SOFT},
{V, O},
{V, O, T},
{V, S, E},
{V, S, E, G, O},
{V, S, E, X},
{V, Y},
{G, D, E},
{D, A},
{D, A, ZH, E},
{D, L, IA},
{D, O},
{E, G, O},
{E, E},
{E, I_,},
{E, IU},
{E, S, L, I},
{E, S, T, SOFT},
{E, SHCH, E},
{ZH, E},
{Z, A},
{Z, D, E, S, SOFT},
{I},
{I, Z},
{I, L, I},
{I, M},
{I, X},
{K},
{K, A, K},
{K, O},
{K, O, G, D, A},
{K, T, O},
{L, I},
{L, I, B, O},
{M, N, E},
{M, O, ZH, E, T},
{M, Y},
{N, A},
{N, A, D, O},
{N, A, SH},
{N, E},
{N, E, G, O},
{N, E, E},
{N, E, T},
{N, I},
{N, I, X},
{N, O},
{N, U},
{O},
{O, B},
{O, D, N, A, K, O},
{O, N},
{O, N, A},
{O, N, I},
{O, N, O},
{O, T},
{O, CH, E, N, SOFT},
{P, O},
{P, O, D},
{P, R, I},
{S},
{S, O},
{T, A, K},
{T, A, K, ZH, E},
{T, A, K, O, I_},
{T, A, M},
{T, E},
{T, E, M},
{T, O},
{T, O, G, O},
{T, O, ZH, E},
{T, O, I_},
{T, O, L, SOFT, K, O},
{T, O, M},
{T, Y},
{U},
{U, ZH, E},
{X, O, T, IA},
{CH, E, G, O},
{CH, E, I_},
{CH, E, M},
{CH, T, O},
{CH, T, O, B, Y},
{CH, SOFT, E},
{CH, SOFT, IA},
{AE, T, A},
{AE, T, I},
{AE, T, O},
{IA}
}; };
/** /**
@ -186,89 +62,26 @@ public final class RussianAnalyzer extends Analyzer
*/ */
private Set stopSet = new HashSet(); private Set stopSet = new HashSet();
/**
* Charset for Russian letters.
* Represents encoding for 32 lowercase Russian letters.
* Predefined charsets can be taken from RussianCharSets class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
public RussianAnalyzer() { public RussianAnalyzer() {
charset = RussianCharsets.UnicodeRussian; this(RUSSIAN_STOP_WORDS);
stopSet = StopFilter.makeStopSet(
makeStopWords(RussianCharsets.UnicodeRussian));
} }
/**
* Builds an analyzer.
* @deprecated Use {@link #RussianAnalyzer()} instead.
*/
public RussianAnalyzer(char[] charset)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #RussianAnalyzer(String[])} instead.
*/
public RussianAnalyzer(char[] charset, String[] stopwords)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(stopwords);
}
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
*/ */
public RussianAnalyzer(String[] stopwords) public RussianAnalyzer(String[] stopwords)
{ {
this.charset = RussianCharsets.UnicodeRussian; super();
stopSet = StopFilter.makeStopSet(stopwords); stopSet = StopFilter.makeStopSet(stopwords);
} }
/** Takes russian stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset)
{
String[] res = new String[RUSSIAN_STOP_WORDS.length];
for (int i = 0; i < res.length; i++)
{
char[] theStopWord = RUSSIAN_STOP_WORDS[i];
// translate the word, using the charset
StringBuffer theWord = new StringBuffer();
for (int j = 0; j < theStopWord.length; j++)
{
theWord.append(charset[theStopWord[j]]);
}
res[i] = theWord.toString();
}
return res;
}
/**
* Builds an analyzer with the given stop words.
* TODO: create a Set version of this ctor
* @deprecated Use {@link #RussianAnalyzer(Map)} instead.
*/
public RussianAnalyzer(char[] charset, Map stopwords)
{
this.charset = charset;
stopSet = new HashSet(stopwords.keySet());
}
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* TODO: create a Set version of this ctor * TODO: create a Set version of this ctor
*/ */
public RussianAnalyzer(Map stopwords) public RussianAnalyzer(Map stopwords)
{ {
charset = RussianCharsets.UnicodeRussian; super();
stopSet = new HashSet(stopwords.keySet()); stopSet = new HashSet(stopwords.keySet());
} }
@ -283,10 +96,10 @@ public final class RussianAnalyzer extends Analyzer
*/ */
public TokenStream tokenStream(String fieldName, Reader reader) public TokenStream tokenStream(String fieldName, Reader reader)
{ {
TokenStream result = new RussianLetterTokenizer(reader, charset); TokenStream result = new RussianLetterTokenizer(reader);
result = new RussianLowerCaseFilter(result, charset); result = new LowerCaseFilter(result);
result = new StopFilter(result, stopSet); result = new StopFilter(result, stopSet);
result = new RussianStemFilter(result, charset); result = new RussianStemFilter(result);
return result; return result;
} }
@ -309,10 +122,10 @@ public final class RussianAnalyzer extends Analyzer
SavedStreams streams = (SavedStreams) getPreviousTokenStream(); SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) { if (streams == null) {
streams = new SavedStreams(); streams = new SavedStreams();
streams.source = new RussianLetterTokenizer(reader, charset); streams.source = new RussianLetterTokenizer(reader);
streams.result = new RussianLowerCaseFilter(streams.source, charset); streams.result = new LowerCaseFilter(streams.source);
streams.result = new StopFilter(streams.result, stopSet); streams.result = new StopFilter(streams.result, stopSet);
streams.result = new RussianStemFilter(streams.result, charset); streams.result = new RussianStemFilter(streams.result);
setPreviousTokenStream(streams); setPreviousTokenStream(streams);
} else { } else {
streams.source.reset(reader); streams.source.reset(reader);

View File

@ -1,314 +0,0 @@
package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
* for russian characters in Unicode, KOI8 and CP1252.
* <p>
* Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
* and adding logic to toLowerCase() method for that charset.
* </p>
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
* @version $Id$
*/
public class RussianCharsets
{
  /*
   * Every table below has the same layout: 32 lower case letters
   * (positions 0-31), the 32 matching upper case letters (positions 32-63)
   * and finally the ten ASCII digits.
   */

  // Unicode Russian charset
  public static char[] UnicodeRussian = {
    // lower case
    '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
    '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
    '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
    '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
    // upper case
    '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
    '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
    '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
    '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F',
    // numbers
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
  };

  // KOI8 charset
  public static char[] KOI8 = {
    // lower case
    0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
    0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
    0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
    0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
    // upper case
    0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
    0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
    0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
    0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1,
    // numbers
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
  };

  // CP1251 charset
  public static char[] CP1251 = {
    // lower case
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    // upper case
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    // numbers
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
  };

  /**
   * Lower-cases a Russian letter expressed in the encoding identified by
   * <code>charset</code>.
   *
   * @param letter the character to convert
   * @param charset one of {@link #UnicodeRussian}, {@link #KOI8} or
   *        {@link #CP1251}; the table is recognised by reference identity
   * @return the lower case letter in the same encoding; for any other table,
   *         or a character outside the table's letter ranges, the result of
   *         {@link Character#toLowerCase(char)}
   */
  public static char toLowerCase(char letter, char[] charset)
  {
    if (charset == UnicodeRussian)
    {
      // Lower case block is already in final form.
      if (within(letter, '\u0430', '\u044F'))
      {
        return letter;
      }
      // Upper case block sits 32 positions below the lower case one.
      if (within(letter, '\u0410', '\u042F'))
      {
        return (char) (letter + 32);
      }
    }

    if (charset == KOI8)
    {
      // In KOI8 the upper case letters come after the lower case ones.
      if (within(letter, (char) 0xe0, (char) 0xff))
      {
        return (char) (letter - 32);
      }
      if (within(letter, (char) 0xc0, (char) 0xdf))
      {
        return letter;
      }
    }

    if (charset == CP1251)
    {
      if (within(letter, (char) 0xC0, (char) 0xDF))
      {
        return (char) (letter + 32);
      }
      if (within(letter, (char) 0xE0, (char) 0xFF))
      {
        return letter;
      }
    }

    return Character.toLowerCase(letter);
  }

  /** Returns true when <code>first &lt;= c &lt;= last</code>. */
  private static boolean within(char c, char first, char last)
  {
    return c >= first && c <= last;
  }
}

View File

@ -25,49 +25,26 @@ import org.apache.lucene.util.AttributeSource;
/** /**
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer} * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
* by additionally looking up letters in a given "russian charset". * by also allowing the basic latin digits 0-9.
* <p>
* The problem with
* {@link LetterTokenizer} is that it uses {@link Character#isLetter(char)} method,
* which doesn't know how to detect letters in encodings like CP1252 and KOI8
* (well-known problems with 0xD7 and 0xF7 chars)
* </p>
* *
* @version $Id$ * @version $Id$
*/ */
public class RussianLetterTokenizer extends CharTokenizer public class RussianLetterTokenizer extends CharTokenizer
{ {
/**
* Charset this tokenizer uses.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
/**
* @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
*/
public RussianLetterTokenizer(Reader in, char[] charset)
{
super(in);
this.charset = charset;
}
public RussianLetterTokenizer(Reader in) public RussianLetterTokenizer(Reader in)
{ {
this(in, RussianCharsets.UnicodeRussian); super(in);
} }
public RussianLetterTokenizer(AttributeSource source, Reader in) public RussianLetterTokenizer(AttributeSource source, Reader in)
{ {
super(source, in); super(source, in);
this.charset = RussianCharsets.UnicodeRussian;
} }
public RussianLetterTokenizer(AttributeFactory factory, Reader in) public RussianLetterTokenizer(AttributeFactory factory, Reader in)
{ {
super(factory, in); super(factory, in);
this.charset = RussianCharsets.UnicodeRussian;
} }
/** /**
@ -76,14 +53,9 @@ public class RussianLetterTokenizer extends CharTokenizer
*/ */
protected boolean isTokenChar(char c) protected boolean isTokenChar(char c)
{ {
/* in the next release, this can be implemented as isLetter(c) or [0-9] */ if (Character.isLetter(c) || (c >= '0' && c <= '9'))
if (Character.isLetter(c))
return true; return true;
for (int i = 0; i < charset.length; i++) else
{ return false;
if (c == charset[i])
return true;
}
return false;
} }
} }

View File

@ -19,38 +19,26 @@ package org.apache.lucene.analysis.ru;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** /**
* Normalizes token text to lower case, analyzing given ("russian") charset. * Normalizes token text to lower case.
* * @deprecated Use {@link LowerCaseFilter} instead, which has the same
* functionality. This filter will be removed in Lucene 3.1
* *
* @version $Id$ * @version $Id$
*/ */
public final class RussianLowerCaseFilter extends TokenFilter public final class RussianLowerCaseFilter extends TokenFilter
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset;
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
*/
public RussianLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
termAtt = addAttribute(TermAttribute.class);
}
public RussianLowerCaseFilter(TokenStream in) public RussianLowerCaseFilter(TokenStream in)
{ {
this(in, RussianCharsets.UnicodeRussian); super(in);
termAtt = addAttribute(TermAttribute.class);
} }
public final boolean incrementToken() throws IOException public final boolean incrementToken() throws IOException
@ -60,7 +48,7 @@ public final class RussianLowerCaseFilter extends TokenFilter
int chLen = termAtt.termLength(); int chLen = termAtt.termLength();
for (int i = 0; i < chLen; i++) for (int i = 0; i < chLen; i++)
{ {
chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); chArray[i] = Character.toLowerCase(chArray[i]);
} }
return true; return true;
} else { } else {

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@ -28,8 +29,8 @@ import java.io.IOException;
* A {@link TokenFilter} that stems Russian words. * A {@link TokenFilter} that stems Russian words.
* <p> * <p>
* The implementation was inspired by GermanStemFilter. * The implementation was inspired by GermanStemFilter.
* The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter , * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter ,
* because RussianStemFilter only works with lowercase part of any "russian" charset. * because RussianStemFilter only works with lowercase characters.
* </p> * </p>
* *
* @version $Id$ * @version $Id$
@ -43,19 +44,11 @@ public final class RussianStemFilter extends TokenFilter
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
*/
public RussianStemFilter(TokenStream in, char[] charset)
{
super(in);
stemmer = new RussianStemmer(charset);
termAtt = addAttribute(TermAttribute.class);
}
public RussianStemFilter(TokenStream in) public RussianStemFilter(TokenStream in)
{ {
this(in, RussianCharsets.UnicodeRussian); super(in);
stemmer = new RussianStemmer();
termAtt = addAttribute(TermAttribute.class);
} }
/** /**
* Returns the next token in the stream, or null at EOS * Returns the next token in the stream, or null at EOS

View File

@ -25,47 +25,42 @@ package org.apache.lucene.analysis.ru;
*/ */
class RussianStemmer class RussianStemmer
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
// positions of RV, R1 and R2 respectively // positions of RV, R1 and R2 respectively
private int RV, R1, R2; private int RV, R1, R2;
// letters (currently unused letters are commented out) // letters (currently unused letters are commented out)
private final static char A = 0; private final static char A = '\u0430';
//private final static char B = 1; //private final static char B = '\u0431';
private final static char V = 2; private final static char V = '\u0432';
private final static char G = 3; private final static char G = '\u0433';
//private final static char D = 4; //private final static char D = '\u0434';
private final static char E = 5; private final static char E = '\u0435';
//private final static char ZH = 6; //private final static char ZH = '\u0436';
//private final static char Z = 7; //private final static char Z = '\u0437';
private final static char I = 8; private final static char I = '\u0438';
private final static char I_ = 9; private final static char I_ = '\u0439';
//private final static char K = 10; //private final static char K = '\u043A';
private final static char L = 11; private final static char L = '\u043B';
private final static char M = 12; private final static char M = '\u043C';
private final static char N = 13; private final static char N = '\u043D';
private final static char O = 14; private final static char O = '\u043E';
//private final static char P = 15; //private final static char P = '\u043F';
//private final static char R = 16; //private final static char R = '\u0440';
private final static char S = 17; private final static char S = '\u0441';
private final static char T = 18; private final static char T = '\u0442';
private final static char U = 19; private final static char U = '\u0443';
//private final static char F = 20; //private final static char F = '\u0444';
private final static char X = 21; private final static char X = '\u0445';
//private final static char TS = 22; //private final static char TS = '\u0446';
//private final static char CH = 23; //private final static char CH = '\u0447';
private final static char SH = 24; private final static char SH = '\u0448';
private final static char SHCH = 25; private final static char SHCH = '\u0449';
//private final static char HARD = 26; //private final static char HARD = '\u044A';
private final static char Y = 27; private final static char Y = '\u044B';
private final static char SOFT = 28; private final static char SOFT = '\u044C';
private final static char AE = 29; private final static char AE = '\u044D';
private final static char IU = 30; private final static char IU = '\u044E';
private final static char IA = 31; private final static char IA = '\u044F';
// stem definitions // stem definitions
private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA }; private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
@ -256,16 +251,6 @@ class RussianStemmer
super(); super();
} }
/**
* RussianStemmer constructor comment.
* @deprecated Use {@link #RussianStemmer()} instead.
*/
public RussianStemmer(char[] charset)
{
super();
this.charset = charset;
}
/** /**
* Adjectival ending is an adjective ending, * Adjectival ending is an adjective ending,
* optionally preceded by participle ending. * optionally preceded by participle ending.
@ -333,7 +318,7 @@ class RussianStemmer
int stemmingIndex = startIndex; int stemmingIndex = startIndex;
for (int j = theEnding.length - 1; j >= 0; j--) for (int j = theEnding.length - 1; j >= 0; j--)
{ {
if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]]) if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
{ {
match = false; match = false;
break; break;
@ -451,7 +436,7 @@ class RussianStemmer
{ {
for (int i = 0; i < vowels.length; i++) for (int i = 0; i < vowels.length; i++)
{ {
if (letter == charset[vowels[i]]) if (letter == vowels[i])
return true; return true;
} }
return false; return false;
@ -499,7 +484,7 @@ class RussianStemmer
private boolean removeI(StringBuffer stemmingZone) private boolean removeI(StringBuffer stemmingZone)
{ {
if (stemmingZone.length() > 0 if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[I]) && stemmingZone.charAt(stemmingZone.length() - 1) == I)
{ {
stemmingZone.setLength(stemmingZone.length() - 1); stemmingZone.setLength(stemmingZone.length() - 1);
return true; return true;
@ -518,7 +503,7 @@ class RussianStemmer
private boolean removeSoft(StringBuffer stemmingZone) private boolean removeSoft(StringBuffer stemmingZone)
{ {
if (stemmingZone.length() > 0 if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT]) && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
{ {
stemmingZone.setLength(stemmingZone.length() - 1); stemmingZone.setLength(stemmingZone.length() - 1);
return true; return true;
@ -529,17 +514,6 @@ class RussianStemmer
} }
} }
/**
* Insert the method's description here.
* Creation date: (16/03/2002 10:58:42 PM)
* @param newCharset char[]
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
public void setCharset(char[] newCharset)
{
charset = newCharset;
}
/** /**
* Finds the stem for given Russian word. * Finds the stem for given Russian word.
* Creation date: (16/03/2002 3:36:48 PM) * Creation date: (16/03/2002 3:36:48 PM)
@ -622,25 +596,13 @@ class RussianStemmer
verb1Predessors) verb1Predessors)
|| findAndRemoveEnding(stemmingZone, verbEndings2); || findAndRemoveEnding(stemmingZone, verbEndings2);
} }
/**
* Static method for stemming with different charsets
* @deprecated Use {@link #stemWord(String)} instead.
*/
public static String stem(String theWord, char[] charset)
{
RussianStemmer stemmer = new RussianStemmer();
stemmer.setCharset(charset);
return stemmer.stem(theWord);
}
/** /**
* Static method for stemming. * Static method for stemming.
*/ */
public static String stemWord(String theWord) public static String stemWord(String theWord)
{ {
RussianStemmer stemmer = new RussianStemmer(); RussianStemmer stemmer = new RussianStemmer();
stemmer.setCharset(RussianCharsets.UnicodeRussian);
return stemmer.stem(theWord); return stemmer.stem(theWord);
} }
} }

View File

@ -42,14 +42,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
private InputStreamReader sampleUnicode; private InputStreamReader sampleUnicode;
private Reader inWordsKOI8;
private Reader sampleKOI8;
private Reader inWords1251;
private Reader sample1251;
private File dataDir; private File dataDir;
protected void setUp() throws Exception protected void setUp() throws Exception
@ -97,76 +89,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
inWords.close(); inWords.close();
sampleUnicode.close(); sampleUnicode.close();
} }
public void testKOI8() throws IOException
{
//System.out.println(new java.util.Date());
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
// KOI8
inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
TokenStream in = ra.tokenStream("all", inWordsKOI8);
RussianLetterTokenizer sample =
new RussianLetterTokenizer(
sampleKOI8,
RussianCharsets.KOI8);
TermAttribute text = in.getAttribute(TermAttribute.class);
TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
for (;;)
{
if (in.incrementToken() == false)
break;
boolean nextSampleToken = sample.incrementToken();
assertEquals(
"KOI8",
text.term(),
nextSampleToken == false
? null
: sampleText.term());
}
inWordsKOI8.close();
sampleKOI8.close();
}
public void test1251() throws IOException
{
// 1251
inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
TokenStream in = ra.tokenStream("", inWords1251);
RussianLetterTokenizer sample =
new RussianLetterTokenizer(
sample1251,
RussianCharsets.CP1251);
TermAttribute text = in.getAttribute(TermAttribute.class);
TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
for (;;)
{
if (in.incrementToken() == false)
break;
boolean nextSampleToken = sample.incrementToken();
assertEquals(
"1251",
text.term(),
nextSampleToken == false
? null
: sampleText.term());
}
inWords1251.close();
sample1251.close();
}
public void testDigitsInRussianCharset() public void testDigitsInRussianCharset()
{ {

View File

@ -84,9 +84,8 @@ public class TestRussianStem extends LuceneTestCase
{ {
//if ( (i % 100) == 0 ) System.err.println(i); //if ( (i % 100) == 0 ) System.err.println(i);
String realStem = String realStem =
RussianStemmer.stem( RussianStemmer.stemWord(
(String) words.get(i), (String) words.get(i));
RussianCharsets.UnicodeRussian);
assertEquals("unicode", stems.get(i), realStem); assertEquals("unicode", stems.get(i), realStem);
} }
} }

View File

@ -1 +0,0 @@
[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]

View File

@ -1 +0,0 @@
[淄庞註[由蘛[芴潘砸贤燎紊晕][芪乓荾[赏盘][幸拍釉磷膛蝅[铀林][忠琶][囊抛蝅[徘尚註[谖廖][纫廖][粤饰][遮薦[艘涨][邢幼演盼][子阉][滓磐盼][咨韵薦[幸晌][酉耛[蜗譣[耘任咸锨][恿蚞[呐蘛[伊铀屹琢][邢粤盼][谖廖][幸胖蝅[着薦[窍紫襗[蜗譣[晌葡彝撩][釉廖献][南釉招蝅[凵蚁薦[艘涨][邢特谙琢耘蘛[耘萞[犹辙羃[酉谖廖][下菖釉譣[窍韵譣[紫有疑窝][紫有咸刳献羃

View File

@ -1,2 +0,0 @@
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.

View File

@ -1,2 +0,0 @@
Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.