mirror of https://github.com/apache/lucene.git
LUCENE-1936: Remove deprecated charset support from Greek and Russian analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@820756 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in: parent c1f5e753d7, commit dd9c1b0101
CHANGES.txt
@@ -6,6 +6,10 @@ Changes in runtime behavior

 API Changes

+* LUCENE-1936: Deprecated RussianLowerCaseFilter, because it transforms
+  text exactly the same as LowerCaseFilter. Please use LowerCaseFilter
+  instead, which has the same functionality. (Robert Muir)
+
 Bug fixes

 * LUCENE-1781: Fixed various issues with the lat/lng bounding box
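The CHANGES entry above boils down to a one-line substitution in any hand-built analysis chain. A minimal migration sketch, assuming the Lucene 2.9-era classes touched by this commit (the wrapper class name and the sample string are illustrative, not part of the commit):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;

public class LowerCaseMigration {
    public static void main(String[] args) throws IOException {
        TokenStream source = new RussianLetterTokenizer(new StringReader("Вместе С Тем"));

        // Before (deprecated by this commit):
        //   TokenStream result = new RussianLowerCaseFilter(source);
        // After: the generic LowerCaseFilter behaves identically for Russian text.
        TokenStream result = new LowerCaseFilter(source);

        TermAttribute term = result.addAttribute(TermAttribute.class);
        while (result.incrementToken()) {
            System.out.println(term.term());   // prints lowercased tokens
        }
    }
}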
GreekAnalyzer.java
@@ -39,111 +39,19 @@ import java.util.Set;
  */
 public final class GreekAnalyzer extends Analyzer
 {
-    // the letters are indexes to the charset array (see GreekCharsets.java)
-    private static char A = 6,  B = 7,  G = 8,  D = 9,  E = 10, Z = 11, H = 12, TH = 13;
-    private static char I = 14, K = 15, L = 16, M = 17, N = 18, KS = 19, O = 20, P = 21;
-    private static char R = 22, S = 24 /* skip final sigma */, T = 25, Y = 26, F = 27;
-    private static char X = 28, PS = 29, W = 30;
-
     /**
      * List of typical Greek stopwords.
      */
-    private static char[][] GREEK_STOP_WORDS = {
-        {O}, {H}, {T, O}, {O, I}, {T, A}, {T, O, Y}, {T, H, S}, {T, W, N}, {T, O, N}, {T, H, N},
-        {K, A, I}, {K, I}, {K}, {E, I, M, A, I}, {E, I, S, A, I}, {E, I, N, A, I},
-        {E, I, M, A, S, T, E}, {E, I, S, T, E}, {S, T, O}, {S, T, O, N}, {S, T, H}, {S, T, H, N},
-        {M, A}, {A, L, L, A}, {A, P, O}, {G, I, A}, {P, R, O, S}, {M, E}, {S, E}, {W, S},
-        {P, A, R, A}, {A, N, T, I}, {K, A, T, A}, {M, E, T, A}, {TH, A}, {N, A}, {D, E}, {D, E, N},
-        {M, H}, {M, H, N}, {E, P, I}, {E, N, W}, {E, A, N}, {A, N}, {T, O, T, E}, {P, O, Y},
-        {P, W, S}, {P, O, I, O, S}, {P, O, I, A}, {P, O, I, O}, {P, O, I, O, I}, {P, O, I, E, S},
-        {P, O, I, W, N}, {P, O, I, O, Y, S}, {A, Y, T, O, S}, {A, Y, T, H}, {A, Y, T, O},
-        {A, Y, T, O, I}, {A, Y, T, W, N}, {A, Y, T, O, Y, S}, {A, Y, T, E, S}, {A, Y, T, A},
-        {E, K, E, I, N, O, S}, {E, K, E, I, N, H}, {E, K, E, I, N, O}, {E, K, E, I, N, O, I},
-        {E, K, E, I, N, E, S}, {E, K, E, I, N, A}, {E, K, E, I, N, W, N}, {E, K, E, I, N, O, Y, S},
-        {O, P, W, S}, {O, M, W, S}, {I, S, W, S}, {O, S, O}, {O, T, I}
+    private static final String[] GREEK_STOP_WORDS = {
+        "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και",
+        "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
+        "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
+        "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
+        "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
+        "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
+        "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
+        "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
+        "ισωσ", "οσο", "οτι"
     };

     /**
@@ -151,28 +59,8 @@ public final class GreekAnalyzer extends Analyzer
      */
     private Set stopSet = new HashSet();

-    /**
-     * Charset for Greek letters.
-     * Represents encoding for 24 lowercase Greek letters.
-     * Predefined charsets can be taken from {@link GreekCharsets} class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     public GreekAnalyzer() {
-        charset = GreekCharsets.UnicodeGreek;
-        stopSet = StopFilter.makeStopSet(
-            makeStopWords(GreekCharsets.UnicodeGreek));
-    }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #GreekAnalyzer()} instead.
-     */
-    public GreekAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
+        this(GREEK_STOP_WORDS);
     }

     /**
@@ -181,58 +69,16 @@ public final class GreekAnalyzer extends Analyzer
      */
     public GreekAnalyzer(String [] stopwords)
     {
-        charset = GreekCharsets.UnicodeGreek;
+        super();
         stopSet = StopFilter.makeStopSet(stopwords);
     }

-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(String[])} instead.
-     */
-    public GreekAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
-    }
-
-    /**
-     * Takes greek stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[GREEK_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = GREEK_STOP_WORDS[i];
-            // translate the word, using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #GreekAnalyzer(Map)} instead.
-     */
-    public GreekAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
-
     /**
      * Builds an analyzer with the given stop words.
      */
     public GreekAnalyzer(Map stopwords)
     {
-        charset = GreekCharsets.UnicodeGreek;
+        super();
         stopSet = new HashSet(stopwords.keySet());
     }

@@ -245,7 +91,7 @@ public final class GreekAnalyzer extends Analyzer
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
         TokenStream result = new StandardTokenizer(reader);
-        result = new GreekLowerCaseFilter(result, charset);
+        result = new GreekLowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
         return result;
     }

@@ -268,7 +114,7 @@ public final class GreekAnalyzer extends Analyzer
         if (streams == null) {
             streams = new SavedStreams();
             streams.source = new StandardTokenizer(reader);
-            streams.result = new GreekLowerCaseFilter(streams.source, charset);
+            streams.result = new GreekLowerCaseFilter(streams.source);
             streams.result = new StopFilter(streams.result, stopSet);
             setPreviousTokenStream(streams);
         } else {
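With the charset plumbing gone, GreekAnalyzer is driven purely on Unicode text. A minimal usage sketch assuming the Lucene 2.9-era API shown in this diff (the example class, field name and sample phrase are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class GreekAnalyzerExample {
    public static void main(String[] args) throws IOException {
        GreekAnalyzer analyzer = new GreekAnalyzer();   // uses the built-in GREEK_STOP_WORDS
        TokenStream ts = analyzer.tokenStream("content", new StringReader("Αυτό είναι ένα κείμενο"));

        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            // tokens come out lowercased, with diacritics stripped and stop words removed
            System.out.println(term.term());
        }
    }
}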
GreekCharsets.java (entire file deleted)
@@ -1,482 +0,0 @@
package org.apache.lucene.analysis.el;

[Apache License 2.0 file header]

/**
 * GreekCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
 * for Greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
 * <p>
 * Each encoding scheme contains lowercase (positions 0-35) and uppercase (positions 36-68) characters,
 * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
 * the definition of a new charset as well as the required logic in the toLowerCase() method.
 * </p>
 * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
 */
public class GreekCharsets
{
    // Unicode Greek charset
    public static char[] UnicodeGreek = {
        // lower case
        '\u0390', '\u03AC', '\u03AD', '\u03AE', '\u03AF', '\u03B0', '\u03B1', '\u03B2', '\u03B3',
        '\u03B4', '\u03B5', '\u03B6', '\u03B7', '\u03B8', '\u03B9', '\u03BA', '\u03BB', '\u03BC',
        '\u03BD', '\u03BE', '\u03BF', '\u03C0', '\u03C1', '\u03C2', '\u03C3', '\u03C4', '\u03C5',
        '\u03C6', '\u03C7', '\u03C8', '\u03C9', '\u03CA', '\u03CB', '\u03CC', '\u03CD', '\u03CE',
        // upper case
        '\u0386', '\u0388', '\u0389', '\u038A', '\u038C', '\u038E', '\u038F', '\u0391', '\u0392',
        '\u0393', '\u0394', '\u0395', '\u0396', '\u0397', '\u0398', '\u0399', '\u039A', '\u039B',
        '\u039C', '\u039D', '\u039E', '\u039F', '\u03A0', '\u03A1', '\u03A3', '\u03A4', '\u03A5',
        '\u03A6', '\u03A7', '\u03A8', '\u03A9', '\u03AA', '\u03AB'
    };

    // ISO-8859-7 charset (ELOT-928)
    public static char[] ISO = {
        // lower case
        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
        0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
        0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
        // upper case
        0xb6, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
        0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
    };

    // CP1253 charset
    public static char[] CP1253 = {
        // lower case
        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
        0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
        0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
        // upper case
        0xa2, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
        0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
        0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
    };

    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeGreek) {
            // First deal with lower case, not accented letters
            if (letter >= '\u03B1' && letter <= '\u03C9') {
                // Special case 'small final sigma', where we return 'small sigma'
                return (letter == '\u03C2') ? '\u03C3' : letter;
            }
            // Then deal with lower case, accented letters
            if (letter == '\u03AC') return '\u03B1';                                             // alpha with acute
            if (letter == '\u03AD') return '\u03B5';                                             // epsilon with acute
            if (letter == '\u03AE') return '\u03B7';                                             // eta with acute
            if (letter == '\u03AF' || letter == '\u03CA' || letter == '\u0390') return '\u03B9'; // iota with acute/diaeresis
            if (letter == '\u03CD' || letter == '\u03CB' || letter == '\u03B0') return '\u03C5'; // upsilon with acute/diaeresis
            if (letter == '\u03CC') return '\u03BF';                                             // omicron with acute
            if (letter == '\u03CE') return '\u03C9';                                             // omega with acute
            // After that, deal with upper case, not accented letters
            if (letter >= '\u0391' && letter <= '\u03A9') return (char) (letter + 32);
            // Finally deal with upper case, accented letters
            if (letter == '\u0386') return '\u03B1';                                             // alpha with acute
            if (letter == '\u0388') return '\u03B5';                                             // epsilon with acute
            if (letter == '\u0389') return '\u03B7';                                             // eta with acute
            if (letter == '\u038A' || letter == '\u03AA') return '\u03B9';                       // iota with acute/diaeresis
            if (letter == '\u038E' || letter == '\u03AB') return '\u03C5';                       // upsilon with acute/diaeresis
            if (letter == '\u038C') return '\u03BF';                                             // omicron with acute
            if (letter == '\u038F') return '\u03C9';                                             // omega with acute
        } else if (charset == ISO) {
            // First deal with lower case, not accented letters
            if (letter >= 0xe1 && letter <= 0xf9) {
                // Special case 'small final sigma', where we return 'small sigma'
                return (letter == 0xf2) ? (char) 0xf3 : letter;
            }
            // Then deal with lower case, accented letters
            if (letter == 0xdc) return 0xe1;                                       // alpha with acute
            if (letter == 0xdd) return 0xe5;                                       // epsilon with acute
            if (letter == 0xde) return 0xe7;                                       // eta with acute
            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) return '\u03B9'; // iota with acute/diaeresis
            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) return 0xf5;   // upsilon with acute/diaeresis
            if (letter == 0xfc) return 0xef;                                       // omicron with acute
            if (letter == 0xfe) return 0xf9;                                       // omega with acute
            // After that, deal with upper case, not accented letters
            if (letter >= 0xc1 && letter <= 0xd9) return (char) (letter + 32);
            // Finally deal with upper case, accented letters
            if (letter == 0xb6) return 0xe1;                                       // alpha with acute
            if (letter == 0xb8) return 0xe5;                                       // epsilon with acute
            if (letter == 0xb9) return 0xe7;                                       // eta with acute
            if (letter == 0xba || letter == 0xda) return 0xe9;                     // iota with acute/diaeresis
            if (letter == 0xbe || letter == 0xdb) return 0xf5;                     // upsilon with acute/diaeresis
            if (letter == 0xbc) return 0xef;                                       // omicron with acute
            if (letter == 0xbf) return 0xf9;                                       // omega with acute
        } else if (charset == CP1253) {
            // First deal with lower case, not accented letters
            if (letter >= 0xe1 && letter <= 0xf9) {
                // Special case 'small final sigma', where we return 'small sigma'
                return (letter == 0xf2) ? (char) 0xf3 : letter;
            }
            // Then deal with lower case, accented letters
            if (letter == 0xdc) return 0xe1;                                       // alpha with acute
            if (letter == 0xdd) return 0xe5;                                       // epsilon with acute
            if (letter == 0xde) return 0xe7;                                       // eta with acute
            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) return '\u03B9'; // iota with acute/diaeresis
            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) return 0xf5;   // upsilon with acute/diaeresis
            if (letter == 0xfc) return 0xef;                                       // omicron with acute
            if (letter == 0xfe) return 0xf9;                                       // omega with acute
            // After that, deal with upper case, not accented letters
            if (letter >= 0xc1 && letter <= 0xd9) return (char) (letter + 32);
            // Finally deal with upper case, accented letters
            if (letter == 0xa2) return 0xe1;                                       // alpha with acute
            if (letter == 0xb8) return 0xe5;                                       // epsilon with acute
            if (letter == 0xb9) return 0xe7;                                       // eta with acute
            if (letter == 0xba || letter == 0xda) return 0xe9;                     // iota with acute/diaeresis
            if (letter == 0xbe || letter == 0xdb) return 0xf5;                     // upsilon with acute/diaeresis
            if (letter == 0xbc) return 0xef;                                       // omicron with acute
            if (letter == 0xbf) return 0xf9;                                       // omega with acute
        }

        return Character.toLowerCase(letter);
    }
}
GreekLowerCaseFilter.java
@@ -23,44 +23,93 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Normalizes token text to lower case, analyzing given ("greek") charset.
+ * Normalizes token text to lower case, removes some Greek diacritics,
+ * and standardizes final sigma to sigma.
  *
  */
 public final class GreekLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
-     */
-    public GreekLowerCaseFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public GreekLowerCaseFilter(TokenStream in)
     {
-        this(in, GreekCharsets.UnicodeGreek);
+        super(in);
+        termAtt = addAttribute(TermAttribute.class);
     }

     public boolean incrementToken() throws IOException {
         if (input.incrementToken()) {
             char[] chArray = termAtt.termBuffer();
             int chLen = termAtt.termLength();
+            // TODO: iterate codepoints to support supp. characters
             for (int i = 0; i < chLen; i++)
             {
-                chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
+                chArray[i] = (char) lowerCase(chArray[i]);
             }
             return true;
         } else {
             return false;
         }
     }
+
+    private int lowerCase(int codepoint) {
+        switch(codepoint) {
+            /* There are two lowercase forms of sigma:
+             *   U+03C2: small final sigma (end of word)
+             *   U+03C3: small sigma (otherwise)
+             * Standardize both to U+03C3
+             */
+            case '\u03C2': /* small final sigma */
+                return '\u03C3'; /* small sigma */
+
+            /* Some greek characters contain diacritics.
+             * This filter removes these, converting to the lowercase base form.
+             */
+            case '\u0386': /* capital alpha with tonos */
+            case '\u03AC': /* small alpha with tonos */
+                return '\u03B1'; /* small alpha */
+
+            case '\u0388': /* capital epsilon with tonos */
+            case '\u03AD': /* small epsilon with tonos */
+                return '\u03B5'; /* small epsilon */
+
+            case '\u0389': /* capital eta with tonos */
+            case '\u03AE': /* small eta with tonos */
+                return '\u03B7'; /* small eta */
+
+            case '\u038A': /* capital iota with tonos */
+            case '\u03AA': /* capital iota with dialytika */
+            case '\u03AF': /* small iota with tonos */
+            case '\u03CA': /* small iota with dialytika */
+            case '\u0390': /* small iota with dialytika and tonos */
+                return '\u03B9'; /* small iota */
+
+            case '\u038E': /* capital upsilon with tonos */
+            case '\u03AB': /* capital upsilon with dialytika */
+            case '\u03CD': /* small upsilon with tonos */
+            case '\u03CB': /* small upsilon with dialytika */
+            case '\u03B0': /* small upsilon with dialytika and tonos */
+                return '\u03C5'; /* small upsilon */
+
+            case '\u038C': /* capital omicron with tonos */
+            case '\u03CC': /* small omicron with tonos */
+                return '\u03BF'; /* small omicron */
+
+            case '\u038F': /* capital omega with tonos */
+            case '\u03CE': /* small omega with tonos */
+                return '\u03C9'; /* small omega */
+
+            /* The previous implementation did the conversion below.
+             * Only implemented for backwards compatibility with old indexes.
+             */
+            case '\u03A2': /* reserved */
+                return '\u03C2'; /* small final sigma */
+
+            default:
+                return Character.toLowerCase(codepoint);
+        }
+    }
+}
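The new lowerCase() switch above folds final sigma to plain sigma and strips tonos/dialytika while lowercasing. A small standalone sketch of the filter in use, assuming the classes shown in this diff (the example class and sample word are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class GreekLowerCaseDemo {
    public static void main(String[] args) throws IOException {
        // "Ωραίος" should come out as "ωραιοσ": lowercased, tonos removed,
        // and the trailing final sigma folded to sigma by the new switch.
        TokenStream ts = new GreekLowerCaseFilter(
                new StandardTokenizer(new StringReader("Ωραίος")));

        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());
        }
    }
}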
RussianAnalyzer.java
@@ -24,6 +24,7 @@ import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;

@@ -40,145 +41,20 @@ import org.apache.lucene.analysis.Tokenizer;
  */
 public final class RussianAnalyzer extends Analyzer
 {
-    // letters (currently unused letters are commented out)
-    private final static char A = 0,  B = 1,  V = 2,  G = 3,  D = 4,  E = 5,  ZH = 6, Z = 7;
-    private final static char I = 8,  I_ = 9, K = 10, L = 11, M = 12, N = 13, O = 14, P = 15;
-    private final static char R = 16, S = 17, T = 18, U = 19, /* F = 20, */ X = 21;
-    private final static char /* TS = 22, */ CH = 23, SH = 24, SHCH = 25, /* HARD = 26, */ Y = 27;
-    private final static char SOFT = 28, AE = 29, IU = 30, IA = 31;
-
     /**
      * List of typical Russian stopwords.
      */
-    private static char[][] RUSSIAN_STOP_WORDS = {
-        {A}, {B, E, Z}, {B, O, L, E, E}, {B, Y}, {B, Y, L}, {B, Y, L, A}, {B, Y, L, I}, {B, Y, L, O},
-        {B, Y, T, SOFT}, {V}, {V, A, M}, {V, A, S}, {V, E, S, SOFT}, {V, O}, {V, O, T}, {V, S, E},
-        {V, S, E, G, O}, {V, S, E, X}, {V, Y}, {G, D, E}, {D, A}, {D, A, ZH, E}, {D, L, IA}, {D, O},
-        {E, G, O}, {E, E}, {E, I_}, {E, IU}, {E, S, L, I}, {E, S, T, SOFT}, {E, SHCH, E}, {ZH, E},
-        {Z, A}, {Z, D, E, S, SOFT}, {I}, {I, Z}, {I, L, I}, {I, M}, {I, X}, {K}, {K, A, K}, {K, O},
-        {K, O, G, D, A}, {K, T, O}, {L, I}, {L, I, B, O}, {M, N, E}, {M, O, ZH, E, T}, {M, Y},
-        {N, A}, {N, A, D, O}, {N, A, SH}, {N, E}, {N, E, G, O}, {N, E, E}, {N, E, T}, {N, I},
-        {N, I, X}, {N, O}, {N, U}, {O}, {O, B}, {O, D, N, A, K, O}, {O, N}, {O, N, A}, {O, N, I},
-        {O, N, O}, {O, T}, {O, CH, E, N, SOFT}, {P, O}, {P, O, D}, {P, R, I}, {S}, {S, O},
-        {T, A, K}, {T, A, K, ZH, E}, {T, A, K, O, I_}, {T, A, M}, {T, E}, {T, E, M}, {T, O},
-        {T, O, G, O}, {T, O, ZH, E}, {T, O, I_}, {T, O, L, SOFT, K, O}, {T, O, M}, {T, Y}, {U},
-        {U, ZH, E}, {X, O, T, IA}, {CH, E, G, O}, {CH, E, I_}, {CH, E, M}, {CH, T, O},
-        {CH, T, O, B, Y}, {CH, SOFT, E}, {CH, SOFT, IA}, {AE, T, A}, {AE, T, I}, {AE, T, O}, {IA}
+    private static final String[] RUSSIAN_STOP_WORDS = {
+        "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
+        "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
+        "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
+        "еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
+        "ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо",
+        "наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об",
+        "однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при",
+        "с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того",
+        "тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
+        "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
     };

     /**
@@ -186,89 +62,26 @@ public final class RussianAnalyzer extends Analyzer
      */
     private Set stopSet = new HashSet();

-    /**
-     * Charset for Russian letters.
-     * Represents encoding for 32 lowercase Russian letters.
-     * Predefined charsets can be taken from RussianCharSets class
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     public RussianAnalyzer() {
-        charset = RussianCharsets.UnicodeRussian;
-        stopSet = StopFilter.makeStopSet(
-            makeStopWords(RussianCharsets.UnicodeRussian));
-    }
-
-    /**
-     * Builds an analyzer.
-     * @deprecated Use {@link #RussianAnalyzer()} instead.
-     */
-    public RussianAnalyzer(char[] charset)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * @deprecated Use {@link #RussianAnalyzer(String[])} instead.
-     */
-    public RussianAnalyzer(char[] charset, String[] stopwords)
-    {
-        this.charset = charset;
-        stopSet = StopFilter.makeStopSet(stopwords);
+        this(RUSSIAN_STOP_WORDS);
     }

     /**
      * Builds an analyzer with the given stop words.
      */
     public RussianAnalyzer(String[] stopwords)
     {
-        this.charset = RussianCharsets.UnicodeRussian;
+        super();
         stopSet = StopFilter.makeStopSet(stopwords);
     }

-    /** Takes russian stop words and translates them to a String array, using
-     * the given charset.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private static String[] makeStopWords(char[] charset)
-    {
-        String[] res = new String[RUSSIAN_STOP_WORDS.length];
-        for (int i = 0; i < res.length; i++)
-        {
-            char[] theStopWord = RUSSIAN_STOP_WORDS[i];
-            // translate the word, using the charset
-            StringBuffer theWord = new StringBuffer();
-            for (int j = 0; j < theStopWord.length; j++)
-            {
-                theWord.append(charset[theStopWord[j]]);
-            }
-            res[i] = theWord.toString();
-        }
-        return res;
-    }
-
-    /**
-     * Builds an analyzer with the given stop words.
-     * TODO: create a Set version of this ctor
-     * @deprecated Use {@link #RussianAnalyzer(Map)} instead.
-     */
-    public RussianAnalyzer(char[] charset, Map stopwords)
-    {
-        this.charset = charset;
-        stopSet = new HashSet(stopwords.keySet());
-    }
-
     /**
      * Builds an analyzer with the given stop words.
      * TODO: create a Set version of this ctor
      */
     public RussianAnalyzer(Map stopwords)
     {
-        charset = RussianCharsets.UnicodeRussian;
+        super();
         stopSet = new HashSet(stopwords.keySet());
     }

@@ -283,10 +96,10 @@ public final class RussianAnalyzer extends Analyzer
      */
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
-        TokenStream result = new RussianLetterTokenizer(reader, charset);
-        result = new RussianLowerCaseFilter(result, charset);
+        TokenStream result = new RussianLetterTokenizer(reader);
+        result = new LowerCaseFilter(result);
         result = new StopFilter(result, stopSet);
-        result = new RussianStemFilter(result, charset);
+        result = new RussianStemFilter(result);
         return result;
     }

@@ -309,10 +122,10 @@ public final class RussianAnalyzer extends Analyzer
         SavedStreams streams = (SavedStreams) getPreviousTokenStream();
         if (streams == null) {
             streams = new SavedStreams();
-            streams.source = new RussianLetterTokenizer(reader, charset);
-            streams.result = new RussianLowerCaseFilter(streams.source, charset);
+            streams.source = new RussianLetterTokenizer(reader);
+            streams.result = new LowerCaseFilter(streams.source);
             streams.result = new StopFilter(streams.result, stopSet);
-            streams.result = new RussianStemFilter(streams.result, charset);
+            streams.result = new RussianStemFilter(streams.result);
             setPreviousTokenStream(streams);
         } else {
             streams.source.reset(reader);
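Same pattern on the Russian side: stop words are now plain Unicode strings, and the no-arg constructor simply delegates to the built-in list. A usage sketch assuming the 2.9-era API in this diff (the example class, field name, stop-word list and sample phrase are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class RussianAnalyzerExample {
    public static void main(String[] args) throws IOException {
        // The default constructor now delegates to this(RUSSIAN_STOP_WORDS);
        // custom Unicode stop words can be supplied the same way.
        RussianAnalyzer analyzer = new RussianAnalyzer(new String[] { "и", "в", "на" });

        TokenStream ts = analyzer.tokenStream("body", new StringReader("Сила и знание"));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());   // lowercased, stemmed tokens; "и" is dropped
        }
    }
}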
RussianCharsets.java (entire file deleted)
@@ -1,314 +0,0 @@
package org.apache.lucene.analysis.ru;

[Apache License 2.0 file header]

/**
 * RussianCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation
 * for Russian characters in Unicode, KOI8 and CP1251.
 * <p>
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to the toLowerCase() method for that charset.
 * </p>
 * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
 * @version $Id$
 */
public class RussianCharsets
{
    // Unicode Russian charset (lowercase letters only)
    public static char[] UnicodeRussian = {
        '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
        '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
        '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
        '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
        // upper case
        '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
        '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
        '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
        '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F',
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    // KOI8 charset
    public static char[] KOI8 = {
        0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
        0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
        // upper case
        0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
        0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1,
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    // CP1251 charset
    public static char[] CP1251 = {
        0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
        // upper case
        0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
        0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
        // numbers
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
    };

    public static char toLowerCase(char letter, char[] charset)
    {
        if (charset == UnicodeRussian)
        {
            if (letter >= '\u0430' && letter <= '\u044F') return letter;
            if (letter >= '\u0410' && letter <= '\u042F') return (char) (letter + 32);
        }

        if (charset == KOI8)
        {
            if (letter >= 0xe0 && letter <= 0xff) return (char) (letter - 32);
            if (letter >= 0xc0 && letter <= 0xdf) return letter;
        }

        if (charset == CP1251)
        {
            if (letter >= 0xC0 && letter <= 0xDF) return (char) (letter + 32);
            if (letter >= 0xE0 && letter <= 0xFF) return letter;
        }

        return Character.toLowerCase(letter);
    }
}
RussianLetterTokenizer.java
@@ -25,49 +25,26 @@ import org.apache.lucene.util.AttributeSource;

 /**
  * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
- * by additionally looking up letters in a given "russian charset".
- * <p>
- * The problem with
- * {@link LetterTokenizer} is that it uses {@link Character#isLetter(char)} method,
- * which doesn't know how to detect letters in encodings like CP1252 and KOI8
- * (well-known problems with 0xD7 and 0xF7 chars)
- * </p>
+ * by also allowing the basic latin digits 0-9.
  *
  * @version $Id$
  */
 public class RussianLetterTokenizer extends CharTokenizer
 {
-    /**
-     * Charset this tokenizer uses.
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
-    /**
-     * @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
-     */
-    public RussianLetterTokenizer(Reader in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-    }
-
     public RussianLetterTokenizer(Reader in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
     }

     public RussianLetterTokenizer(AttributeSource source, Reader in)
     {
         super(source, in);
-        this.charset = RussianCharsets.UnicodeRussian;
     }

     public RussianLetterTokenizer(AttributeFactory factory, Reader in)
     {
         super(factory, in);
-        this.charset = RussianCharsets.UnicodeRussian;
     }

@@ -76,14 +53,9 @@ public class RussianLetterTokenizer extends CharTokenizer
      */
     protected boolean isTokenChar(char c)
     {
-        /* in the next release, this can be implemented as isLetter(c) or [0-9] */
-        if (Character.isLetter(c))
+        if (Character.isLetter(c) || (c >= '0' && c <= '9'))
             return true;
-        for (int i = 0; i < charset.length; i++)
-        {
-            if (c == charset[i])
-                return true;
-        }
-        return false;
+        else
+            return false;
     }
 }
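The new isTokenChar() rule is small enough to restate on its own: a character continues a token iff it is a Unicode letter or a basic Latin digit. A sketch of that predicate outside the tokenizer (the wrapper class is illustrative, not part of the commit):

public class TokenCharRule {
    // Mirrors the new RussianLetterTokenizer.isTokenChar(): any Unicode letter
    // or a basic Latin digit continues a token; everything else is a break.
    static boolean isTokenChar(char c) {
        return Character.isLetter(c) || (c >= '0' && c <= '9');
    }

    public static void main(String[] args) {
        System.out.println(isTokenChar('ж'));  // true  - Cyrillic letter
        System.out.println(isTokenChar('7'));  // true  - digits are kept inside tokens
        System.out.println(isTokenChar('-'));  // false - punctuation splits tokens
    }
}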
RussianLowerCaseFilter.java
@@ -19,38 +19,26 @@ package org.apache.lucene.analysis.ru;

 import java.io.IOException;

+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
- * Normalizes token text to lower case, analyzing given ("russian") charset.
- *
+ * Normalizes token text to lower case.
+ * @deprecated Use {@link LowerCaseFilter} instead, which has the same
+ * functionality. This filter will be removed in Lucene 3.1
  *
  * @version $Id$
  */
 public final class RussianLowerCaseFilter extends TokenFilter
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    char[] charset;
-
     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
-     */
-    public RussianLowerCaseFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        this.charset = charset;
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public RussianLowerCaseFilter(TokenStream in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
+        termAtt = addAttribute(TermAttribute.class);
     }

     public final boolean incrementToken() throws IOException

@@ -60,7 +48,7 @@ public final class RussianLowerCaseFilter extends TokenFilter
             int chLen = termAtt.termLength();
             for (int i = 0; i < chLen; i++)
             {
-                chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
+                chArray[i] = Character.toLowerCase(chArray[i]);
             }
             return true;
         } else {
RussianStemFilter.java
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.LowerCaseFilter; // for javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

@@ -28,8 +29,8 @@ import java.io.IOException;
  * A {@link TokenFilter} that stems Russian words.
  * <p>
  * The implementation was inspired by GermanStemFilter.
- * The input should be filtered by {@link RussianLowerCaseFilter} before passing it to RussianStemFilter,
- * because RussianStemFilter only works with lowercase part of any "russian" charset.
+ * The input should be filtered by {@link LowerCaseFilter} before passing it to RussianStemFilter,
+ * because RussianStemFilter only works with lowercase characters.
  * </p>
  *
  * @version $Id$

@@ -43,19 +44,11 @@ public final class RussianStemFilter extends TokenFilter

     private TermAttribute termAtt;

-    /**
-     * @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
-     */
-    public RussianStemFilter(TokenStream in, char[] charset)
-    {
-        super(in);
-        stemmer = new RussianStemmer(charset);
-        termAtt = addAttribute(TermAttribute.class);
-    }
-
     public RussianStemFilter(TokenStream in)
     {
-        this(in, RussianCharsets.UnicodeRussian);
+        super(in);
+        stemmer = new RussianStemmer();
+        termAtt = addAttribute(TermAttribute.class);
     }
     /**
      * Returns the next token in the stream, or null at EOS
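As the updated javadoc notes, RussianStemFilter now expects a plain LowerCaseFilter in front of it. A hand-built chain mirroring what RussianAnalyzer.tokenStream() wires up, minus stop-word removal (the example class and sample text are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
import org.apache.lucene.analysis.ru.RussianStemFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class RussianStemChain {
    public static void main(String[] args) throws IOException {
        TokenStream ts = new RussianLetterTokenizer(new StringReader("новые технологии"));
        ts = new LowerCaseFilter(ts);      // replaces the deprecated RussianLowerCaseFilter
        ts = new RussianStemFilter(ts);    // the stemmer now works on Unicode directly

        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());
        }
    }
}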
RussianStemmer.java
@@ -25,47 +25,42 @@ package org.apache.lucene.analysis.ru;
  */
 class RussianStemmer
 {
-    /**
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    private char[] charset;
-
     // positions of RV, R1 and R2 respectively
     private int RV, R1, R2;

     // letters (currently unused letters are commented out)
-    private final static char A = 0,  V = 2,  G = 3,  E = 5,  I = 8,  I_ = 9, L = 11, M = 12;
-    private final static char N = 13, O = 14, S = 17, T = 18, U = 19, X = 21, SH = 24, SHCH = 25;
-    private final static char Y = 27, SOFT = 28, AE = 29, IU = 30, IA = 31;
-    // unused: B = 1, D = 4, ZH = 6, Z = 7, K = 10, P = 15, R = 16, F = 20, TS = 22, CH = 23, HARD = 26
+    private final static char A = '\u0430', V = '\u0432', G = '\u0433', E = '\u0435', I = '\u0438', I_ = '\u0439';
+    private final static char L = '\u043B', M = '\u043C', N = '\u043D', O = '\u043E', S = '\u0441', T = '\u0442';
+    private final static char U = '\u0443', X = '\u0445', SH = '\u0448', SHCH = '\u0449', Y = '\u044B', SOFT = '\u044C';
+    private final static char AE = '\u044D', IU = '\u044E', IA = '\u044F';
+    // unused: B = '\u0431', D = '\u0434', ZH = '\u0436', Z = '\u0437', K = '\u043A', P = '\u043F',
+    //         R = '\u0440', F = '\u0444', TS = '\u0446', CH = '\u0447', HARD = '\u044A'

     // stem definitions
     private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

@@ -256,16 +251,6 @@ class RussianStemmer
         super();
     }

-    /**
-     * RussianStemmer constructor comment.
-     * @deprecated Use {@link #RussianStemmer()} instead.
-     */
-    public RussianStemmer(char[] charset)
-    {
-        super();
-        this.charset = charset;
-    }
-
     /**
      * Adjectival ending is an adjective ending,
      * optionally preceded by participle ending.

@@ -333,7 +318,7 @@ class RussianStemmer
         int stemmingIndex = startIndex;
         for (int j = theEnding.length - 1; j >= 0; j--)
         {
-            if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
+            if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
             {
                 match = false;
                 break;

@@ -451,7 +436,7 @@ class RussianStemmer
     {
         for (int i = 0; i < vowels.length; i++)
         {
-            if (letter == charset[vowels[i]])
+            if (letter == vowels[i])
                 return true;
         }
         return false;

@@ -499,7 +484,7 @@ class RussianStemmer
     private boolean removeI(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == I)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;

@@ -518,7 +503,7 @@ class RussianStemmer
     private boolean removeSoft(StringBuffer stemmingZone)
     {
         if (stemmingZone.length() > 0
-            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
+            && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
         {
             stemmingZone.setLength(stemmingZone.length() - 1);
             return true;

@@ -529,17 +514,6 @@ class RussianStemmer
         }
     }

-    /**
-     * Insert the method's description here.
-     * Creation date: (16/03/2002 10:58:42 PM)
-     * @param newCharset char[]
-     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
-     */
-    public void setCharset(char[] newCharset)
-    {
-        charset = newCharset;
-    }
-
     /**
      * Finds the stem for given Russian word.
      * Creation date: (16/03/2002 3:36:48 PM)

@@ -622,25 +596,13 @@ class RussianStemmer
                 verb1Predessors)
             || findAndRemoveEnding(stemmingZone, verbEndings2);
     }

-    /**
-     * Static method for stemming with different charsets
-     * @deprecated Use {@link #stemWord(String)} instead.
-     */
-    public static String stem(String theWord, char[] charset)
-    {
-        RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(charset);
-        return stemmer.stem(theWord);
-    }
-
     /**
      * Static method for stemming.
      */
     public static String stemWord(String theWord)
     {
         RussianStemmer stemmer = new RussianStemmer();
-        stemmer.setCharset(RussianCharsets.UnicodeRussian);
         return stemmer.stem(theWord);
     }
 }
TestRussianAnalyzer.java
@@ -42,14 +42,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase

     private InputStreamReader sampleUnicode;

-    private Reader inWordsKOI8;
-
-    private Reader sampleKOI8;
-
-    private Reader inWords1251;
-
-    private Reader sample1251;
-
     private File dataDir;

     protected void setUp() throws Exception

@@ -97,76 +89,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
         inWords.close();
         sampleUnicode.close();
     }

-    public void testKOI8() throws IOException
-    {
-        //System.out.println(new java.util.Date());
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
-        // KOI8
-        inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
-        sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
-
-        TokenStream in = ra.tokenStream("all", inWordsKOI8);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-            if (in.incrementToken() == false)
-                break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "KOI8",
-                text.term(),
-                nextSampleToken == false ? null : sampleText.term());
-        }
-        inWordsKOI8.close();
-        sampleKOI8.close();
-    }
-
-    public void test1251() throws IOException
-    {
-        // 1251
-        inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
-        sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
-
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
-        TokenStream in = ra.tokenStream("", inWords1251);
-        RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);
-
-        TermAttribute text = in.getAttribute(TermAttribute.class);
-        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
-
-        for (;;)
-        {
-            if (in.incrementToken() == false)
-                break;
-
-            boolean nextSampleToken = sample.incrementToken();
-            assertEquals(
-                "1251",
-                text.term(),
-                nextSampleToken == false ? null : sampleText.term());
-        }
-
-        inWords1251.close();
-        sample1251.close();
-    }
-
     public void testDigitsInRussianCharset()
     {
TestRussianStem.java
@@ -84,9 +84,8 @@ public class TestRussianStem extends LuceneTestCase
         {
             //if ( (i % 100) == 0 ) System.err.println(i);
             String realStem =
-                RussianStemmer.stem(
-                    (String) words.get(i),
-                    RussianCharsets.UnicodeRussian);
+                RussianStemmer.stemWord(
+                    (String) words.get(i));
             assertEquals("unicode", stems.get(i), realStem);
         }
     }
Deleted Russian test data files:
@@ -1 +0,0 @@
-[вмест][сил][электромагнитн][энерг][имел][представлен][скаж][жрец][древн][египт][знан][хран][тайн][узк][круг][посвящен][всяк][времен][виток][прин][соб][нов][технолог][сам][дел][раскрыва][потаен][знан][прежн][век][говор][нов][информац][станов][доступн][широк][круг][пользовател][тех][случа][сознан][обществ][готов][восприня][воспользова]
@@ -1 +0,0 @@
-[deleted expected-output file stored in a legacy Russian encoding (KOI8/CP1251); its bytes do not render as readable text]
@@ -1,2 +0,0 @@
-Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
-узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.
@@ -1,2 +0,0 @@
-Вместе с тем о силе электромагнитной энергии имели представление еще, скажем, жрецы Древнего Египта. Но знание это хранилось в тайне, в
-узком кругу посвященных. Всякий временной виток, принося с собой новые технологии, на самом деле раскрывает потаенное знание прежних веков. Мы уже говорили, что новая информация становится доступной широкому кругу пользователей только в тех случаях, когда сознание общества готово ее воспринять и воспользоваться ею.