mirror of https://github.com/apache/lucene.git
add GreekAnalyzer, contributed by Panagiotis Astithas (past@ebs.gr)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@164686 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
346ed71644
commit
d650384d4b
|
@ -0,0 +1,222 @@
|
|||
package org.apache.lucene.analysis.el;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Analyzer for the Greek language. Supports an external list of stopwords (words
|
||||
* that will not be indexed at all).
|
||||
* A default set of stopwords is used unless an alternative list is specified.
|
||||
*
|
||||
* @author Panagiotis Astithas, past@ebs.gr
|
||||
*/
|
||||
public final class GreekAnalyzer extends Analyzer
|
||||
{
|
||||
// the letters are indexes to the charset array (see GreekCharsets.java)
|
||||
private static char A = 6;
|
||||
private static char B = 7;
|
||||
private static char G = 8;
|
||||
private static char D = 9;
|
||||
private static char E = 10;
|
||||
private static char Z = 11;
|
||||
private static char H = 12;
|
||||
private static char TH = 13;
|
||||
private static char I = 14;
|
||||
private static char K = 15;
|
||||
private static char L = 16;
|
||||
private static char M = 17;
|
||||
private static char N = 18;
|
||||
private static char KS = 19;
|
||||
private static char O = 20;
|
||||
private static char P = 21;
|
||||
private static char R = 22;
|
||||
private static char S = 24; // skip final sigma
|
||||
private static char T = 25;
|
||||
private static char Y = 26;
|
||||
private static char F = 27;
|
||||
private static char X = 28;
|
||||
private static char PS = 29;
|
||||
private static char W = 30;
|
||||
|
||||
/**
|
||||
* List of typical Greek stopwords.
|
||||
*/
|
||||
private static char[][] GREEK_STOP_WORDS = {
|
||||
{O},
|
||||
{H},
|
||||
{T, O},
|
||||
{O, I},
|
||||
{T, A},
|
||||
{T, O, Y},
|
||||
{T, H, S},
|
||||
{T, W, N},
|
||||
{T, O, N},
|
||||
{T, H, N},
|
||||
{K, A, I},
|
||||
{K, I},
|
||||
{K},
|
||||
{E, I, M, A, I},
|
||||
{E, I, S, A, I},
|
||||
{E, I, N, A, I},
|
||||
{E, I, M, A, S, T, E},
|
||||
{E, I, S, T, E},
|
||||
{S, T, O},
|
||||
{S, T, O, N},
|
||||
{S, T, H},
|
||||
{S, T, H, N},
|
||||
{M, A},
|
||||
{A, L, L, A},
|
||||
{A, P, O},
|
||||
{G, I, A},
|
||||
{P, R, O, S},
|
||||
{M, E},
|
||||
{S, E},
|
||||
{W, S},
|
||||
{P, A, R, A},
|
||||
{A, N, T, I},
|
||||
{K, A, T, A},
|
||||
{M, E, T, A},
|
||||
{TH, A},
|
||||
{N, A},
|
||||
{D, E},
|
||||
{D, E, N},
|
||||
{M, H},
|
||||
{M, H, N},
|
||||
{E, P, I},
|
||||
{E, N, W},
|
||||
{E, A, N},
|
||||
{A, N},
|
||||
{T, O, T, E},
|
||||
{P, O, Y},
|
||||
{P, W, S},
|
||||
{P, O, I, O, S},
|
||||
{P, O, I, A},
|
||||
{P, O, I, O},
|
||||
{P, O, I, O, I},
|
||||
{P, O, I, E, S},
|
||||
{P, O, I, W, N},
|
||||
{P, O, I, O, Y, S},
|
||||
{A, Y, T, O, S},
|
||||
{A, Y, T, H},
|
||||
{A, Y, T, O},
|
||||
{A, Y, T, O, I},
|
||||
{A, Y, T, W, N},
|
||||
{A, Y, T, O, Y, S},
|
||||
{A, Y, T, E, S},
|
||||
{A, Y, T, A},
|
||||
{E, K, E, I, N, O, S},
|
||||
{E, K, E, I, N, H},
|
||||
{E, K, E, I, N, O},
|
||||
{E, K, E, I, N, O, I},
|
||||
{E, K, E, I, N, E, S},
|
||||
{E, K, E, I, N, A},
|
||||
{E, K, E, I, N, W, N},
|
||||
{E, K, E, I, N, O, Y, S},
|
||||
{O, P, W, S},
|
||||
{O, M, W, S},
|
||||
{I, S, W, S},
|
||||
{O, S, O},
|
||||
{O, T, I}
|
||||
};
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Set stopSet = new HashSet();
|
||||
|
||||
/**
|
||||
* Charset for Greek letters.
|
||||
* Represents encoding for 24 lowercase Greek letters.
|
||||
* Predefined charsets can be taken from GreekCharSets class
|
||||
*/
|
||||
private char[] charset;
|
||||
|
||||
public GreekAnalyzer() {
|
||||
charset = GreekCharsets.UnicodeGreek;
|
||||
stopSet = StopFilter.makeStopSet(
|
||||
makeStopWords(GreekCharsets.UnicodeGreek));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
*/
|
||||
public GreekAnalyzer(char[] charset)
|
||||
{
|
||||
this.charset = charset;
|
||||
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public GreekAnalyzer(char[] charset, String[] stopwords)
|
||||
{
|
||||
this.charset = charset;
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
// Takes greek stop words and translates them to a String array, using
|
||||
// the given charset
|
||||
private static String[] makeStopWords(char[] charset)
|
||||
{
|
||||
String[] res = new String[GREEK_STOP_WORDS.length];
|
||||
for (int i = 0; i < res.length; i++)
|
||||
{
|
||||
char[] theStopWord = GREEK_STOP_WORDS[i];
|
||||
// translate the word,using the charset
|
||||
StringBuffer theWord = new StringBuffer();
|
||||
for (int j = 0; j < theStopWord.length; j++)
|
||||
{
|
||||
theWord.append(charset[theStopWord[j]]);
|
||||
}
|
||||
res[i] = theWord.toString();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public GreekAnalyzer(char[] charset, Hashtable stopwords)
|
||||
{
|
||||
this.charset = charset;
|
||||
stopSet = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* GreekLowerCaseFilter and StopFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||
{
|
||||
TokenStream result = new StandardTokenizer(reader);
|
||||
result = new GreekLowerCaseFilter(result, charset);
|
||||
result = new StopFilter(result, stopSet);
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,481 @@
|
|||
package org.apache.lucene.analysis.el;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
|
||||
* for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
|
||||
* Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68) characters,
|
||||
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
|
||||
* the definition of a new charset as well as the required logic in the toLowerCase() method.
|
||||
*
|
||||
* @author Panagiotis Astithas, past@ebs.gr
|
||||
*/
|
||||
public class GreekCharsets
|
||||
{
|
||||
// Unicode Greek charset
|
||||
public static char[] UnicodeGreek = {
|
||||
// lower case
|
||||
'\u0390',
|
||||
'\u03AC',
|
||||
'\u03AD',
|
||||
'\u03AE',
|
||||
'\u03AF',
|
||||
'\u03B0',
|
||||
'\u03B1',
|
||||
'\u03B2',
|
||||
'\u03B3',
|
||||
'\u03B4',
|
||||
'\u03B5',
|
||||
'\u03B6',
|
||||
'\u03B7',
|
||||
'\u03B8',
|
||||
'\u03B9',
|
||||
'\u03BA',
|
||||
'\u03BB',
|
||||
'\u03BC',
|
||||
'\u03BD',
|
||||
'\u03BE',
|
||||
'\u03BF',
|
||||
'\u03C0',
|
||||
'\u03C1',
|
||||
'\u03C2',
|
||||
'\u03C3',
|
||||
'\u03C4',
|
||||
'\u03C5',
|
||||
'\u03C6',
|
||||
'\u03C7',
|
||||
'\u03C8',
|
||||
'\u03C9',
|
||||
'\u03CA',
|
||||
'\u03CB',
|
||||
'\u03CC',
|
||||
'\u03CD',
|
||||
'\u03CE',
|
||||
// upper case
|
||||
'\u0386',
|
||||
'\u0388',
|
||||
'\u0389',
|
||||
'\u038A',
|
||||
'\u038C',
|
||||
'\u038E',
|
||||
'\u038F',
|
||||
'\u0391',
|
||||
'\u0392',
|
||||
'\u0393',
|
||||
'\u0394',
|
||||
'\u0395',
|
||||
'\u0396',
|
||||
'\u0397',
|
||||
'\u0398',
|
||||
'\u0399',
|
||||
'\u039A',
|
||||
'\u039B',
|
||||
'\u039C',
|
||||
'\u039D',
|
||||
'\u039E',
|
||||
'\u039F',
|
||||
'\u03A0',
|
||||
'\u03A1',
|
||||
'\u03A3',
|
||||
'\u03A4',
|
||||
'\u03A5',
|
||||
'\u03A6',
|
||||
'\u03A7',
|
||||
'\u03A8',
|
||||
'\u03A9',
|
||||
'\u03AA',
|
||||
'\u03AB'
|
||||
};
|
||||
|
||||
// ISO-8859-7 charset (ELOT-928)
|
||||
public static char[] ISO = {
|
||||
// lower case
|
||||
0xc0,
|
||||
0xdc,
|
||||
0xdd,
|
||||
0xde,
|
||||
0xdf,
|
||||
0xe0,
|
||||
0xe1,
|
||||
0xe2,
|
||||
0xe3,
|
||||
0xe4,
|
||||
0xe5,
|
||||
0xe6,
|
||||
0xe7,
|
||||
0xe8,
|
||||
0xe9,
|
||||
0xea,
|
||||
0xeb,
|
||||
0xec,
|
||||
0xed,
|
||||
0xee,
|
||||
0xef,
|
||||
0xf0,
|
||||
0xf1,
|
||||
0xf2,
|
||||
0xf3,
|
||||
0xf4,
|
||||
0xf5,
|
||||
0xf6,
|
||||
0xf7,
|
||||
0xf8,
|
||||
0xf9,
|
||||
0xfa,
|
||||
0xfb,
|
||||
0xfc,
|
||||
0xfd,
|
||||
0xfe,
|
||||
// upper case
|
||||
0xb6,
|
||||
0xb8,
|
||||
0xb9,
|
||||
0xba,
|
||||
0xbc,
|
||||
0xbe,
|
||||
0xbf,
|
||||
0xc1,
|
||||
0xc2,
|
||||
0xc3,
|
||||
0xc4,
|
||||
0xc5,
|
||||
0xc6,
|
||||
0xc7,
|
||||
0xc8,
|
||||
0xc9,
|
||||
0xca,
|
||||
0xcb,
|
||||
0xcc,
|
||||
0xcd,
|
||||
0xce,
|
||||
0xcf,
|
||||
0xd0,
|
||||
0xd1,
|
||||
0xd3,
|
||||
0xd4,
|
||||
0xd5,
|
||||
0xd6,
|
||||
0xd7,
|
||||
0xd8,
|
||||
0xd9,
|
||||
0xda,
|
||||
0xdb
|
||||
};
|
||||
|
||||
// CP1253 charset
|
||||
public static char[] CP1253 = {
|
||||
// lower case
|
||||
0xc0,
|
||||
0xdc,
|
||||
0xdd,
|
||||
0xde,
|
||||
0xdf,
|
||||
0xe0,
|
||||
0xe1,
|
||||
0xe2,
|
||||
0xe3,
|
||||
0xe4,
|
||||
0xe5,
|
||||
0xe6,
|
||||
0xe7,
|
||||
0xe8,
|
||||
0xe9,
|
||||
0xea,
|
||||
0xeb,
|
||||
0xec,
|
||||
0xed,
|
||||
0xee,
|
||||
0xef,
|
||||
0xf0,
|
||||
0xf1,
|
||||
0xf2,
|
||||
0xf3,
|
||||
0xf4,
|
||||
0xf5,
|
||||
0xf6,
|
||||
0xf7,
|
||||
0xf8,
|
||||
0xf9,
|
||||
0xfa,
|
||||
0xfb,
|
||||
0xfc,
|
||||
0xfd,
|
||||
0xfe,
|
||||
// upper case
|
||||
0xa2,
|
||||
0xb8,
|
||||
0xb9,
|
||||
0xba,
|
||||
0xbc,
|
||||
0xbe,
|
||||
0xbf,
|
||||
0xc1,
|
||||
0xc2,
|
||||
0xc3,
|
||||
0xc4,
|
||||
0xc5,
|
||||
0xc6,
|
||||
0xc7,
|
||||
0xc8,
|
||||
0xc9,
|
||||
0xca,
|
||||
0xcb,
|
||||
0xcc,
|
||||
0xcd,
|
||||
0xce,
|
||||
0xcf,
|
||||
0xd0,
|
||||
0xd1,
|
||||
0xd3,
|
||||
0xd4,
|
||||
0xd5,
|
||||
0xd6,
|
||||
0xd7,
|
||||
0xd8,
|
||||
0xd9,
|
||||
0xda,
|
||||
0xdb
|
||||
};
|
||||
|
||||
public static char toLowerCase(char letter, char[] charset)
|
||||
{
|
||||
if (charset == UnicodeGreek) {
|
||||
// First deal with lower case, not accented letters
|
||||
if (letter >= '\u03B1' && letter <= '\u03C9')
|
||||
{
|
||||
// Special case 'small final sigma', where we return 'small sigma'
|
||||
if (letter == '\u03C2') {
|
||||
return '\u03C3';
|
||||
} else {
|
||||
return letter;
|
||||
}
|
||||
}
|
||||
// Then deal with lower case, accented letters
|
||||
// alpha with acute
|
||||
if (letter == '\u03AC') {
|
||||
return '\u03B1';
|
||||
}
|
||||
// epsilon with acute
|
||||
if (letter == '\u03AD') {
|
||||
return '\u03B5';
|
||||
}
|
||||
// eta with acute
|
||||
if (letter == '\u03AE') {
|
||||
return '\u03B7';
|
||||
}
|
||||
// iota with acute, iota with diaeresis, iota with acute and diaeresis
|
||||
if (letter == '\u03AF' || letter == '\u03CA' || letter == '\u0390') {
|
||||
return '\u03B9';
|
||||
}
|
||||
// upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
|
||||
if (letter == '\u03CD' || letter == '\u03CB' || letter == '\u03B0') {
|
||||
return '\u03C5';
|
||||
}
|
||||
// omicron with acute
|
||||
if (letter == '\u03CC') {
|
||||
return '\u03BF';
|
||||
}
|
||||
// omega with acute
|
||||
if (letter == '\u03CE') {
|
||||
return '\u03C9';
|
||||
}
|
||||
// After that, deal with upper case, not accented letters
|
||||
if (letter >= '\u0391' && letter <= '\u03A9')
|
||||
{
|
||||
return (char) (letter + 32);
|
||||
}
|
||||
// Finally deal with upper case, accented letters
|
||||
// alpha with acute
|
||||
if (letter == '\u0386') {
|
||||
return '\u03B1';
|
||||
}
|
||||
// epsilon with acute
|
||||
if (letter == '\u0388') {
|
||||
return '\u03B5';
|
||||
}
|
||||
// eta with acute
|
||||
if (letter == '\u0389') {
|
||||
return '\u03B7';
|
||||
}
|
||||
// iota with acute, iota with diaeresis
|
||||
if (letter == '\u038A' || letter == '\u03AA') {
|
||||
return '\u03B9';
|
||||
}
|
||||
// upsilon with acute, upsilon with diaeresis
|
||||
if (letter == '\u038E' || letter == '\u03AB') {
|
||||
return '\u03C5';
|
||||
}
|
||||
// omicron with acute
|
||||
if (letter == '\u038C') {
|
||||
return '\u03BF';
|
||||
}
|
||||
// omega with acute
|
||||
if (letter == '\u038F') {
|
||||
return '\u03C9';
|
||||
}
|
||||
} else if (charset == ISO) {
|
||||
// First deal with lower case, not accented letters
|
||||
if (letter >= 0xe1 && letter <= 0xf9)
|
||||
{
|
||||
// Special case 'small final sigma', where we return 'small sigma'
|
||||
if (letter == 0xf2) {
|
||||
return 0xf3;
|
||||
} else {
|
||||
return letter;
|
||||
}
|
||||
}
|
||||
// Then deal with lower case, accented letters
|
||||
// alpha with acute
|
||||
if (letter == 0xdc) {
|
||||
return 0xe1;
|
||||
}
|
||||
// epsilon with acute
|
||||
if (letter == 0xdd) {
|
||||
return 0xe5;
|
||||
}
|
||||
// eta with acute
|
||||
if (letter == 0xde) {
|
||||
return 0xe7;
|
||||
}
|
||||
// iota with acute, iota with diaeresis, iota with acute and diaeresis
|
||||
if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
|
||||
return '\u03B9';
|
||||
}
|
||||
// upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
|
||||
if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
|
||||
return 0xf5;
|
||||
}
|
||||
// omicron with acute
|
||||
if (letter == 0xfc) {
|
||||
return 0xef;
|
||||
}
|
||||
// omega with acute
|
||||
if (letter == 0xfe) {
|
||||
return 0xf9;
|
||||
}
|
||||
// After that, deal with upper case, not accented letters
|
||||
if (letter >= 0xc1 && letter <= 0xd9) {
|
||||
return (char) (letter + 32);
|
||||
}
|
||||
// Finally deal with upper case, accented letters
|
||||
// alpha with acute
|
||||
if (letter == 0xb6) {
|
||||
return 0xe1;
|
||||
}
|
||||
// epsilon with acute
|
||||
if (letter == 0xb8) {
|
||||
return 0xe5;
|
||||
}
|
||||
// eta with acute
|
||||
if (letter == 0xb9) {
|
||||
return 0xe7;
|
||||
}
|
||||
// iota with acute, iota with diaeresis
|
||||
if (letter == 0xba || letter == 0xda) {
|
||||
return 0xe9;
|
||||
}
|
||||
// upsilon with acute, upsilon with diaeresis
|
||||
if (letter == 0xbe || letter == 0xdb) {
|
||||
return 0xf5;
|
||||
}
|
||||
// omicron with acute
|
||||
if (letter == 0xbc) {
|
||||
return 0xef;
|
||||
}
|
||||
// omega with acute
|
||||
if (letter == 0xbf) {
|
||||
return 0xf9;
|
||||
}
|
||||
} else if (charset == CP1253) {
|
||||
// First deal with lower case, not accented letters
|
||||
if (letter >= 0xe1 && letter <= 0xf9)
|
||||
{
|
||||
// Special case 'small final sigma', where we return 'small sigma'
|
||||
if (letter == 0xf2) {
|
||||
return 0xf3;
|
||||
} else {
|
||||
return letter;
|
||||
}
|
||||
}
|
||||
// Then deal with lower case, accented letters
|
||||
// alpha with acute
|
||||
if (letter == 0xdc) {
|
||||
return 0xe1;
|
||||
}
|
||||
// epsilon with acute
|
||||
if (letter == 0xdd) {
|
||||
return 0xe5;
|
||||
}
|
||||
// eta with acute
|
||||
if (letter == 0xde) {
|
||||
return 0xe7;
|
||||
}
|
||||
// iota with acute, iota with diaeresis, iota with acute and diaeresis
|
||||
if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
|
||||
return '\u03B9';
|
||||
}
|
||||
// upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
|
||||
if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
|
||||
return 0xf5;
|
||||
}
|
||||
// omicron with acute
|
||||
if (letter == 0xfc) {
|
||||
return 0xef;
|
||||
}
|
||||
// omega with acute
|
||||
if (letter == 0xfe) {
|
||||
return 0xf9;
|
||||
}
|
||||
// After that, deal with upper case, not accented letters
|
||||
if (letter >= 0xc1 && letter <= 0xd9) {
|
||||
return (char) (letter + 32);
|
||||
}
|
||||
// Finally deal with upper case, accented letters
|
||||
// alpha with acute
|
||||
if (letter == 0xa2) {
|
||||
return 0xe1;
|
||||
}
|
||||
// epsilon with acute
|
||||
if (letter == 0xb8) {
|
||||
return 0xe5;
|
||||
}
|
||||
// eta with acute
|
||||
if (letter == 0xb9) {
|
||||
return 0xe7;
|
||||
}
|
||||
// iota with acute, iota with diaeresis
|
||||
if (letter == 0xba || letter == 0xda) {
|
||||
return 0xe9;
|
||||
}
|
||||
// upsilon with acute, upsilon with diaeresis
|
||||
if (letter == 0xbe || letter == 0xdb) {
|
||||
return 0xf5;
|
||||
}
|
||||
// omicron with acute
|
||||
if (letter == 0xbc) {
|
||||
return 0xef;
|
||||
}
|
||||
// omega with acute
|
||||
if (letter == 0xbf) {
|
||||
return 0xf9;
|
||||
}
|
||||
}
|
||||
|
||||
return Character.toLowerCase(letter);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package org.apache.lucene.analysis.el;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Normalizes token text to lower case, analyzing given ("greek") charset.
|
||||
*
|
||||
* @author Panagiotis Astithas, past@ebs.gr
|
||||
*/
|
||||
public final class GreekLowerCaseFilter extends TokenFilter
|
||||
{
|
||||
char[] charset;
|
||||
|
||||
public GreekLowerCaseFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
public final Token next() throws java.io.IOException
|
||||
{
|
||||
Token t = input.next();
|
||||
|
||||
if (t == null)
|
||||
return null;
|
||||
|
||||
String txt = t.termText();
|
||||
|
||||
char[] chArray = txt.toCharArray();
|
||||
for (int i = 0; i < chArray.length; i++)
|
||||
{
|
||||
chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
|
||||
}
|
||||
|
||||
String newTxt = new String(chArray);
|
||||
// create new token
|
||||
Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
|
||||
|
||||
return newToken;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,75 @@
|
|||
package org.apache.lucene.analysis.el;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
||||
/**
|
||||
* A unit test class for verifying the correct operation of the GreekAnalyzer.
|
||||
*
|
||||
* @author past
|
||||
*/
|
||||
public class GreekAnalyzerTest extends TestCase {
|
||||
|
||||
/**
|
||||
* A helper method copied from org.apache.lucene.analysis.TestAnalyzers.
|
||||
*
|
||||
* @param a the Analyzer to test
|
||||
* @param input an input String to analyze
|
||||
* @param output a String[] with the results of the analysis
|
||||
* @throws Exception in case an error occurs
|
||||
*/
|
||||
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
for (int i=0; i<output.length; i++) {
|
||||
Token t = ts.next();
|
||||
assertNotNull(t);
|
||||
assertEquals(t.termText(), output[i]);
|
||||
}
|
||||
assertNull(ts.next());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the analysis of various greek strings.
|
||||
*
|
||||
* @throws Exception in case an error occurs
|
||||
*/
|
||||
public void testAnalyzer() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer();
|
||||
// Verify the correct analysis of capitals and small accented letters
|
||||
assertAnalyzesTo(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
|
||||
new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
|
||||
"\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
|
||||
// Verify the correct analysis of small letters with diaeresis and the elimination
|
||||
// of punctuation marks
|
||||
assertAnalyzesTo(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
|
||||
// Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
|
||||
// as well as the elimination of stop words
|
||||
assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue