add GreekAnalyzer, contributed by Panagiotis Astithas (past@ebs.gr)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@164686 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2005-04-25 23:23:37 +00:00
parent 346ed71644
commit d650384d4b
4 changed files with 837 additions and 0 deletions

View File

@ -0,0 +1,222 @@
package org.apache.lucene.analysis.el;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
/**
* Analyzer for the Greek language. Supports an external list of stopwords (words
* that will not be indexed at all).
* A default set of stopwords is used unless an alternative list is specified.
*
* @author Panagiotis Astithas, past@ebs.gr
*/
public final class GreekAnalyzer extends Analyzer
{
// the letters are indexes to the charset array (see GreekCharsets.java)
private static char A = 6;
private static char B = 7;
private static char G = 8;
private static char D = 9;
private static char E = 10;
private static char Z = 11;
private static char H = 12;
private static char TH = 13;
private static char I = 14;
private static char K = 15;
private static char L = 16;
private static char M = 17;
private static char N = 18;
private static char KS = 19;
private static char O = 20;
private static char P = 21;
private static char R = 22;
private static char S = 24; // skip final sigma
private static char T = 25;
private static char Y = 26;
private static char F = 27;
private static char X = 28;
private static char PS = 29;
private static char W = 30;
/**
* List of typical Greek stopwords.
*/
private static char[][] GREEK_STOP_WORDS = {
{O},
{H},
{T, O},
{O, I},
{T, A},
{T, O, Y},
{T, H, S},
{T, W, N},
{T, O, N},
{T, H, N},
{K, A, I},
{K, I},
{K},
{E, I, M, A, I},
{E, I, S, A, I},
{E, I, N, A, I},
{E, I, M, A, S, T, E},
{E, I, S, T, E},
{S, T, O},
{S, T, O, N},
{S, T, H},
{S, T, H, N},
{M, A},
{A, L, L, A},
{A, P, O},
{G, I, A},
{P, R, O, S},
{M, E},
{S, E},
{W, S},
{P, A, R, A},
{A, N, T, I},
{K, A, T, A},
{M, E, T, A},
{TH, A},
{N, A},
{D, E},
{D, E, N},
{M, H},
{M, H, N},
{E, P, I},
{E, N, W},
{E, A, N},
{A, N},
{T, O, T, E},
{P, O, Y},
{P, W, S},
{P, O, I, O, S},
{P, O, I, A},
{P, O, I, O},
{P, O, I, O, I},
{P, O, I, E, S},
{P, O, I, W, N},
{P, O, I, O, Y, S},
{A, Y, T, O, S},
{A, Y, T, H},
{A, Y, T, O},
{A, Y, T, O, I},
{A, Y, T, W, N},
{A, Y, T, O, Y, S},
{A, Y, T, E, S},
{A, Y, T, A},
{E, K, E, I, N, O, S},
{E, K, E, I, N, H},
{E, K, E, I, N, O},
{E, K, E, I, N, O, I},
{E, K, E, I, N, E, S},
{E, K, E, I, N, A},
{E, K, E, I, N, W, N},
{E, K, E, I, N, O, Y, S},
{O, P, W, S},
{O, M, W, S},
{I, S, W, S},
{O, S, O},
{O, T, I}
};
/**
* Contains the stopwords used with the StopFilter.
*/
private Set stopSet = new HashSet();
/**
* Charset for Greek letters.
* Represents encoding for 24 lowercase Greek letters.
* Predefined charsets can be taken from GreekCharSets class
*/
private char[] charset;
public GreekAnalyzer() {
charset = GreekCharsets.UnicodeGreek;
stopSet = StopFilter.makeStopSet(
makeStopWords(GreekCharsets.UnicodeGreek));
}
/**
* Builds an analyzer.
*/
public GreekAnalyzer(char[] charset)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
}
/**
* Builds an analyzer with the given stop words.
*/
public GreekAnalyzer(char[] charset, String[] stopwords)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(stopwords);
}
// Takes greek stop words and translates them to a String array, using
// the given charset
private static String[] makeStopWords(char[] charset)
{
String[] res = new String[GREEK_STOP_WORDS.length];
for (int i = 0; i < res.length; i++)
{
char[] theStopWord = GREEK_STOP_WORDS[i];
// translate the word,using the charset
StringBuffer theWord = new StringBuffer();
for (int j = 0; j < theStopWord.length; j++)
{
theWord.append(charset[theStopWord[j]]);
}
res[i] = theWord.toString();
}
return res;
}
/**
* Builds an analyzer with the given stop words.
*/
public GreekAnalyzer(char[] charset, Hashtable stopwords)
{
this.charset = charset;
stopSet = new HashSet(stopwords.keySet());
}
/**
* Creates a TokenStream which tokenizes all the text in the provided Reader.
*
* @return A TokenStream build from a StandardTokenizer filtered with
* GreekLowerCaseFilter and StopFilter
*/
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream result = new StandardTokenizer(reader);
result = new GreekLowerCaseFilter(result, charset);
result = new StopFilter(result, stopSet);
return result;
}
}

View File

@ -0,0 +1,481 @@
package org.apache.lucene.analysis.el;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* GreekCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
* for greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
* Each encoding scheme contains lowercase (positions 0-35) and uppercase (position 36-68) characters,
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
* the definition of a new charset as well as the required logic in the toLowerCase() method.
*
* @author Panagiotis Astithas, past@ebs.gr
*/
public class GreekCharsets
{
// Unicode Greek charset
public static char[] UnicodeGreek = {
// lower case
'\u0390',
'\u03AC',
'\u03AD',
'\u03AE',
'\u03AF',
'\u03B0',
'\u03B1',
'\u03B2',
'\u03B3',
'\u03B4',
'\u03B5',
'\u03B6',
'\u03B7',
'\u03B8',
'\u03B9',
'\u03BA',
'\u03BB',
'\u03BC',
'\u03BD',
'\u03BE',
'\u03BF',
'\u03C0',
'\u03C1',
'\u03C2',
'\u03C3',
'\u03C4',
'\u03C5',
'\u03C6',
'\u03C7',
'\u03C8',
'\u03C9',
'\u03CA',
'\u03CB',
'\u03CC',
'\u03CD',
'\u03CE',
// upper case
'\u0386',
'\u0388',
'\u0389',
'\u038A',
'\u038C',
'\u038E',
'\u038F',
'\u0391',
'\u0392',
'\u0393',
'\u0394',
'\u0395',
'\u0396',
'\u0397',
'\u0398',
'\u0399',
'\u039A',
'\u039B',
'\u039C',
'\u039D',
'\u039E',
'\u039F',
'\u03A0',
'\u03A1',
'\u03A3',
'\u03A4',
'\u03A5',
'\u03A6',
'\u03A7',
'\u03A8',
'\u03A9',
'\u03AA',
'\u03AB'
};
// ISO-8859-7 charset (ELOT-928)
public static char[] ISO = {
// lower case
0xc0,
0xdc,
0xdd,
0xde,
0xdf,
0xe0,
0xe1,
0xe2,
0xe3,
0xe4,
0xe5,
0xe6,
0xe7,
0xe8,
0xe9,
0xea,
0xeb,
0xec,
0xed,
0xee,
0xef,
0xf0,
0xf1,
0xf2,
0xf3,
0xf4,
0xf5,
0xf6,
0xf7,
0xf8,
0xf9,
0xfa,
0xfb,
0xfc,
0xfd,
0xfe,
// upper case
0xb6,
0xb8,
0xb9,
0xba,
0xbc,
0xbe,
0xbf,
0xc1,
0xc2,
0xc3,
0xc4,
0xc5,
0xc6,
0xc7,
0xc8,
0xc9,
0xca,
0xcb,
0xcc,
0xcd,
0xce,
0xcf,
0xd0,
0xd1,
0xd3,
0xd4,
0xd5,
0xd6,
0xd7,
0xd8,
0xd9,
0xda,
0xdb
};
// CP1253 charset
public static char[] CP1253 = {
// lower case
0xc0,
0xdc,
0xdd,
0xde,
0xdf,
0xe0,
0xe1,
0xe2,
0xe3,
0xe4,
0xe5,
0xe6,
0xe7,
0xe8,
0xe9,
0xea,
0xeb,
0xec,
0xed,
0xee,
0xef,
0xf0,
0xf1,
0xf2,
0xf3,
0xf4,
0xf5,
0xf6,
0xf7,
0xf8,
0xf9,
0xfa,
0xfb,
0xfc,
0xfd,
0xfe,
// upper case
0xa2,
0xb8,
0xb9,
0xba,
0xbc,
0xbe,
0xbf,
0xc1,
0xc2,
0xc3,
0xc4,
0xc5,
0xc6,
0xc7,
0xc8,
0xc9,
0xca,
0xcb,
0xcc,
0xcd,
0xce,
0xcf,
0xd0,
0xd1,
0xd3,
0xd4,
0xd5,
0xd6,
0xd7,
0xd8,
0xd9,
0xda,
0xdb
};
public static char toLowerCase(char letter, char[] charset)
{
if (charset == UnicodeGreek) {
// First deal with lower case, not accented letters
if (letter >= '\u03B1' && letter <= '\u03C9')
{
// Special case 'small final sigma', where we return 'small sigma'
if (letter == '\u03C2') {
return '\u03C3';
} else {
return letter;
}
}
// Then deal with lower case, accented letters
// alpha with acute
if (letter == '\u03AC') {
return '\u03B1';
}
// epsilon with acute
if (letter == '\u03AD') {
return '\u03B5';
}
// eta with acute
if (letter == '\u03AE') {
return '\u03B7';
}
// iota with acute, iota with diaeresis, iota with acute and diaeresis
if (letter == '\u03AF' || letter == '\u03CA' || letter == '\u0390') {
return '\u03B9';
}
// upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
if (letter == '\u03CD' || letter == '\u03CB' || letter == '\u03B0') {
return '\u03C5';
}
// omicron with acute
if (letter == '\u03CC') {
return '\u03BF';
}
// omega with acute
if (letter == '\u03CE') {
return '\u03C9';
}
// After that, deal with upper case, not accented letters
if (letter >= '\u0391' && letter <= '\u03A9')
{
return (char) (letter + 32);
}
// Finally deal with upper case, accented letters
// alpha with acute
if (letter == '\u0386') {
return '\u03B1';
}
// epsilon with acute
if (letter == '\u0388') {
return '\u03B5';
}
// eta with acute
if (letter == '\u0389') {
return '\u03B7';
}
// iota with acute, iota with diaeresis
if (letter == '\u038A' || letter == '\u03AA') {
return '\u03B9';
}
// upsilon with acute, upsilon with diaeresis
if (letter == '\u038E' || letter == '\u03AB') {
return '\u03C5';
}
// omicron with acute
if (letter == '\u038C') {
return '\u03BF';
}
// omega with acute
if (letter == '\u038F') {
return '\u03C9';
}
} else if (charset == ISO) {
// First deal with lower case, not accented letters
if (letter >= 0xe1 && letter <= 0xf9)
{
// Special case 'small final sigma', where we return 'small sigma'
if (letter == 0xf2) {
return 0xf3;
} else {
return letter;
}
}
// Then deal with lower case, accented letters
// alpha with acute
if (letter == 0xdc) {
return 0xe1;
}
// epsilon with acute
if (letter == 0xdd) {
return 0xe5;
}
// eta with acute
if (letter == 0xde) {
return 0xe7;
}
// iota with acute, iota with diaeresis, iota with acute and diaeresis
if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
return '\u03B9';
}
// upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
return 0xf5;
}
// omicron with acute
if (letter == 0xfc) {
return 0xef;
}
// omega with acute
if (letter == 0xfe) {
return 0xf9;
}
// After that, deal with upper case, not accented letters
if (letter >= 0xc1 && letter <= 0xd9) {
return (char) (letter + 32);
}
// Finally deal with upper case, accented letters
// alpha with acute
if (letter == 0xb6) {
return 0xe1;
}
// epsilon with acute
if (letter == 0xb8) {
return 0xe5;
}
// eta with acute
if (letter == 0xb9) {
return 0xe7;
}
// iota with acute, iota with diaeresis
if (letter == 0xba || letter == 0xda) {
return 0xe9;
}
// upsilon with acute, upsilon with diaeresis
if (letter == 0xbe || letter == 0xdb) {
return 0xf5;
}
// omicron with acute
if (letter == 0xbc) {
return 0xef;
}
// omega with acute
if (letter == 0xbf) {
return 0xf9;
}
} else if (charset == CP1253) {
// First deal with lower case, not accented letters
if (letter >= 0xe1 && letter <= 0xf9)
{
// Special case 'small final sigma', where we return 'small sigma'
if (letter == 0xf2) {
return 0xf3;
} else {
return letter;
}
}
// Then deal with lower case, accented letters
// alpha with acute
if (letter == 0xdc) {
return 0xe1;
}
// epsilon with acute
if (letter == 0xdd) {
return 0xe5;
}
// eta with acute
if (letter == 0xde) {
return 0xe7;
}
// iota with acute, iota with diaeresis, iota with acute and diaeresis
if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
return '\u03B9';
}
// upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
return 0xf5;
}
// omicron with acute
if (letter == 0xfc) {
return 0xef;
}
// omega with acute
if (letter == 0xfe) {
return 0xf9;
}
// After that, deal with upper case, not accented letters
if (letter >= 0xc1 && letter <= 0xd9) {
return (char) (letter + 32);
}
// Finally deal with upper case, accented letters
// alpha with acute
if (letter == 0xa2) {
return 0xe1;
}
// epsilon with acute
if (letter == 0xb8) {
return 0xe5;
}
// eta with acute
if (letter == 0xb9) {
return 0xe7;
}
// iota with acute, iota with diaeresis
if (letter == 0xba || letter == 0xda) {
return 0xe9;
}
// upsilon with acute, upsilon with diaeresis
if (letter == 0xbe || letter == 0xdb) {
return 0xf5;
}
// omicron with acute
if (letter == 0xbc) {
return 0xef;
}
// omega with acute
if (letter == 0xbf) {
return 0xf9;
}
}
return Character.toLowerCase(letter);
}
}

View File

@ -0,0 +1,59 @@
package org.apache.lucene.analysis.el;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* Normalizes token text to lower case, analyzing given ("greek") charset.
*
* @author Panagiotis Astithas, past@ebs.gr
*/
public final class GreekLowerCaseFilter extends TokenFilter
{
char[] charset;
public GreekLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
}
public final Token next() throws java.io.IOException
{
Token t = input.next();
if (t == null)
return null;
String txt = t.termText();
char[] chArray = txt.toCharArray();
for (int i = 0; i < chArray.length; i++)
{
chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
}
String newTxt = new String(chArray);
// create new token
Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
return newToken;
}
}

View File

@ -0,0 +1,75 @@
package org.apache.lucene.analysis.el;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import junit.framework.TestCase;
/**
* A unit test class for verifying the correct operation of the GreekAnalyzer.
*
* @author past
*/
public class GreekAnalyzerTest extends TestCase {
/**
* A helper method copied from org.apache.lucene.analysis.TestAnalyzers.
*
* @param a the Analyzer to test
* @param input an input String to analyze
* @param output a String[] with the results of the analysis
* @throws Exception in case an error occurs
*/
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
for (int i=0; i<output.length; i++) {
Token t = ts.next();
assertNotNull(t);
assertEquals(t.termText(), output[i]);
}
assertNull(ts.next());
ts.close();
}
/**
* Test the analysis of various greek strings.
*
* @throws Exception in case an error occurs
*/
public void testAnalyzer() throws Exception {
Analyzer a = new GreekAnalyzer();
// Verify the correct analysis of capitals and small accented letters
assertAnalyzesTo(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
"\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
assertAnalyzesTo(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
// Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
// as well as the elimination of stop words
assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
}
}