LUCENE-2463: Improve Greek analysis

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@945090 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-17 11:28:04 +00:00
parent 97a95c3a6a
commit acbf053b7c
12 changed files with 1774 additions and 188 deletions

View File

@ -157,6 +157,9 @@ New features
* LUCENE-2393: The HighFreqTerms tool (in misc) can now optionally
also include the total termFreq. (Tom Burton-West via Mike McCandless)
* LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words
when Version is set to 3.1 or higher. (Robert Muir)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -16,9 +16,7 @@ package org.apache.lucene.analysis.el;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
@ -28,8 +26,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
@ -45,7 +43,7 @@ import java.util.Set;
* <p>You must specify the required {@link Version}
* compatibility when creating GreekAnalyzer:
* <ul>
* <li> As of 3.1, StandardFilter is used by default.
* <li> As of 3.1, StandardFilter and GreekStemmer are used by default.
* <li> As of 2.9, StopFilter preserves position
* increments
* </ul>
@ -53,72 +51,73 @@ import java.util.Set;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class GreekAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Greek stopwords.
*/
private static final String[] GREEK_STOP_WORDS = {
"ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και",
"κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
"στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
"παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
"επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
"ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
"αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
"εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
"ισωσ", "οσο", "οτι"
};
public final class GreekAnalyzer extends StopwordAnalyzerBase {
/** File containing default Greek stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns a set of default Greek-stopwords
* @return a set of default Greek-stopwords
*/
public static final Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET;
}
/**
* Returns a set of default Greek-stopwords
* @return a set of default Greek-stopwords
*/
public static final Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET;
}
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
}
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET;
public GreekAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
// Loads the default stopword set once, when the holder class is initialized.
static {
  try {
    DEFAULT_SET = loadStopwordSet(false, GreekAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
  } catch (IOException ex) {
    // default set should always be present as it is part of the
    // distribution (JAR); keep the original exception as the cause so the
    // underlying I/O failure is not lost when this surfaces
    throw new RuntimeException("Unable to load default stopword set", ex);
  }
}
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
super(matchVersion, stopwords);
}
/**
* Builds an analyzer with the default stop words.
* @param matchVersion Lucene compatibility version,
* See <a href="#version">above</a>
*/
public GreekAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
/**
* Builds an analyzer with the given stop words.
* @param stopwords Array of stopwords to use.
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
*/
@Deprecated
public GreekAnalyzer(Version matchVersion, String... stopwords)
{
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
* Builds an analyzer with the given stop words.
* <p>
* <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
* {@link GreekLowerCaseFilter} for best results.
*
* @param matchVersion Lucene compatibility version,
* See <a href="#version">above</a>
* @param stopwords a stopword set
*/
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
super(matchVersion, stopwords);
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
*/
@Deprecated
public GreekAnalyzer(Version matchVersion, Map<?,?> stopwords)
{
this(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
* @param stopwords Array of stopwords to use.
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
*/
@Deprecated
public GreekAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
*/
@Deprecated
public GreekAnalyzer(Version matchVersion, Map<?,?> stopwords) {
this(matchVersion, stopwords.keySet());
}
/**
* Creates
@ -127,16 +126,19 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase
*
* @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link GreekLowerCaseFilter}, {@link StandardFilter} and
* {@link StopFilter}
* {@link GreekLowerCaseFilter}, {@link StandardFilter},
* {@link StopFilter}, and {@link GreekStemFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new GreekLowerCaseFilter(source);
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new StandardFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
/*
 * Builds the analysis chain: StandardTokenizer -> GreekLowerCaseFilter
 * -> (3.1+) StandardFilter -> StopFilter -> (3.1+) GreekStemFilter.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName,
    Reader reader) {
  final Tokenizer source = new StandardTokenizer(matchVersion, reader);
  // Both version-gated filters depend on the same check.
  final boolean atLeast31 = matchVersion.onOrAfter(Version.LUCENE_31);

  TokenStream chain = new GreekLowerCaseFilter(matchVersion, source);
  if (atLeast31) {
    chain = new StandardFilter(chain);
  }
  chain = new StopFilter(matchVersion, chain, stopwords);
  if (atLeast31) {
    // stemming runs last, after stopword removal
    chain = new GreekStemFilter(chain);
  }
  return new TokenStreamComponents(source, chain);
}
}

View File

@ -20,97 +20,115 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
* Normalizes token text to lower case, removes some Greek diacritics,
* and standardizes final sigma to sigma.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating GreekLowerCaseFilter:
* <ul>
* <li> As of 3.1, supplementary characters are properly lowercased.
* </ul>
*/
public final class GreekLowerCaseFilter extends TokenFilter
{
private TermAttribute termAtt;
public final class GreekLowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final CharacterUtils charUtils;
public GreekLowerCaseFilter(TokenStream in)
{
super(in);
termAtt = addAttribute(TermAttribute.class);
/** @deprecated Use {@link #GreekLowerCaseFilter(Version, TokenStream)} instead. */
@Deprecated
public GreekLowerCaseFilter(TokenStream in) {
this(Version.LUCENE_30, in);
}
/**
* Create a GreekLowerCaseFilter that normalizes Greek token text.
*
* @param matchVersion Lucene compatibility version,
* See <a href="#version">above</a>
* @param in TokenStream to filter
*/
public GreekLowerCaseFilter(Version matchVersion, TokenStream in) {
super(in);
this.charUtils = CharacterUtils.getInstance(matchVersion);
}
/*
 * Advances the wrapped stream and rewrites the current term in place,
 * replacing each code point with the result of lowerCase(int).
 *
 * Returns false when the wrapped stream is exhausted, true otherwise.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final char[] buffer = termAtt.buffer();
  final int length = termAtt.length();
  int pos = 0;
  while (pos < length) {
    final int folded = lowerCase(charUtils.codePointAt(buffer, pos));
    // Character.toChars writes the code point at pos and returns how many
    // chars it occupied, which is the distance to the next code point.
    // NOTE(review): presumably the folded form has the same char width as
    // the original code point - confirm.
    pos += Character.toChars(folded, buffer, pos);
  }
  return true;
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char[] chArray = termAtt.termBuffer();
int chLen = termAtt.termLength();
// TODO: iterate codepoints to support supp. characters
for (int i = 0; i < chLen; i++)
{
chArray[i] = (char) lowerCase(chArray[i]);
}
return true;
} else {
return false;
}
}
private int lowerCase(int codepoint) {
switch(codepoint) {
/* There are two lowercase forms of sigma:
* U+03C2: small final sigma (end of word)
* U+03C3: small sigma (otherwise)
*
* Standardize both to U+03C3
*/
case '\u03C2': /* small final sigma */
return '\u03C3'; /* small sigma */
/* Some greek characters contain diacritics.
* This filter removes these, converting to the lowercase base form.
*/
case '\u0386': /* capital alpha with tonos */
case '\u03AC': /* small alpha with tonos */
return '\u03B1'; /* small alpha */
case '\u0388': /* capital epsilon with tonos */
case '\u03AD': /* small epsilon with tonos */
return '\u03B5'; /* small epsilon */
case '\u0389': /* capital eta with tonos */
case '\u03AE': /* small eta with tonos */
return '\u03B7'; /* small eta */
case '\u038A': /* capital iota with tonos */
case '\u03AA': /* capital iota with dialytika */
case '\u03AF': /* small iota with tonos */
case '\u03CA': /* small iota with dialytika */
case '\u0390': /* small iota with dialytika and tonos */
return '\u03B9'; /* small iota */
case '\u038E': /* capital upsilon with tonos */
case '\u03AB': /* capital upsilon with dialytika */
case '\u03CD': /* small upsilon with tonos */
case '\u03CB': /* small upsilon with dialytika */
case '\u03B0': /* small upsilon with dialytika and tonos */
return '\u03C5'; /* small upsilon */
case '\u038C': /* capital omicron with tonos */
case '\u03CC': /* small omicron with tonos */
return '\u03BF'; /* small omicron */
case '\u038F': /* capital omega with tonos */
case '\u03CE': /* small omega with tonos */
return '\u03C9'; /* small omega */
/* The previous implementation did the conversion below.
* Only implemented for backwards compatibility with old indexes.
*/
case '\u03A2': /* reserved */
return '\u03C2'; /* small final sigma */
default:
return Character.toLowerCase(codepoint);
}
private int lowerCase(int codepoint) {
switch(codepoint) {
/* There are two lowercase forms of sigma:
* U+03C2: small final sigma (end of word)
* U+03C3: small sigma (otherwise)
*
* Standardize both to U+03C3
*/
case '\u03C2': /* small final sigma */
return '\u03C3'; /* small sigma */
/* Some greek characters contain diacritics.
* This filter removes these, converting to the lowercase base form.
*/
case '\u0386': /* capital alpha with tonos */
case '\u03AC': /* small alpha with tonos */
return '\u03B1'; /* small alpha */
case '\u0388': /* capital epsilon with tonos */
case '\u03AD': /* small epsilon with tonos */
return '\u03B5'; /* small epsilon */
case '\u0389': /* capital eta with tonos */
case '\u03AE': /* small eta with tonos */
return '\u03B7'; /* small eta */
case '\u038A': /* capital iota with tonos */
case '\u03AA': /* capital iota with dialytika */
case '\u03AF': /* small iota with tonos */
case '\u03CA': /* small iota with dialytika */
case '\u0390': /* small iota with dialytika and tonos */
return '\u03B9'; /* small iota */
case '\u038E': /* capital upsilon with tonos */
case '\u03AB': /* capital upsilon with dialytika */
case '\u03CD': /* small upsilon with tonos */
case '\u03CB': /* small upsilon with dialytika */
case '\u03B0': /* small upsilon with dialytika and tonos */
return '\u03C5'; /* small upsilon */
case '\u038C': /* capital omicron with tonos */
case '\u03CC': /* small omicron with tonos */
return '\u03BF'; /* small omicron */
case '\u038F': /* capital omega with tonos */
case '\u03CE': /* small omega with tonos */
return '\u03C9'; /* small omega */
/* The previous implementation did the conversion below.
* Only implemented for backwards compatibility with old indexes.
*/
case '\u03A2': /* reserved */
return '\u03C2'; /* small final sigma */
default:
return Character.toLowerCase(codepoint);
}
}
}

View File

@ -0,0 +1,63 @@
package org.apache.lucene.analysis.el;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.KeywordMarkerFilter; // for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * A {@link TokenFilter} that stems Greek tokens with {@link GreekStemmer}.
 * <p>
 * Tokens whose {@link KeywordAttribute} is set are passed through unchanged.
 * To protect terms from stemming, place a {@link KeywordMarkerFilter} (or a
 * custom {@link TokenFilter} that sets the {@link KeywordAttribute}) before
 * this {@link TokenStream}.
 * </p>
 * <p>
 * NOTE: Input is expected to be casefolded for Greek (including folding of final
 * sigma to sigma), and with diacritics removed. This can be achieved by using
 * either {@link GreekLowerCaseFilter} or ICUFoldingFilter before GreekStemFilter.
 * </p>
 * @lucene.experimental
 */
public final class GreekStemFilter extends TokenFilter {
  private final GreekStemmer stemmer = new GreekStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a stem filter over the given stream.
   *
   * @param input the stream whose tokens are stemmed
   */
  public GreekStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and, unless the token is marked as a keyword,
   * stems the term buffer in place and truncates it to the stemmed length.
   *
   * @return false when the wrapped stream is exhausted, true otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword tokens are never stemmed
      return true;
    }
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    return true;
  }
}

View File

@ -0,0 +1,819 @@
package org.apache.lucene.analysis.el;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
import java.util.Arrays;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A stemmer for Greek words, according to: <i>Development of a Stemmer for the
* Greek Language.</i> Georgios Ntais
* <p>
* NOTE: Input is expected to be casefolded for Greek (including folding of final
* sigma to sigma), and with diacritics removed. This can be achieved with
* either {@link GreekLowerCaseFilter} or ICUFoldingFilter.
* </p>
* @lucene.experimental
*/
public class GreekStemmer {
/**
 * Stems the word held in the first {@code len} chars of {@code s}.
 * The buffer may be modified in place by individual rules.
 *
 * @param s   buffer holding the word
 * @param len number of valid chars in {@code s}
 * @return the length of the stemmed word; inputs shorter than 4 chars are
 *         returned unchanged
 */
public int stem(char s[], int len) {
  if (len < 4) {
    // too short
    return len;
  }

  // "short rules": if any of these changes the length, the "long list" is skipped
  int cur = len;
  cur = rule0(s, cur);
  cur = rule1(s, cur);
  cur = rule2(s, cur);
  cur = rule3(s, cur);
  cur = rule4(s, cur);
  cur = rule5(s, cur);
  cur = rule6(s, cur);
  cur = rule7(s, cur);
  cur = rule8(s, cur);
  cur = rule9(s, cur);
  cur = rule10(s, cur);
  cur = rule11(s, cur);
  cur = rule12(s, cur);
  cur = rule13(s, cur);
  cur = rule14(s, cur);
  cur = rule15(s, cur);
  cur = rule16(s, cur);
  cur = rule17(s, cur);
  cur = rule18(s, cur);
  cur = rule19(s, cur);
  cur = rule20(s, cur);

  // "long list": only consulted when the length is still the original one
  if (cur == len) {
    cur = rule21(s, cur);
  }

  return rule22(s, cur);
}
/**
 * Handles a fixed list of word endings, each with its own cut length.
 * The first matching group wins and its result is returned immediately.
 * Groups are tried from the longest suffix to the shortest.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when no suffix matched
 */
private int rule0(char s[], int len) {
  if (len > 9 && (endsWith(s, len, "καθεστωτοσ")
      || endsWith(s, len, "καθεστωτων"))) {
    return len - 4;
  }

  if (len > 8 && (endsWith(s, len, "γεγονοτοσ")
      || endsWith(s, len, "γεγονοτων"))) {
    return len - 4;
  }

  if (len > 8 && endsWith(s, len, "καθεστωτα")) {
    return len - 3;
  }

  if (len > 7 && (endsWith(s, len, "τατογιου")
      || endsWith(s, len, "τατογιων"))) {
    return len - 4;
  }

  if (len > 7 && endsWith(s, len, "γεγονοτα")) {
    return len - 3;
  }

  if (len > 7 && endsWith(s, len, "καθεστωσ")) {
    return len - 2;
  }

  // The length guard now covers the whole group; previously a misplaced
  // parenthesis bound it to the first alternative only.
  if (len > 6 && (endsWith(s, len, "σκαγιου")
      || endsWith(s, len, "σκαγιων")
      || endsWith(s, len, "ολογιου")
      || endsWith(s, len, "ολογιων")
      || endsWith(s, len, "κρεατοσ")
      || endsWith(s, len, "κρεατων")
      || endsWith(s, len, "περατοσ")
      || endsWith(s, len, "περατων")
      || endsWith(s, len, "τερατοσ")
      || endsWith(s, len, "τερατων"))) {
    return len - 4;
  }

  if (len > 6 && endsWith(s, len, "τατογια")) {
    return len - 3;
  }

  if (len > 6 && endsWith(s, len, "γεγονοσ")) {
    return len - 2;
  }

  if (len > 5 && (endsWith(s, len, "φαγιου")
      || endsWith(s, len, "φαγιων")
      || endsWith(s, len, "σογιου")
      || endsWith(s, len, "σογιων"))) {
    return len - 4;
  }

  if (len > 5 && (endsWith(s, len, "σκαγια")
      || endsWith(s, len, "ολογια")
      || endsWith(s, len, "κρεατα")
      || endsWith(s, len, "περατα")
      || endsWith(s, len, "τερατα"))) {
    return len - 3;
  }

  if (len > 4 && (endsWith(s, len, "φαγια")
      || endsWith(s, len, "σογια")
      || endsWith(s, len, "φωτοσ")
      || endsWith(s, len, "φωτων"))) {
    return len - 3;
  }

  if (len > 4 && (endsWith(s, len, "κρεασ")
      || endsWith(s, len, "περασ")
      || endsWith(s, len, "τερασ"))) {
    return len - 2;
  }

  if (len > 3 && endsWith(s, len, "φωτα")) {
    return len - 2;
  }

  if (len > 2 && endsWith(s, len, "φωσ")) {
    return len - 1;
  }

  return len;
}
/**
 * Strips a trailing -αδεσ / -αδων. The bare stem is kept only when it ends
 * in one of the listed stems; otherwise -αδ is restored.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when the suffix is absent
 */
private int rule1(char s[], int len) {
  final boolean hasSuffix = len > 4
      && (endsWith(s, len, "αδεσ") || endsWith(s, len, "αδων"));
  if (!hasSuffix) {
    return len;
  }

  final int stemLen = len - 4;
  final boolean listedStem = endsWith(s, stemLen, "οκ")
      || endsWith(s, stemLen, "μαμ")
      || endsWith(s, stemLen, "μαν")
      || endsWith(s, stemLen, "μπαμπ")
      || endsWith(s, stemLen, "πατερ")
      || endsWith(s, stemLen, "γιαγι")
      || endsWith(s, stemLen, "νταντ")
      || endsWith(s, stemLen, "κυρ")
      || endsWith(s, stemLen, "θει")
      || endsWith(s, stemLen, "πεθερ");

  // not a listed stem: add back -αδ
  return listedStem ? stemLen : stemLen + 2;
}
/**
 * Strips a trailing -εδεσ / -εδων. When the remaining stem ends in one of
 * the listed stems, -εδ is restored.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when the suffix is absent
 */
private int rule2(char s[], int len) {
  final boolean hasSuffix = len > 4
      && (endsWith(s, len, "εδεσ") || endsWith(s, len, "εδων"));
  if (!hasSuffix) {
    return len;
  }

  final int stemLen = len - 4;
  final boolean listedStem = endsWith(s, stemLen, "οπ")
      || endsWith(s, stemLen, "ιπ")
      || endsWith(s, stemLen, "εμπ")
      || endsWith(s, stemLen, "υπ")
      || endsWith(s, stemLen, "γηπ")
      || endsWith(s, stemLen, "δαπ")
      || endsWith(s, stemLen, "κρασπ")
      || endsWith(s, stemLen, "μιλ");

  // listed stem: add back -εδ
  return listedStem ? stemLen + 2 : stemLen;
}
/**
 * Strips a trailing -ουδεσ / -ουδων. When the remaining stem ends in one of
 * the listed stems, -ουδ is restored.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when the suffix is absent
 */
private int rule3(char s[], int len) {
  final boolean hasSuffix = len > 5
      && (endsWith(s, len, "ουδεσ") || endsWith(s, len, "ουδων"));
  if (!hasSuffix) {
    return len;
  }

  final int stemLen = len - 5;
  final boolean listedStem = endsWith(s, stemLen, "αρκ")
      || endsWith(s, stemLen, "καλιακ")
      || endsWith(s, stemLen, "πεταλ")
      || endsWith(s, stemLen, "λιχ")
      || endsWith(s, stemLen, "πλεξ")
      || endsWith(s, stemLen, "σκ")
      || endsWith(s, stemLen, "σ")
      || endsWith(s, stemLen, "φλ")
      || endsWith(s, stemLen, "φρ")
      || endsWith(s, stemLen, "βελ")
      || endsWith(s, stemLen, "λουλ")
      || endsWith(s, stemLen, "χν")
      || endsWith(s, stemLen, "σπ")
      || endsWith(s, stemLen, "τραγ")
      || endsWith(s, stemLen, "φε");

  // listed stem: add back -ουδ
  return listedStem ? stemLen + 3 : stemLen;
}
private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
false);
/**
 * Strips a trailing -εωσ / -εων. When the whole remaining stem is in
 * {@code exc4}, the -ε is restored.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when the suffix is absent
 */
private int rule4(char s[], int len) {
  final boolean hasSuffix = len > 3
      && (endsWith(s, len, "εωσ") || endsWith(s, len, "εων"));
  if (!hasSuffix) {
    return len;
  }

  final int stemLen = len - 3;
  // exception stem: add back -ε
  return exc4.contains(s, 0, stemLen) ? stemLen + 1 : stemLen;
}
/**
 * Strips a trailing -ια, or else -ιου / -ιων. When the remaining stem ends
 * in a vowel (per {@code endsWithVowel}), the -ι is restored.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when no suffix matched
 */
private int rule5(char s[], int len) {
  final int cut;
  if (len > 2 && endsWith(s, len, "ια")) {
    cut = 2;
  } else if (len > 3 && (endsWith(s, len, "ιου") || endsWith(s, len, "ιων"))) {
    cut = 3;
  } else {
    return len;
  }

  final int stemLen = len - cut;
  // stem ends in a vowel: add back -ι
  return endsWithVowel(s, stemLen) ? stemLen + 1 : stemLen;
}
private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
"αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
"μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
"πετσ", "πιτσ", "πικαντ", "πλιατσ", "ποστελν", "πρωτοδ", "σερτ",
"συναδ", "τσαμ", "υποδ", "φιλον", "φυλοδ", "χασ"),
false);
/**
 * Strips a trailing -ικα / -ικο, or else -ικου / -ικων. When the remaining
 * stem ends in a vowel or is listed in {@code exc6}, -ικ is restored.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when no suffix matched
 */
private int rule6(char s[], int len) {
  final int cut;
  if (len > 3 && (endsWith(s, len, "ικα") || endsWith(s, len, "ικο"))) {
    cut = 3;
  } else if (len > 4 && (endsWith(s, len, "ικου") || endsWith(s, len, "ικων"))) {
    cut = 4;
  } else {
    return len;
  }

  final int stemLen = len - cut;
  final boolean restore = endsWithVowel(s, stemLen) || exc6.contains(s, 0, stemLen);
  // add back -ικ
  return restore ? stemLen + 2 : stemLen;
}
private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
"πεθ", "πικρ", "ποτ", "σιχ", "χ"),
false);
/**
 * Handles endings in -αμε. A 5-char word ending in "αγαμε" only loses its
 * last char. Otherwise the longer endings -ηθηκαμε, -ουσαμε, -αγαμε,
 * -ησαμε and -ηκαμε are stripped first. Then a remaining -αμε is stripped,
 * and -αμ is restored when the stem is listed in {@code exc7}.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length
 */
private int rule7(char s[], int len) {
  if (len == 5 && endsWith(s, len, "αγαμε")) {
    return len - 1;
  }

  int n = len;
  if (n > 7 && endsWith(s, n, "ηθηκαμε")) {
    n -= 7;
  } else if (n > 6 && endsWith(s, n, "ουσαμε")) {
    n -= 6;
  } else if (n > 5 && (endsWith(s, n, "αγαμε")
      || endsWith(s, n, "ησαμε")
      || endsWith(s, n, "ηκαμε"))) {
    n -= 5;
  }

  final boolean endsInAme = n > 3 && endsWith(s, n, "αμε");
  if (!endsInAme) {
    return n;
  }

  n -= 3;
  // exception stem: add back -αμ
  return exc7.contains(s, 0, n) ? n + 2 : n;
}
private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31,
Arrays.asList("τρ", "τσ"),
false);
private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31,
Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
"καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
"π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
"τσαρλατ", "ορφ", "τσιγγ", "τσοπ", "φωτοστεφ", "χ", "ψυχοπλ", "αγ",
"ορφ", "γαλ", "γερ", "δεκ", "διπλ", "αμερικαν", "ουρ", "πιθ",
"πουριτ", "σ", "ζωντ", "ικ", "καστ", "κοπ", "λιχ", "λουθηρ", "μαιντ",
"μελ", "σιγ", "σπ", "στεγ", "τραγ", "τσαγ", "φ", "ερ", "αδαπ",
"αθιγγ", "αμηχ", "ανικ", "ανοργ", "απηγ", "απιθ", "ατσιγγ", "βασ",
"βασκ", "βαθυγαλ", "βιομηχ", "βραχυκ", "διατ", "διαφ", "ενοργ",
"θυσ", "καπνοβιομηχ", "καταγαλ", "κλιβ", "κοιλαρφ", "λιβ",
"μεγλοβιομηχ", "μικροβιομηχ", "νταβ", "ξηροκλιβ", "ολιγοδαμ",
"ολογαλ", "πενταρφ", "περηφ", "περιτρ", "πλατ", "πολυδαπ", "πολυμηχ",
"στεφ", "ταβ", "τετ", "υπερηφ", "υποκοπ", "χαμηλοδαπ", "ψηλοταβ"),
false);
/**
 * Handles endings in -ανε.
 * <p>
 * First one of the long endings (-ιουντανε, -ιοντανε, -ουντανε, -ηθηκανε,
 * -ιοτανε, -οντανε, -ουσανε, -αγανε, -ησανε, -οτανε, -ηκανε) is stripped.
 * When the remaining stem is listed in {@code exc8a}, "αγαν" is written back
 * after it. Then a remaining -ανε is stripped, and -αν is restored when the
 * stem ends in a vowel (per {@code endsWithVowelNoY}) or is listed in
 * {@code exc8b}.
 * </p>
 * <p>
 * Each length guard is parenthesized so it covers every suffix in its group.
 * Without the parentheses {@code &&} binds tighter than {@code ||}, so only
 * the first suffix of each group was guarded. A word consisting solely of
 * one of the later suffixes could then be cut to length 0, assuming
 * {@code endsWith} accepts a suffix as long as the word itself.
 * </p>
 *
 * @param s   buffer holding the word; may be modified in place
 * @param len current word length
 * @return the new length
 */
private int rule8(char s[], int len) {
  boolean removed = false;

  if (len > 8 && endsWith(s, len, "ιουντανε")) {
    len -= 8;
    removed = true;
  } else if (len > 7 && (endsWith(s, len, "ιοντανε") ||
      endsWith(s, len, "ουντανε") ||
      endsWith(s, len, "ηθηκανε"))) {
    len -= 7;
    removed = true;
  } else if (len > 6 && (endsWith(s, len, "ιοτανε") ||
      endsWith(s, len, "οντανε") ||
      endsWith(s, len, "ουσανε"))) {
    len -= 6;
    removed = true;
  } else if (len > 5 && (endsWith(s, len, "αγανε") ||
      endsWith(s, len, "ησανε") ||
      endsWith(s, len, "οτανε") ||
      endsWith(s, len, "ηκανε"))) {
    len -= 5;
    removed = true;
  }

  if (removed && exc8a.contains(s, 0, len)) {
    // add -αγαν (we removed > 4 chars so its safe)
    len += 4;
    s[len - 4] = 'α';
    s[len - 3] = 'γ';
    s[len - 2] = 'α';
    s[len - 1] = 'ν';
  }

  if (len > 3 && endsWith(s, len, "ανε")) {
    len -= 3;
    if (endsWithVowelNoY(s, len) || exc8b.contains(s, 0, len)) {
      len += 2; // add back -αν
    }
  }

  return len;
}
private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
"βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
"σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),
false);
/**
 * Handles endings in -ετε. A trailing -ησετε is stripped first. Then a
 * remaining -ετε is stripped, and -ετ is restored when the stem is listed in
 * {@code exc9}, ends in a vowel (per {@code endsWithVowelNoY}), or ends in
 * one of the listed stems.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length
 */
private int rule9(char s[], int len) {
  int n = len;
  if (n > 5 && endsWith(s, n, "ησετε")) {
    n -= 5;
  }

  final boolean endsInEte = n > 3 && endsWith(s, n, "ετε");
  if (!endsInEte) {
    return n;
  }

  n -= 3;
  final boolean restore = exc9.contains(s, 0, n)
      || endsWithVowelNoY(s, n)
      || endsWith(s, n, "οδ")
      || endsWith(s, n, "αιρ")
      || endsWith(s, n, "φορ")
      || endsWith(s, n, "ταθ")
      || endsWith(s, n, "διαθ")
      || endsWith(s, n, "σχ")
      || endsWith(s, n, "ενδ")
      || endsWith(s, n, "ευρ")
      || endsWith(s, n, "τιθ")
      || endsWith(s, n, "υπερθ")
      || endsWith(s, n, "ραθ")
      || endsWith(s, n, "ενθ")
      || endsWith(s, n, "ροθ")
      || endsWith(s, n, "σθ")
      || endsWith(s, n, "πυρ")
      || endsWith(s, n, "αιν")
      || endsWith(s, n, "συνδ")
      || endsWith(s, n, "συν")
      || endsWith(s, n, "συνθ")
      || endsWith(s, n, "χωρ")
      || endsWith(s, n, "πον")
      || endsWith(s, n, "βρ")
      || endsWith(s, n, "καθ")
      || endsWith(s, n, "ευθ")
      || endsWith(s, n, "εκθ")
      || endsWith(s, n, "νετ")
      || endsWith(s, n, "ρον")
      || endsWith(s, n, "αρκ")
      || endsWith(s, n, "βαρ")
      || endsWith(s, n, "βολ")
      || endsWith(s, n, "ωφελ");

  // add back -ετ
  return restore ? n + 2 : n;
}
/**
 * Strips a trailing -οντασ / -ωντασ. Three chars are restored, with the
 * first overwritten, in two cases: the stem is exactly 3 chars ending in
 * "αρχ" (overwritten with 'ο'), or the stem ends in "κρε" (overwritten
 * with 'ω').
 *
 * @param s   buffer holding the word; may be modified in place
 * @param len current word length
 * @return the new length, or {@code len} when the suffix is absent
 */
private int rule10(char s[], int len) {
  final boolean hasSuffix = len > 5
      && (endsWith(s, len, "οντασ") || endsWith(s, len, "ωντασ"));
  if (!hasSuffix) {
    return len;
  }

  int n = len - 5;

  if (n == 3 && endsWith(s, n, "αρχ")) {
    n += 3; // add back *ντ
    s[n - 3] = 'ο';
  }

  if (endsWith(s, n, "κρε")) {
    n += 3; // add back *ντ
    s[n - 3] = 'ω';
  }

  return n;
}
/**
 * Strips a trailing -ιομαστε or -ομαστε. When the remaining stem is exactly
 * "ον", the ending -ομαστ is put back.
 * <p>
 * The longer suffix is tested first. Every word ending in -ιομαστε also
 * ends in -ομαστε, so testing the shorter suffix first made the -ιομαστε
 * branch unreachable and left a stray 'ι' on the stem.
 * </p>
 *
 * @param s   buffer holding the word; may be modified in place
 * @param len current word length
 * @return the new length, or {@code len} when neither suffix matched
 */
private int rule11(char s[], int len) {
  if (len > 7 && endsWith(s, len, "ιομαστε")) {
    len -= 7;
    if (len == 2 && endsWith(s, len, "ον")) {
      // write -ομαστ after the stem; the buffer holds "ιομασ" there,
      // so the chars must be overwritten rather than just re-exposed
      len += 5;
      s[len - 5] = 'ο';
      s[len - 4] = 'μ';
      s[len - 3] = 'α';
      s[len - 2] = 'σ';
      s[len - 1] = 'τ';
    }
  } else if (len > 6 && endsWith(s, len, "ομαστε")) {
    len -= 6;
    if (len == 2 && endsWith(s, len, "ον")) {
      len += 5; // add back -ομαστ
    }
  }
  return len;
}
private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31,
Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
false);
private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31,
Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
false);
/**
 * Handles endings in -εστε, in two passes. A trailing -ιεστε is stripped,
 * and -ιεστ is restored when the stem is listed in {@code exc12a}. Then a
 * trailing -εστε is stripped, and -εστ is restored when the stem is listed
 * in {@code exc12b}.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length
 */
private int rule12(char s[], int len) {
  int n = len;

  if (n > 5 && endsWith(s, n, "ιεστε")) {
    n -= 5;
    if (exc12a.contains(s, 0, n)) {
      n += 4; // add back -ιεστ
    }
  }

  if (n > 4 && endsWith(s, n, "εστε")) {
    n -= 4;
    if (exc12b.contains(s, 0, n)) {
      n += 3; // add back -εστ
    }
  }

  return n;
}
private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
false);
/**
 * Handles endings in -ηκ-. A trailing -ηθηκεσ, or else -ηθηκα / -ηθηκε, is
 * stripped first. Then a trailing -ηκεσ, or else -ηκα / -ηκε, is stripped.
 * After that second cut, -ηκ is restored when the stem is listed in
 * {@code exc13} or ends in one of the listed stems.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length
 */
private int rule13(char s[], int len) {
  int n = len;
  if (n > 6 && endsWith(s, n, "ηθηκεσ")) {
    n -= 6;
  } else if (n > 5 && (endsWith(s, n, "ηθηκα") || endsWith(s, n, "ηθηκε"))) {
    n -= 5;
  }

  final int cut;
  if (n > 4 && endsWith(s, n, "ηκεσ")) {
    cut = 4;
  } else if (n > 3 && (endsWith(s, n, "ηκα") || endsWith(s, n, "ηκε"))) {
    cut = 3;
  } else {
    return n;
  }
  n -= cut;

  final boolean restore = exc13.contains(s, 0, n)
      || endsWith(s, n, "σκωλ")
      || endsWith(s, n, "σκουλ")
      || endsWith(s, n, "ναρθ")
      || endsWith(s, n, "σφ")
      || endsWith(s, n, "οθ")
      || endsWith(s, n, "πιθ");

  // add back the -ηκ
  return restore ? n + 2 : n;
}
private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
"λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
"ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
"τσα"),
false);
/**
 * Strips a trailing -ουσεσ, or else -ουσα / -ουσε. -ουσ is restored when
 * the stem is listed in {@code exc14}, ends in a vowel (per
 * {@code endsWithVowel}), or ends in one of the listed stems.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when no suffix matched
 */
private int rule14(char s[], int len) {
  final int cut;
  if (len > 5 && endsWith(s, len, "ουσεσ")) {
    cut = 5;
  } else if (len > 4 && (endsWith(s, len, "ουσα") || endsWith(s, len, "ουσε"))) {
    cut = 4;
  } else {
    return len;
  }

  final int stemLen = len - cut;
  final boolean restore = exc14.contains(s, 0, stemLen)
      || endsWithVowel(s, stemLen)
      || endsWith(s, stemLen, "ποδαρ")
      || endsWith(s, stemLen, "βλεπ")
      || endsWith(s, stemLen, "πανταχ")
      || endsWith(s, stemLen, "φρυδ")
      || endsWith(s, stemLen, "μαντιλ")
      || endsWith(s, stemLen, "μαλλ")
      || endsWith(s, stemLen, "κυματ")
      || endsWith(s, stemLen, "λαχ")
      || endsWith(s, stemLen, "ληγ")
      || endsWith(s, stemLen, "φαγ")
      || endsWith(s, stemLen, "ομ")
      || endsWith(s, stemLen, "πρωτ");

  // add back -ουσ
  return restore ? stemLen + 3 : stemLen;
}
private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31,
Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
"αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
"ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
"συντ", "τ", "υποτ", "χαρ", "αειπ", "αιμοστ", "ανυπ", "αποτ",
"αρτιπ", "διατ", "εν", "επιτ", "κροκαλοπ", "σιδηροπ", "λ", "ναυ",
"ουλαμ", "ουρ", "π", "τρ", "μ"),
false);
private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31,
Arrays.asList("ψοφ", "ναυλοχ"),
false);
/**
 * Strips a trailing -αγεσ, or else -αγα / -αγε. -αγ is restored when the
 * stem matches the first condition (listed in {@code exc15a} or ending in
 * one of the listed stems) and does not match the second (listed in
 * {@code exc15b} or ending in "κολλ").
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when no suffix matched
 */
private int rule15(char s[], int len) {
  final int cut;
  if (len > 4 && endsWith(s, len, "αγεσ")) {
    cut = 4;
  } else if (len > 3 && (endsWith(s, len, "αγα") || endsWith(s, len, "αγε"))) {
    cut = 3;
  } else {
    return len;
  }

  final int stemLen = len - cut;

  final boolean wantsRestore = exc15a.contains(s, 0, stemLen)
      || endsWith(s, stemLen, "οφ")
      || endsWith(s, stemLen, "πελ")
      || endsWith(s, stemLen, "χορτ")
      || endsWith(s, stemLen, "λλ")
      || endsWith(s, stemLen, "σφ")
      || endsWith(s, stemLen, "ρπ")
      || endsWith(s, stemLen, "φρ")
      || endsWith(s, stemLen, "πρ")
      || endsWith(s, stemLen, "λοχ")
      || endsWith(s, stemLen, "σμην");

  final boolean vetoed = exc15b.contains(s, 0, stemLen)
      || endsWith(s, stemLen, "κολλ");

  if (wantsRestore && !vetoed) {
    return stemLen + 2; // add back -αγ
  }
  return stemLen;
}
private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
false);
/**
 * Strips a trailing -ησου, or else -ησε / -ησα. -ησ is restored when the
 * stem is listed in {@code exc16}.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when no suffix matched
 */
private int rule16(char s[], int len) {
  final int cut;
  if (len > 4 && endsWith(s, len, "ησου")) {
    cut = 4;
  } else if (len > 3 && (endsWith(s, len, "ησε") || endsWith(s, len, "ησα"))) {
    cut = 3;
  } else {
    return len;
  }

  final int stemLen = len - cut;
  // exception stem: add back -ησ
  return exc16.contains(s, 0, stemLen) ? stemLen + 2 : stemLen;
}
private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
false);
/**
 * Strips a trailing -ηστε. -ηστ is restored when the stem is listed in
 * {@code exc17}.
 *
 * @param s   buffer holding the word
 * @param len current word length
 * @return the new length, or {@code len} when the suffix is absent
 */
private int rule17(char s[], int len) {
  final boolean hasSuffix = len > 4 && endsWith(s, len, "ηστε");
  if (!hasSuffix) {
    return len;
  }

  final int stemLen = len - 4;
  // exception stem: add back the -ηστ
  return exc17.contains(s, 0, stemLen) ? stemLen + 3 : stemLen;
}
private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31,
Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
false);
/**
 * Strips a trailing -ησουνε / -ηθουνε, or else -ουνε. When the stem is
 * listed in {@code exc18}, "ουν" is written after it.
 *
 * @param s   buffer holding the word; may be modified in place
 * @param len current word length
 * @return the new length, or {@code len} when no suffix matched
 */
private int rule18(char s[], int len) {
  final int cut;
  if (len > 6 && (endsWith(s, len, "ησουνε") || endsWith(s, len, "ηθουνε"))) {
    cut = 6;
  } else if (len > 4 && endsWith(s, len, "ουνε")) {
    cut = 4;
  } else {
    return len;
  }

  final int stemLen = len - cut;
  if (!exc18.contains(s, 0, stemLen)) {
    return stemLen;
  }

  // exception stem: write "ουν" right after it
  s[stemLen] = 'ο';
  s[stemLen + 1] = 'υ';
  s[stemLen + 2] = 'ν';
  return stemLen + 3;
}
/** Stems that get -ουμ appended after {@link #rule19} has stripped an ending. */
private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31,
    Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
    false);

/**
 * Strips the endings -ησουμε, -ηθουμε and -ουμε, always leaving a non-empty
 * stem. If the remaining stem is listed in {@link #exc19}, -ουμ is written
 * into the buffer directly after it.
 *
 * @param s   buffer holding the word; may be modified in place
 * @param len number of valid chars in {@code s}
 * @return the new length of the word
 */
private int rule19(char s[], int len) {
  final int suffixLen;
  if (len > 6 && (endsWith(s, len, "ησουμε") || endsWith(s, len, "ηθουμε"))) {
    suffixLen = 6;
  } else if (len > 4 && endsWith(s, len, "ουμε")) {
    suffixLen = 4;
  } else {
    return len; // none of the endings applies
  }

  final int stemLen = len - suffixLen;
  if (!exc19.contains(s, 0, stemLen)) {
    return stemLen;
  }
  // Overwrite the first three chars of the removed ending with -ουμ;
  // the removed ending is at least four chars long, so this always fits.
  s[stemLen] = 'ο';
  s[stemLen + 1] = 'υ';
  s[stemLen + 2] = 'μ';
  return stemLen + 3;
}
/**
 * Reduces the endings -ματων, -ματοσ and -ματα to -μα.
 *
 * @param s   buffer holding the word
 * @param len number of valid chars in {@code s}
 * @return the new length of the word
 */
private int rule20(char s[], int len) {
  final boolean fiveCharEnding =
      len > 5 && (endsWith(s, len, "ματων") || endsWith(s, len, "ματοσ"));
  if (fiveCharEnding) {
    return len - 3; // drop -των / -τοσ
  }
  if (len > 4 && endsWith(s, len, "ματα")) {
    return len - 2; // drop -τα
  }
  return len;
}
/**
 * Endings removed by {@link #rule21}. Each row holds endings of one common
 * length, and rows are ordered from the longest endings (9 chars) down to
 * the shortest (2 chars) so that the longest match wins.
 */
private static final String[][] RULE21_SUFFIXES = {
  { "ιοντουσαν" },
  { "ιομασταν", "ιοσασταν", "ιουμαστε", "οντουσαν" },
  { "ιεμαστε", "ιεσαστε", "ιομουνα", "ιοσαστε", "ιοσουνα", "ιουνται",
    "ιουνταν", "ηθηκατε", "ομασταν", "οσασταν", "ουμαστε" },
  { "ιομουν", "ιονταν", "ιοσουν", "ηθειτε", "ηθηκαν", "ομουνα",
    "οσαστε", "οσουνα", "ουνται", "ουνταν", "ουσατε" },
  { "αγατε", "ιεμαι", "ιεται", "ιεσαι", "ιοταν", "ιουμα", "ηθεισ", "ηθουν",
    "ηκατε", "ησατε", "ησουν", "ομουν", "ονται", "ονταν", "οσουν", "ουμαι",
    "ουσαν" },
  { "αγαν", "αμαι", "ασαι", "αται", "ειτε", "εσαι", "εται", "ηδεσ",
    "ηδων", "ηθει", "ηκαν", "ησαν", "ησει", "ησεσ", "ομαι", "οταν" },
  { "αει", "εισ", "ηθω", "ησω", "ουν", "ουσ" },
  { "αν", "ασ", "αω", "ει", "εσ", "ησ", "οι", "οσ", "ου", "υσ", "ων" }
};

/**
 * Removes the longest ending from {@link #RULE21_SUFFIXES} that the word ends
 * with, provided a non-empty stem remains. If no listed ending applies and
 * the word (longer than one char) ends in a vowel, that vowel is removed.
 *
 * @param s   buffer holding the word
 * @param len number of valid chars in {@code s}
 * @return the new length of the word
 */
private int rule21(char s[], int len) {
  for (String[] sameLengthEndings : RULE21_SUFFIXES) {
    // every ending in a row has the same length
    final int endingLen = sameLengthEndings[0].length();
    if (len <= endingLen) {
      continue; // stripping would leave nothing; try shorter endings
    }
    for (String ending : sameLengthEndings) {
      if (endsWith(s, len, ending)) {
        return len - endingLen;
      }
    }
  }

  if (len > 1 && endsWithVowel(s, len)) {
    return len - 1;
  }
  return len;
}
/**
 * Strips the comparative/superlative endings -εστερ, -εστατ, -οτερ, -οτατ,
 * -υτερ, -υτατ, -ωτερ and -ωτατ.
 * <p>
 * An ending is only removed when at least one char of stem remains. Without
 * that guard a word consisting solely of one of these endings would be
 * reduced to length 0, i.e. an empty term. Every other rule in this class
 * applies the same "stem must be non-empty" guard.
 *
 * @param s   buffer holding the word
 * @param len number of valid chars in {@code s}
 * @return the new length of the word
 */
private int rule22(char s[], int len) {
  if (len > 5 && (endsWith(s, len, "εστερ") ||
      endsWith(s, len, "εστατ"))) {
    return len - 5;
  }

  if (len > 4 && (endsWith(s, len, "οτερ") ||
      endsWith(s, len, "οτατ") ||
      endsWith(s, len, "υτερ") ||
      endsWith(s, len, "υτατ") ||
      endsWith(s, len, "ωτερ") ||
      endsWith(s, len, "ωτατ"))) {
    return len - 4;
  }

  return len;
}
/**
 * Tells whether the first {@code len} chars of {@code s} end with
 * {@code suffix}.
 *
 * @param s      buffer holding the word
 * @param len    number of valid chars in {@code s}
 * @param suffix ending to look for
 * @return {@code true} if the word ends with {@code suffix}; {@code false}
 *         if it does not or if the suffix is longer than the word
 */
private boolean endsWith(char s[], int len, String suffix) {
  final int suffixLen = suffix.length();
  if (suffixLen > len) {
    return false;
  }
  // compare from the last char of the word backwards
  int wordPos = len - 1;
  int suffixPos = suffixLen - 1;
  while (suffixPos >= 0) {
    if (s[wordPos] != suffix.charAt(suffixPos)) {
      return false;
    }
    wordPos--;
    suffixPos--;
  }
  return true;
}
/**
 * Tells whether the last of the first {@code len} chars of {@code s} is one
 * of the vowels α, ε, η, ι, ο, υ, ω.
 *
 * @param s   buffer holding the word
 * @param len number of valid chars in {@code s}
 * @return {@code true} if the word ends in one of those vowels;
 *         {@code false} otherwise, including for an empty word
 */
private boolean endsWithVowel(char s[], int len) {
  if (len == 0) {
    return false;
  }
  final char last = s[len - 1];
  return "αεηιουω".indexOf(last) >= 0;
}
/**
 * Tells whether the last of the first {@code len} chars of {@code s} is one
 * of the vowels α, ε, η, ι, ο, ω. Unlike {@link #endsWithVowel}, υ does not
 * count.
 *
 * @param s   buffer holding the word
 * @param len number of valid chars in {@code s}
 * @return {@code true} if the word ends in one of those vowels;
 *         {@code false} otherwise, including for an empty word
 */
private boolean endsWithVowelNoY(char s[], int len) {
  if (len == 0) {
    return false;
  }
  final char last = s[len - 1];
  return "αεηιοω".indexOf(last) >= 0;
}
}

View File

@ -0,0 +1,76 @@
# Lucene Greek Stopwords list
ο
η
το
οι
τα
του
τησ
των
τον
την
και
κι
κ
ειμαι
εισαι
ειναι
ειμαστε
ειστε
στο
στον
στη
στην
μα
αλλα
απο
για
προσ
με
σε
ωσ
παρα
αντι
κατα
μετα
θα
να
δε
δεν
μη
μην
επι
ενω
εαν
αν
τοτε
που
πωσ
ποιοσ
ποια
ποιο
ποιοι
ποιεσ
ποιων
ποιουσ
αυτοσ
αυτη
αυτο
αυτοι
αυτων
αυτουσ
αυτεσ
αυτα
εκεινοσ
εκεινη
εκεινο
εκεινοι
εκεινεσ
εκεινα
εκεινων
εκεινουσ
οπωσ
ομωσ
ισωσ
οσο
οτι

View File

@ -26,42 +26,67 @@ import org.apache.lucene.util.Version;
*/
public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
/**
* Test the analysis of various greek strings.
*
* @throws Exception in case an error occurs
*/
public void testAnalyzer() throws Exception {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
// Verify the correct analysis of capitals and small accented letters, and
// stemming
assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
"ελληνικ", "γλωσσ" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
new String[] { "προιοντ", "πολλαπλ", "αναγκ" });
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
// as well as the elimination of stop words
assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
}
/**
* Test the analysis of various greek strings.
*
* @throws Exception in case an error occurs
* @deprecated Remove this test when support for 3.0 is no longer needed
*/
public void testAnalyzer() throws Exception {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
@Deprecated
public void testAnalyzerBWCompat() throws Exception {
Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
// Verify the correct analysis of capitals and small accented letters
assertAnalyzesTo(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
"\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
new String[] { "μια", "εξαιρετικα", "καλη", "πλουσια", "σειρα", "χαρακτηρων",
"ελληνικησ", "γλωσσασ" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
assertAnalyzesTo(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
// Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
new String[] { "προιοντα", "πολλαπλεσ", "αναγκεσ" });
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
// as well as the elimination of stop words
assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
new String[] { "προυποθεσεισ", "αψογοσ", "μεστοσ", "αλλοι" });
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
// Verify the correct analysis of capitals and small accented letters
assertAnalyzesToReuse(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
"\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
assertAnalyzesToReuse(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
// Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
// as well as the elimination of stop words
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
}
public void testReusableTokenStream() throws Exception {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
// Verify the correct analysis of capitals and small accented letters, and
// stemming
assertAnalyzesToReuse(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
"ελληνικ", "γλωσσ" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
assertAnalyzesToReuse(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
new String[] { "προιοντ", "πολλαπλ", "αναγκ" });
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
// as well as the elimination of stop words
assertAnalyzesToReuse(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
}
/**
* Greek Analyzer didn't call standardFilter, so no normalization of acronyms.

View File

@ -0,0 +1,508 @@
package org.apache.lucene.analysis.el;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestGreekStemmer extends BaseTokenStreamTestCase {
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
/**
 * Checks the stem expected for inflected forms of masculine nouns. Each
 * comment-delimited group covers one declension pattern; within a group the
 * forms are expected to share a stem unless a note says otherwise.
 */
public void testMasculineNouns() throws Exception {
// -ος
checkOneTerm(a, "άνθρωπος", "ανθρωπ");
checkOneTerm(a, "ανθρώπου", "ανθρωπ");
checkOneTerm(a, "άνθρωπο", "ανθρωπ");
checkOneTerm(a, "άνθρωπε", "ανθρωπ");
checkOneTerm(a, "άνθρωποι", "ανθρωπ");
checkOneTerm(a, "ανθρώπων", "ανθρωπ");
checkOneTerm(a, "ανθρώπους", "ανθρωπ");
// NOTE(review): repeats the άνθρωποι assertion above; presumably listed
// again as a second case form that is spelled identically
checkOneTerm(a, "άνθρωποι", "ανθρωπ");
// -ης
checkOneTerm(a, "πελάτης", "πελατ");
checkOneTerm(a, "πελάτη", "πελατ");
checkOneTerm(a, "πελάτες", "πελατ");
checkOneTerm(a, "πελατών", "πελατ");
// -ας/-ες
checkOneTerm(a, "ελέφαντας", "ελεφαντ");
checkOneTerm(a, "ελέφαντα", "ελεφαντ");
checkOneTerm(a, "ελέφαντες", "ελεφαντ");
checkOneTerm(a, "ελεφάντων", "ελεφαντ");
// -ας/-αδες
checkOneTerm(a, "μπαμπάς", "μπαμπ");
checkOneTerm(a, "μπαμπά", "μπαμπ");
checkOneTerm(a, "μπαμπάδες", "μπαμπ");
checkOneTerm(a, "μπαμπάδων", "μπαμπ");
// -ης/-ηδες
checkOneTerm(a, "μπακάλης", "μπακαλ");
checkOneTerm(a, "μπακάλη", "μπακαλ");
checkOneTerm(a, "μπακάληδες", "μπακαλ");
checkOneTerm(a, "μπακάληδων", "μπακαλ");
// -ες
checkOneTerm(a, "καφές", "καφ");
checkOneTerm(a, "καφέ", "καφ");
checkOneTerm(a, "καφέδες", "καφ");
checkOneTerm(a, "καφέδων", "καφ");
// -έας/είς
checkOneTerm(a, "γραμματέας", "γραμματε");
checkOneTerm(a, "γραμματέα", "γραμματε");
// plural forms conflate with each other, not with the singular forms
checkOneTerm(a, "γραμματείς", "γραμματ");
checkOneTerm(a, "γραμματέων", "γραμματ");
// -ους/οι
checkOneTerm(a, "απόπλους", "αποπλ");
checkOneTerm(a, "απόπλου", "αποπλ");
checkOneTerm(a, "απόπλοι", "αποπλ");
checkOneTerm(a, "απόπλων", "αποπλ");
// -ους/-ουδες
checkOneTerm(a, "παππούς", "παππ");
checkOneTerm(a, "παππού", "παππ");
checkOneTerm(a, "παππούδες", "παππ");
checkOneTerm(a, "παππούδων", "παππ");
// -ης/-εις
checkOneTerm(a, "λάτρης", "λατρ");
checkOneTerm(a, "λάτρη", "λατρ");
checkOneTerm(a, "λάτρεις", "λατρ");
checkOneTerm(a, "λάτρεων", "λατρ");
// -υς
checkOneTerm(a, "πέλεκυς", "πελεκ");
checkOneTerm(a, "πέλεκυ", "πελεκ");
checkOneTerm(a, "πελέκεις", "πελεκ");
checkOneTerm(a, "πελέκεων", "πελεκ");
// -ωρ
// note: nom./voc. doesn't conflate with the rest
checkOneTerm(a, "μέντωρ", "μεντωρ");
checkOneTerm(a, "μέντορος", "μεντορ");
checkOneTerm(a, "μέντορα", "μεντορ");
checkOneTerm(a, "μέντορες", "μεντορ");
checkOneTerm(a, "μεντόρων", "μεντορ");
// -ων
checkOneTerm(a, "αγώνας", "αγων");
checkOneTerm(a, "αγώνος", "αγων");
checkOneTerm(a, "αγώνα", "αγων");
// NOTE(review): repeats the previous assertion; presumably a second case
// form that is spelled identically
checkOneTerm(a, "αγώνα", "αγων");
checkOneTerm(a, "αγώνες", "αγων");
checkOneTerm(a, "αγώνων", "αγων");
// -ας/-ηδες
checkOneTerm(a, "αέρας", "αερ");
checkOneTerm(a, "αέρα", "αερ");
checkOneTerm(a, "αέρηδες", "αερ");
checkOneTerm(a, "αέρηδων", "αερ");
// -ης/-ητες
// note: the singular forms conflate neither with each other nor with the plural
checkOneTerm(a, "γόης", "γο");
checkOneTerm(a, "γόη", "γοη"); // too short
// the two plural forms conflate
checkOneTerm(a, "γόητες", "γοητ");
checkOneTerm(a, "γοήτων", "γοητ");
}
/**
 * Checks the stem expected for inflected forms of feminine nouns. Each
 * comment-delimited group covers one declension pattern; within a group the
 * forms are expected to share a stem unless a note says otherwise.
 */
public void testFeminineNouns() throws Exception {
// -α/-ες,-ών
checkOneTerm(a, "φορά", "φορ");
checkOneTerm(a, "φοράς", "φορ");
checkOneTerm(a, "φορές", "φορ");
checkOneTerm(a, "φορών", "φορ");
// -α/-ες,-ων
checkOneTerm(a, "αγελάδα", "αγελαδ");
checkOneTerm(a, "αγελάδας", "αγελαδ");
checkOneTerm(a, "αγελάδες", "αγελαδ");
checkOneTerm(a, "αγελάδων", "αγελαδ");
// -η/-ες
checkOneTerm(a, "ζάχαρη", "ζαχαρ");
checkOneTerm(a, "ζάχαρης", "ζαχαρ");
checkOneTerm(a, "ζάχαρες", "ζαχαρ");
checkOneTerm(a, "ζαχάρεων", "ζαχαρ");
// -η/-εις
checkOneTerm(a, "τηλεόραση", "τηλεορασ");
checkOneTerm(a, "τηλεόρασης", "τηλεορασ");
checkOneTerm(a, "τηλεοράσεις", "τηλεορασ");
checkOneTerm(a, "τηλεοράσεων", "τηλεορασ");
// -α/-αδες
checkOneTerm(a, "μαμά", "μαμ");
checkOneTerm(a, "μαμάς", "μαμ");
checkOneTerm(a, "μαμάδες", "μαμ");
checkOneTerm(a, "μαμάδων", "μαμ");
// -ος
checkOneTerm(a, "λεωφόρος", "λεωφορ");
checkOneTerm(a, "λεωφόρου", "λεωφορ");
checkOneTerm(a, "λεωφόρο", "λεωφορ");
checkOneTerm(a, "λεωφόρε", "λεωφορ");
checkOneTerm(a, "λεωφόροι", "λεωφορ");
checkOneTerm(a, "λεωφόρων", "λεωφορ");
checkOneTerm(a, "λεωφόρους", "λεωφορ");
// -ου
checkOneTerm(a, "αλεπού", "αλεπ");
checkOneTerm(a, "αλεπούς", "αλεπ");
checkOneTerm(a, "αλεπούδες", "αλεπ");
checkOneTerm(a, "αλεπούδων", "αλεπ");
// -έας/είς
// note: not all forms conflate (two different stems are expected below)
checkOneTerm(a, "γραμματέας", "γραμματε");
checkOneTerm(a, "γραμματέως", "γραμματ");
checkOneTerm(a, "γραμματέα", "γραμματε");
checkOneTerm(a, "γραμματείς", "γραμματ");
checkOneTerm(a, "γραμματέων", "γραμματ");
}
/**
 * Checks the stem expected for inflected forms of neuter nouns. Each
 * comment-delimited group covers one declension pattern; within a group the
 * forms are expected to share a stem unless a note says otherwise.
 */
public void testNeuterNouns() throws Exception {
// ending with -ο
// note: nom. doesn't conflate
checkOneTerm(a, "βιβλίο", "βιβλι");
checkOneTerm(a, "βιβλίου", "βιβλ");
checkOneTerm(a, "βιβλία", "βιβλ");
checkOneTerm(a, "βιβλίων", "βιβλ");
// ending with -ι
checkOneTerm(a, "πουλί", "πουλ");
checkOneTerm(a, "πουλιού", "πουλ");
checkOneTerm(a, "πουλιά", "πουλ");
checkOneTerm(a, "πουλιών", "πουλ");
// ending with -α
// note: nom. doesn't conflate
checkOneTerm(a, "πρόβλημα", "προβλημ");
checkOneTerm(a, "προβλήματος", "προβλημα");
checkOneTerm(a, "προβλήματα", "προβλημα");
checkOneTerm(a, "προβλημάτων", "προβλημα");
// ending with -ος/-ους
checkOneTerm(a, "πέλαγος", "πελαγ");
checkOneTerm(a, "πελάγους", "πελαγ");
checkOneTerm(a, "πελάγη", "πελαγ");
checkOneTerm(a, "πελάγων", "πελαγ");
// ending with -ός/-ότος
checkOneTerm(a, "γεγονός", "γεγον");
checkOneTerm(a, "γεγονότος", "γεγον");
checkOneTerm(a, "γεγονότα", "γεγον");
checkOneTerm(a, "γεγονότων", "γεγον");
// ending with -υ/-ιου
checkOneTerm(a, "βράδυ", "βραδ");
checkOneTerm(a, "βράδι", "βραδ");
checkOneTerm(a, "βραδιού", "βραδ");
checkOneTerm(a, "βράδια", "βραδ");
checkOneTerm(a, "βραδιών", "βραδ");
// ending with -υ/-ατος
// note: nom. doesn't conflate
checkOneTerm(a, "δόρυ", "δορ");
checkOneTerm(a, "δόρατος", "δορατ");
checkOneTerm(a, "δόρατα", "δορατ");
checkOneTerm(a, "δοράτων", "δορατ");
// ending with -ας
checkOneTerm(a, "κρέας", "κρε");
checkOneTerm(a, "κρέατος", "κρε");
checkOneTerm(a, "κρέατα", "κρε");
checkOneTerm(a, "κρεάτων", "κρε");
// ending with -ως
checkOneTerm(a, "λυκόφως", "λυκοφω");
checkOneTerm(a, "λυκόφωτος", "λυκοφω");
checkOneTerm(a, "λυκόφωτα", "λυκοφω");
checkOneTerm(a, "λυκοφώτων", "λυκοφω");
// ending with -ον/-ου
// note: nom. doesn't conflate
checkOneTerm(a, "μέσον", "μεσον");
checkOneTerm(a, "μέσου", "μεσ");
checkOneTerm(a, "μέσα", "μεσ");
checkOneTerm(a, "μέσων", "μεσ");
// ending in -ον/-οντος
// note: nom. doesn't conflate
checkOneTerm(a, "ενδιαφέρον", "ενδιαφερον");
checkOneTerm(a, "ενδιαφέροντος", "ενδιαφεροντ");
checkOneTerm(a, "ενδιαφέροντα", "ενδιαφεροντ");
checkOneTerm(a, "ενδιαφερόντων", "ενδιαφεροντ");
// ending with -εν/-εντος
// note: nom. doesn't conflate
checkOneTerm(a, "ανακοινωθέν", "ανακοινωθεν");
checkOneTerm(a, "ανακοινωθέντος", "ανακοινωθεντ");
checkOneTerm(a, "ανακοινωθέντα", "ανακοινωθεντ");
checkOneTerm(a, "ανακοινωθέντων", "ανακοινωθεντ");
// ending with -αν/-αντος
// note: nom. doesn't conflate
checkOneTerm(a, "σύμπαν", "συμπ");
checkOneTerm(a, "σύμπαντος", "συμπαντ");
checkOneTerm(a, "σύμπαντα", "συμπαντ");
checkOneTerm(a, "συμπάντων", "συμπαντ");
// ending with -α/-ακτος
// note: three different stems are expected for this group
checkOneTerm(a, "γάλα", "γαλ");
checkOneTerm(a, "γάλακτος", "γαλακτ");
checkOneTerm(a, "γάλατα", "γαλατ");
checkOneTerm(a, "γαλάκτων", "γαλακτ");
}
/**
 * Checks the stem expected for inflected forms of adjectives, including
 * comparative and superlative forms. Within a comment-delimited group the
 * forms are expected to share a stem unless a note says otherwise.
 */
public void testAdjectives() throws Exception {
// ending with -ής, -ές/-είς, -ή
checkOneTerm(a, "συνεχής", "συνεχ");
checkOneTerm(a, "συνεχούς", "συνεχ");
checkOneTerm(a, "συνεχή", "συνεχ");
checkOneTerm(a, "συνεχών", "συνεχ");
checkOneTerm(a, "συνεχείς", "συνεχ");
checkOneTerm(a, "συνεχές", "συνεχ");
// ending with -ης, -ες/-εις, -η
checkOneTerm(a, "συνήθης", "συνηθ");
checkOneTerm(a, "συνήθους", "συνηθ");
checkOneTerm(a, "συνήθη", "συνηθ");
// note: doesn't conflate
checkOneTerm(a, "συνήθεις", "συν");
checkOneTerm(a, "συνήθων", "συνηθ");
checkOneTerm(a, "σύνηθες", "συνηθ");
// ending with -υς, -υ/-εις, -ια
// note: not all forms conflate (three different stems are expected below)
checkOneTerm(a, "βαθύς", "βαθ");
checkOneTerm(a, "βαθέος", "βαθε");
checkOneTerm(a, "βαθύ", "βαθ");
checkOneTerm(a, "βαθείς", "βαθ");
checkOneTerm(a, "βαθέων", "βαθ");
checkOneTerm(a, "βαθιά", "βαθ");
checkOneTerm(a, "βαθιάς", "βαθι");
checkOneTerm(a, "βαθιές", "βαθι");
checkOneTerm(a, "βαθιών", "βαθ");
checkOneTerm(a, "βαθέα", "βαθε");
// comparative/superlative: expected to reduce to the stem of the base form
checkOneTerm(a, "ψηλός", "ψηλ");
checkOneTerm(a, "ψηλότερος", "ψηλ");
checkOneTerm(a, "ψηλότατος", "ψηλ");
checkOneTerm(a, "ωραίος", "ωραι");
checkOneTerm(a, "ωραιότερος", "ωραι");
checkOneTerm(a, "ωραιότατος", "ωραι");
checkOneTerm(a, "επιεικής", "επιεικ");
checkOneTerm(a, "επιεικέστερος", "επιεικ");
checkOneTerm(a, "επιεικέστατος", "επιεικ");
}
/**
 * Checks the stem expected for conjugated verb forms and participles. Each
 * comment-delimited group covers one conjugation pattern; several distinct
 * stems per verb are expected (see the note on the first line of the body).
 */
public void testVerbs() throws Exception {
// note, past/present verb stems will not conflate (from the paper)
//-ω,-α/-.ω,-.α
checkOneTerm(a, "ορίζω", "οριζ");
checkOneTerm(a, "όριζα", "οριζ");
checkOneTerm(a, "όριζε", "οριζ");
checkOneTerm(a, "ορίζοντας", "οριζ");
checkOneTerm(a, "ορίζομαι", "οριζ");
checkOneTerm(a, "οριζόμουν", "οριζ");
checkOneTerm(a, "ορίζεσαι", "οριζ");
checkOneTerm(a, "όρισα", "ορισ");
checkOneTerm(a, "ορίσω", "ορισ");
checkOneTerm(a, "όρισε", "ορισ");
checkOneTerm(a, "ορίσει", "ορισ");
checkOneTerm(a, "ορίστηκα", "οριστ");
checkOneTerm(a, "οριστώ", "οριστ");
checkOneTerm(a, "οριστείς", "οριστ");
checkOneTerm(a, "οριστεί", "οριστ");
checkOneTerm(a, "ορισμένο", "ορισμεν");
checkOneTerm(a, "ορισμένη", "ορισμεν");
checkOneTerm(a, "ορισμένος", "ορισμεν");
// -ω,-α/-ξω,-ξα
checkOneTerm(a, "ανοίγω", "ανοιγ");
checkOneTerm(a, "άνοιγα", "ανοιγ");
checkOneTerm(a, "άνοιγε", "ανοιγ");
checkOneTerm(a, "ανοίγοντας", "ανοιγ");
checkOneTerm(a, "ανοίγομαι", "ανοιγ");
checkOneTerm(a, "ανοιγόμουν", "ανοιγ");
checkOneTerm(a, "άνοιξα", "ανοιξ");
checkOneTerm(a, "ανοίξω", "ανοιξ");
checkOneTerm(a, "άνοιξε", "ανοιξ");
checkOneTerm(a, "ανοίξει", "ανοιξ");
checkOneTerm(a, "ανοίχτηκα", "ανοιχτ");
checkOneTerm(a, "ανοιχτώ", "ανοιχτ");
// NOTE(review): same input as the ανοίχτηκα assertion two lines above;
// looks like an accidental repeat - confirm before removing
checkOneTerm(a, "ανοίχτηκα", "ανοιχτ");
checkOneTerm(a, "ανοιχτείς", "ανοιχτ");
checkOneTerm(a, "ανοιχτεί", "ανοιχτ");
checkOneTerm(a, "ανοίξου", "ανοιξ");
//-ώ/-άω,-ούσα/-άσω,-ασα
checkOneTerm(a, "περνώ", "περν");
checkOneTerm(a, "περνάω", "περν");
checkOneTerm(a, "περνούσα", "περν");
checkOneTerm(a, "πέρναγα", "περν");
checkOneTerm(a, "πέρνα", "περν");
checkOneTerm(a, "περνώντας", "περν");
checkOneTerm(a, "πέρασα", "περασ");
checkOneTerm(a, "περάσω", "περασ");
checkOneTerm(a, "πέρασε", "περασ");
checkOneTerm(a, "περάσει", "περασ");
checkOneTerm(a, "περνιέμαι", "περν");
checkOneTerm(a, "περνιόμουν", "περν");
checkOneTerm(a, "περάστηκα", "περαστ");
checkOneTerm(a, "περαστώ", "περαστ");
checkOneTerm(a, "περαστείς", "περαστ");
checkOneTerm(a, "περαστεί", "περαστ");
checkOneTerm(a, "περασμένο", "περασμεν");
checkOneTerm(a, "περασμένη", "περασμεν");
checkOneTerm(a, "περασμένος", "περασμεν");
// -ώ/-άω,-ούσα/-άξω,-αξα
checkOneTerm(a, "πετώ", "πετ");
checkOneTerm(a, "πετάω", "πετ");
checkOneTerm(a, "πετούσα", "πετ");
checkOneTerm(a, "πέταγα", "πετ");
checkOneTerm(a, "πέτα", "πετ");
checkOneTerm(a, "πετώντας", "πετ");
checkOneTerm(a, "πετιέμαι", "πετ");
checkOneTerm(a, "πετιόμουν", "πετ");
checkOneTerm(a, "πέταξα", "πεταξ");
checkOneTerm(a, "πετάξω", "πεταξ");
checkOneTerm(a, "πέταξε", "πεταξ");
checkOneTerm(a, "πετάξει", "πεταξ");
checkOneTerm(a, "πετάχτηκα", "πεταχτ");
checkOneTerm(a, "πεταχτώ", "πεταχτ");
checkOneTerm(a, "πεταχτείς", "πεταχτ");
checkOneTerm(a, "πεταχτεί", "πεταχτ");
checkOneTerm(a, "πεταμένο", "πεταμεν");
checkOneTerm(a, "πεταμένη", "πεταμεν");
checkOneTerm(a, "πεταμένος", "πεταμεν");
// -ώ/-άω,-ούσα / -έσω,-εσα
checkOneTerm(a, "καλώ", "καλ");
checkOneTerm(a, "καλούσα", "καλ");
checkOneTerm(a, "καλείς", "καλ");
checkOneTerm(a, "καλώντας", "καλ");
checkOneTerm(a, "καλούμαι", "καλ");
// pass. imperfect / imp. progressive doesn't conflate
checkOneTerm(a, "καλούμουν", "καλουμ");
checkOneTerm(a, "καλείσαι", "καλεισα");
checkOneTerm(a, "καλέστηκα", "καλεστ");
checkOneTerm(a, "καλεστώ", "καλεστ");
checkOneTerm(a, "καλεστείς", "καλεστ");
checkOneTerm(a, "καλεστεί", "καλεστ");
checkOneTerm(a, "καλεσμένο", "καλεσμεν");
checkOneTerm(a, "καλεσμένη", "καλεσμεν");
checkOneTerm(a, "καλεσμένος", "καλεσμεν");
checkOneTerm(a, "φορώ", "φορ");
checkOneTerm(a, "φοράω", "φορ");
checkOneTerm(a, "φορούσα", "φορ");
checkOneTerm(a, "φόραγα", "φορ");
checkOneTerm(a, "φόρα", "φορ");
checkOneTerm(a, "φορώντας", "φορ");
checkOneTerm(a, "φοριέμαι", "φορ");
checkOneTerm(a, "φοριόμουν", "φορ");
checkOneTerm(a, "φοριέσαι", "φορ");
checkOneTerm(a, "φόρεσα", "φορεσ");
checkOneTerm(a, "φορέσω", "φορεσ");
checkOneTerm(a, "φόρεσε", "φορεσ");
checkOneTerm(a, "φορέσει", "φορεσ");
checkOneTerm(a, "φορέθηκα", "φορεθ");
checkOneTerm(a, "φορεθώ", "φορεθ");
checkOneTerm(a, "φορεθείς", "φορεθ");
checkOneTerm(a, "φορεθεί", "φορεθ");
checkOneTerm(a, "φορεμένο", "φορεμεν");
checkOneTerm(a, "φορεμένη", "φορεμεν");
checkOneTerm(a, "φορεμένος", "φορεμεν");
// -ώ/-άω,-ούσα / -ήσω,-ησα
checkOneTerm(a, "κρατώ", "κρατ");
checkOneTerm(a, "κρατάω", "κρατ");
checkOneTerm(a, "κρατούσα", "κρατ");
checkOneTerm(a, "κράταγα", "κρατ");
checkOneTerm(a, "κράτα", "κρατ");
checkOneTerm(a, "κρατώντας", "κρατ");
checkOneTerm(a, "κράτησα", "κρατ");
checkOneTerm(a, "κρατήσω", "κρατ");
checkOneTerm(a, "κράτησε", "κρατ");
checkOneTerm(a, "κρατήσει", "κρατ");
checkOneTerm(a, "κρατούμαι", "κρατ");
checkOneTerm(a, "κρατιέμαι", "κρατ");
// this imperfect form doesn't conflate
checkOneTerm(a, "κρατούμουν", "κρατουμ");
checkOneTerm(a, "κρατιόμουν", "κρατ");
// this imp. prog form doesn't conflate
checkOneTerm(a, "κρατείσαι", "κρατεισα");
checkOneTerm(a, "κρατήθηκα", "κρατ");
checkOneTerm(a, "κρατηθώ", "κρατ");
checkOneTerm(a, "κρατηθείς", "κρατ");
checkOneTerm(a, "κρατηθεί", "κρατ");
checkOneTerm(a, "κρατήσου", "κρατ");
checkOneTerm(a, "κρατημένο", "κρατημεν");
checkOneTerm(a, "κρατημένη", "κρατημεν");
checkOneTerm(a, "κρατημένος", "κρατημεν");
// -.μαι,-.μουν / -.ώ,-.ηκα
checkOneTerm(a, "κοιμάμαι", "κοιμ");
checkOneTerm(a, "κοιμόμουν", "κοιμ");
checkOneTerm(a, "κοιμάσαι", "κοιμ");
checkOneTerm(a, "κοιμήθηκα", "κοιμ");
checkOneTerm(a, "κοιμηθώ", "κοιμ");
checkOneTerm(a, "κοιμήσου", "κοιμ");
checkOneTerm(a, "κοιμηθεί", "κοιμ");
checkOneTerm(a, "κοιμισμένο", "κοιμισμεν");
checkOneTerm(a, "κοιμισμένη", "κοιμισμεν");
checkOneTerm(a, "κοιμισμένος", "κοιμισμεν");
}
/**
 * Checks a handful of individual words against their expected stems. Several
 * inputs come in pairs where the second input is the stem expected for the
 * first, so the second assertion of such a pair checks that the stem itself
 * is left unchanged.
 * NOTE(review): judging by the method name these presumably exercise the
 * stemmer's exception lists - confirm against GreekStemmer.
 */
public void testExceptions() throws Exception {
checkOneTerm(a, "καθεστώτα", "καθεστ");
checkOneTerm(a, "καθεστώτος", "καθεστ");
checkOneTerm(a, "καθεστώς", "καθεστ");
checkOneTerm(a, "καθεστώτων", "καθεστ");
// word and its own expected stem
checkOneTerm(a, "χουμε", "χουμ");
checkOneTerm(a, "χουμ", "χουμ");
// word and its own expected stem
checkOneTerm(a, "υποταγεσ", "υποταγ");
checkOneTerm(a, "υποταγ", "υποταγ");
// word and its own expected stem
checkOneTerm(a, "εμετε", "εμετ");
checkOneTerm(a, "εμετ", "εμετ");
checkOneTerm(a, "αρχοντασ", "αρχοντ");
checkOneTerm(a, "αρχοντων", "αρχοντ");
}
}

View File

@ -33,6 +33,7 @@ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
@Override
public void init(Map<String, String> args) {
super.init(args);
assureMatchVersion();
if (args.containsKey("charset"))
throw new SolrException(ErrorCode.SERVER_ERROR,
"The charset parameter is no longer supported. "

View File

@ -0,0 +1,30 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.el.GreekStemFilter;
/**
 * Factory for {@link GreekStemFilter}.
 * <p>
 * The factory takes no configuration of its own; it only wraps the incoming
 * stream.
 * NOTE(review): the accompanying factory test lowercases the text with
 * GreekLowerCaseFilter before this filter - presumably the stemmer expects
 * normalized input; confirm against GreekStemFilter's documentation.
 */
public class GreekStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps {@code input} in a new {@link GreekStemFilter}.
 *
 * @param input the token stream to stem
 * @return a new {@link GreekStemFilter} reading from {@code input}
 */
public TokenStream create(TokenStream input) {
return new GreekStemFilter(input);
}
}
}

View File

@ -31,10 +31,11 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually lowercases (and a bit more) greek text.
*/
public void testStemming() throws Exception {
public void testNormalization() throws Exception {
Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
}

View File

@ -0,0 +1,40 @@
package org.apache.solr.analysis;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Simple tests to ensure the Greek stem filter factory is working.
*/
public class TestGreekStemFilterFactory extends BaseTokenTestCase {
public void testStemming() throws Exception {
Reader reader = new StringReader("άνθρωπος");
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
TokenStream normalized = new GreekLowerCaseFilter(DEFAULT_VERSION, tokenizer);
GreekStemFilterFactory factory = new GreekStemFilterFactory();
TokenStream stream = factory.create(normalized);
assertTokenStreamContents(stream, new String[] { "ανθρωπ" });
}
}