mirror of https://github.com/apache/lucene.git
LUCENE-2463: Improve Greek analysis
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@945090 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
97a95c3a6a
commit
acbf053b7c
|
@ -157,6 +157,9 @@ New features
|
|||
* LUCENE-2393: The HighFreqTerms tool (in misc) can now optionally
|
||||
also include the total termFreq. (Tom Burton-West via Mike McCandless)
|
||||
|
||||
* LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words
|
||||
when Version is set to 3.1 or higher. (Robert Muir)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -16,9 +16,7 @@ package org.apache.lucene.analysis.el;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -28,8 +26,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
|
|||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -45,7 +43,7 @@ import java.util.Set;
|
|||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating GreekAnalyzer:
|
||||
* <ul>
|
||||
* <li> As of 3.1, StandardFilter is used by default.
|
||||
* <li> As of 3.1, StandardFilter and GreekStemmer are used by default.
|
||||
* <li> As of 2.9, StopFilter preserves position
|
||||
* increments
|
||||
* </ul>
|
||||
|
@ -53,73 +51,74 @@ import java.util.Set;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public final class GreekAnalyzer extends StopwordAnalyzerBase
|
||||
{
|
||||
/**
|
||||
* List of typical Greek stopwords.
|
||||
*/
|
||||
private static final String[] GREEK_STOP_WORDS = {
|
||||
"ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και",
|
||||
"κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον",
|
||||
"στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ",
|
||||
"παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην",
|
||||
"επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο",
|
||||
"ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι",
|
||||
"αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο",
|
||||
"εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
|
||||
"ισωσ", "οσο", "οτι"
|
||||
};
|
||||
public final class GreekAnalyzer extends StopwordAnalyzerBase {
|
||||
/** File containing default Greek stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
/**
|
||||
* Returns a set of default Greek-stopwords
|
||||
* @return a set of default Greek-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
private static final Set<?> DEFAULT_SET;
|
||||
|
||||
/**
|
||||
* Returns a set of default Greek-stopwords
|
||||
* @return a set of default Greek-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
static {
  try {
    // Load the bundled stopword file; lines starting with "#" are passed as the comment marker.
    DEFAULT_SET = loadStopwordSet(false, GreekAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
  } catch (IOException ex) {
    // default set should always be present as it is part of the
    // distribution (JAR); keep the original exception as the cause so a
    // packaging problem can actually be diagnosed from the stack trace
    throw new RuntimeException("Unable to load default stopword set", ex);
  }
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||
Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
public GreekAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @param stopwords Array of stopwords to use.
|
||||
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public GreekAnalyzer(Version matchVersion, String... stopwords)
|
||||
{
|
||||
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public GreekAnalyzer(Version matchVersion, Map<?,?> stopwords)
|
||||
{
|
||||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words.
|
||||
* @param matchVersion Lucene compatibility version,
|
||||
* See <a href="#version">above</a>
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* <p>
|
||||
* <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
|
||||
* {@link GreekLowerCaseFilter} for best results.
|
||||
*
|
||||
* @param matchVersion Lucene compatibility version,
|
||||
* See <a href="#version">above</a>
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
super(matchVersion, stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @param stopwords Array of stopwords to use.
|
||||
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public GreekAnalyzer(Version matchVersion, String... stopwords) {
|
||||
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public GreekAnalyzer(Version matchVersion, Map<?,?> stopwords) {
|
||||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates
|
||||
* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
|
@ -127,16 +126,19 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase
|
|||
*
|
||||
* @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* built from a {@link StandardTokenizer} filtered with
|
||||
* {@link GreekLowerCaseFilter}, {@link StandardFilter} and
|
||||
* {@link StopFilter}
|
||||
* {@link GreekLowerCaseFilter}, {@link StandardFilter},
|
||||
* {@link StopFilter}, and {@link GreekStemFilter}
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new GreekLowerCaseFilter(source);
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
result = new StandardFilter(result);
|
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
||||
}
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
result = new StandardFilter(result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
result = new GreekStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,97 +20,115 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.util.CharacterUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Normalizes token text to lower case, removes some Greek diacritics,
|
||||
* and standardizes final sigma to sigma.
|
||||
*
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating GreekLowerCaseFilter:
|
||||
* <ul>
|
||||
* <li> As of 3.1, supplementary characters are properly lowercased.
|
||||
* </ul>
|
||||
*/
|
||||
public final class GreekLowerCaseFilter extends TokenFilter
|
||||
{
|
||||
private TermAttribute termAtt;
|
||||
|
||||
public GreekLowerCaseFilter(TokenStream in)
|
||||
{
|
||||
super(in);
|
||||
termAtt = addAttribute(TermAttribute.class);
|
||||
}
|
||||
public final class GreekLowerCaseFilter extends TokenFilter {
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final CharacterUtils charUtils;
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
char[] chArray = termAtt.termBuffer();
|
||||
int chLen = termAtt.termLength();
|
||||
// TODO: iterate codepoints to support supp. characters
|
||||
for (int i = 0; i < chLen; i++)
|
||||
{
|
||||
chArray[i] = (char) lowerCase(chArray[i]);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
/** @deprecated Use {@link #GreekLowerCaseFilter(Version, TokenStream)} instead. */
|
||||
@Deprecated
|
||||
public GreekLowerCaseFilter(TokenStream in) {
|
||||
this(Version.LUCENE_30, in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a GreekLowerCaseFilter that normalizes Greek token text.
|
||||
*
|
||||
* @param matchVersion Lucene compatibility version,
|
||||
* See <a href="#version">above</a>
|
||||
* @param in TokenStream to filter
|
||||
*/
|
||||
public GreekLowerCaseFilter(Version matchVersion, TokenStream in) {
|
||||
super(in);
|
||||
this.charUtils = CharacterUtils.getInstance(matchVersion);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
char[] chArray = termAtt.buffer();
|
||||
int chLen = termAtt.length();
|
||||
for (int i = 0; i < chLen;) {
|
||||
i += Character.toChars(
|
||||
lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
private int lowerCase(int codepoint) {
|
||||
switch(codepoint) {
|
||||
/* There are two lowercase forms of sigma:
|
||||
* U+03C2: small final sigma (end of word)
|
||||
* U+03C3: small sigma (otherwise)
|
||||
*
|
||||
* Standardize both to U+03C3
|
||||
*/
|
||||
case '\u03C2': /* small final sigma */
|
||||
return '\u03C3'; /* small sigma */
|
||||
}
|
||||
|
||||
private int lowerCase(int codepoint) {
|
||||
switch(codepoint) {
|
||||
/* There are two lowercase forms of sigma:
|
||||
* U+03C2: small final sigma (end of word)
|
||||
* U+03C3: small sigma (otherwise)
|
||||
*
|
||||
* Standardize both to U+03C3
|
||||
*/
|
||||
case '\u03C2': /* small final sigma */
|
||||
return '\u03C3'; /* small sigma */
|
||||
|
||||
/* Some greek characters contain diacritics.
|
||||
* This filter removes these, converting to the lowercase base form.
|
||||
*/
|
||||
/* Some greek characters contain diacritics.
|
||||
* This filter removes these, converting to the lowercase base form.
|
||||
*/
|
||||
|
||||
case '\u0386': /* capital alpha with tonos */
|
||||
case '\u03AC': /* small alpha with tonos */
|
||||
return '\u03B1'; /* small alpha */
|
||||
|
||||
case '\u0388': /* capital epsilon with tonos */
|
||||
case '\u03AD': /* small epsilon with tonos */
|
||||
return '\u03B5'; /* small epsilon */
|
||||
|
||||
case '\u0389': /* capital eta with tonos */
|
||||
case '\u03AE': /* small eta with tonos */
|
||||
return '\u03B7'; /* small eta */
|
||||
case '\u0386': /* capital alpha with tonos */
|
||||
case '\u03AC': /* small alpha with tonos */
|
||||
return '\u03B1'; /* small alpha */
|
||||
|
||||
case '\u038A': /* capital iota with tonos */
|
||||
case '\u03AA': /* capital iota with dialytika */
|
||||
case '\u03AF': /* small iota with tonos */
|
||||
case '\u03CA': /* small iota with dialytika */
|
||||
case '\u0390': /* small iota with dialytika and tonos */
|
||||
return '\u03B9'; /* small iota */
|
||||
|
||||
case '\u038E': /* capital upsilon with tonos */
|
||||
case '\u03AB': /* capital upsilon with dialytika */
|
||||
case '\u03CD': /* small upsilon with tonos */
|
||||
case '\u03CB': /* small upsilon with dialytika */
|
||||
case '\u03B0': /* small upsilon with dialytika and tonos */
|
||||
return '\u03C5'; /* small upsilon */
|
||||
|
||||
case '\u038C': /* capital omicron with tonos */
|
||||
case '\u03CC': /* small omicron with tonos */
|
||||
return '\u03BF'; /* small omicron */
|
||||
|
||||
case '\u038F': /* capital omega with tonos */
|
||||
case '\u03CE': /* small omega with tonos */
|
||||
return '\u03C9'; /* small omega */
|
||||
|
||||
/* The previous implementation did the conversion below.
|
||||
* Only implemented for backwards compatibility with old indexes.
|
||||
*/
|
||||
|
||||
case '\u03A2': /* reserved */
|
||||
return '\u03C2'; /* small final sigma */
|
||||
|
||||
default:
|
||||
return Character.toLowerCase(codepoint);
|
||||
}
|
||||
case '\u0388': /* capital epsilon with tonos */
|
||||
case '\u03AD': /* small epsilon with tonos */
|
||||
return '\u03B5'; /* small epsilon */
|
||||
|
||||
case '\u0389': /* capital eta with tonos */
|
||||
case '\u03AE': /* small eta with tonos */
|
||||
return '\u03B7'; /* small eta */
|
||||
|
||||
case '\u038A': /* capital iota with tonos */
|
||||
case '\u03AA': /* capital iota with dialytika */
|
||||
case '\u03AF': /* small iota with tonos */
|
||||
case '\u03CA': /* small iota with dialytika */
|
||||
case '\u0390': /* small iota with dialytika and tonos */
|
||||
return '\u03B9'; /* small iota */
|
||||
|
||||
case '\u038E': /* capital upsilon with tonos */
|
||||
case '\u03AB': /* capital upsilon with dialytika */
|
||||
case '\u03CD': /* small upsilon with tonos */
|
||||
case '\u03CB': /* small upsilon with dialytika */
|
||||
case '\u03B0': /* small upsilon with dialytika and tonos */
|
||||
return '\u03C5'; /* small upsilon */
|
||||
|
||||
case '\u038C': /* capital omicron with tonos */
|
||||
case '\u03CC': /* small omicron with tonos */
|
||||
return '\u03BF'; /* small omicron */
|
||||
|
||||
case '\u038F': /* capital omega with tonos */
|
||||
case '\u03CE': /* small omega with tonos */
|
||||
return '\u03C9'; /* small omega */
|
||||
|
||||
/* The previous implementation did the conversion below.
|
||||
* Only implemented for backwards compatibility with old indexes.
|
||||
*/
|
||||
|
||||
case '\u03A2': /* reserved */
|
||||
return '\u03C2'; /* small final sigma */
|
||||
|
||||
default:
|
||||
return Character.toLowerCase(codepoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
package org.apache.lucene.analysis.el;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.KeywordMarkerFilter; // for javadoc
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link GreekStemmer} to stem Greek
|
||||
* words.
|
||||
* <p>
|
||||
* To prevent terms from being stemmed use an instance of
|
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
* <p>
|
||||
* NOTE: Input is expected to be casefolded for Greek (including folding of final
|
||||
* sigma to sigma), and with diacritics removed. This can be achieved by using
|
||||
* either {@link GreekLowerCaseFilter} or ICUFoldingFilter before GreekStemFilter.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class GreekStemFilter extends TokenFilter {
|
||||
private final GreekStemmer stemmer = new GreekStemmer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public GreekStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if(!keywordAttr.isKeyword()) {
|
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||
termAtt.setLength(newlen);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,819 @@
|
|||
package org.apache.lucene.analysis.el;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A stemmer for Greek words, according to: <i>Development of a Stemmer for the
|
||||
* Greek Language.</i> Georgios Ntais
|
||||
* <p>
|
||||
* NOTE: Input is expected to be casefolded for Greek (including folding of final
|
||||
* sigma to sigma), and with diacritics removed. This can be achieved with
|
||||
* either {@link GreekLowerCaseFilter} or ICUFoldingFilter.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class GreekStemmer {
|
||||
public int stem(char s[], int len) {
|
||||
if (len < 4) // too short
|
||||
return len;
|
||||
|
||||
final int origLen = len;
|
||||
// "short rules": if it hits one of these, it skips the "long list"
|
||||
len = rule0(s, len);
|
||||
len = rule1(s, len);
|
||||
len = rule2(s, len);
|
||||
len = rule3(s, len);
|
||||
len = rule4(s, len);
|
||||
len = rule5(s, len);
|
||||
len = rule6(s, len);
|
||||
len = rule7(s, len);
|
||||
len = rule8(s, len);
|
||||
len = rule9(s, len);
|
||||
len = rule10(s, len);
|
||||
len = rule11(s, len);
|
||||
len = rule12(s, len);
|
||||
len = rule13(s, len);
|
||||
len = rule14(s, len);
|
||||
len = rule15(s, len);
|
||||
len = rule16(s, len);
|
||||
len = rule17(s, len);
|
||||
len = rule18(s, len);
|
||||
len = rule19(s, len);
|
||||
len = rule20(s, len);
|
||||
// "long list"
|
||||
if (len == origLen)
|
||||
len = rule21(s, len);
|
||||
|
||||
return rule22(s, len);
|
||||
}
|
||||
|
||||
private int rule0(char s[], int len) {
|
||||
if (len > 9 && (endsWith(s, len, "καθεστωτοσ")
|
||||
|| endsWith(s, len, "καθεστωτων")))
|
||||
return len - 4;
|
||||
|
||||
if (len > 8 && (endsWith(s, len, "γεγονοτοσ")
|
||||
|| endsWith(s, len, "γεγονοτων")))
|
||||
return len - 4;
|
||||
|
||||
if (len > 8 && endsWith(s, len, "καθεστωτα"))
|
||||
return len - 3;
|
||||
|
||||
if (len > 7 && (endsWith(s, len, "τατογιου")
|
||||
|| endsWith(s, len, "τατογιων")))
|
||||
return len - 4;
|
||||
|
||||
if (len > 7 && endsWith(s, len, "γεγονοτα"))
|
||||
return len - 3;
|
||||
|
||||
if (len > 7 && endsWith(s, len, "καθεστωσ"))
|
||||
return len - 2;
|
||||
|
||||
if (len > 6 && (endsWith(s, len, "σκαγιου"))
|
||||
|| endsWith(s, len, "σκαγιων")
|
||||
|| endsWith(s, len, "ολογιου")
|
||||
|| endsWith(s, len, "ολογιων")
|
||||
|| endsWith(s, len, "κρεατοσ")
|
||||
|| endsWith(s, len, "κρεατων")
|
||||
|| endsWith(s, len, "περατοσ")
|
||||
|| endsWith(s, len, "περατων")
|
||||
|| endsWith(s, len, "τερατοσ")
|
||||
|| endsWith(s, len, "τερατων"))
|
||||
return len - 4;
|
||||
|
||||
if (len > 6 && endsWith(s, len, "τατογια"))
|
||||
return len - 3;
|
||||
|
||||
if (len > 6 && endsWith(s, len, "γεγονοσ"))
|
||||
return len - 2;
|
||||
|
||||
if (len > 5 && (endsWith(s, len, "φαγιου")
|
||||
|| endsWith(s, len, "φαγιων")
|
||||
|| endsWith(s, len, "σογιου")
|
||||
|| endsWith(s, len, "σογιων")))
|
||||
return len - 4;
|
||||
|
||||
if (len > 5 && (endsWith(s, len, "σκαγια")
|
||||
|| endsWith(s, len, "ολογια")
|
||||
|| endsWith(s, len, "κρεατα")
|
||||
|| endsWith(s, len, "περατα")
|
||||
|| endsWith(s, len, "τερατα")))
|
||||
return len - 3;
|
||||
|
||||
if (len > 4 && (endsWith(s, len, "φαγια")
|
||||
|| endsWith(s, len, "σογια")
|
||||
|| endsWith(s, len, "φωτοσ")
|
||||
|| endsWith(s, len, "φωτων")))
|
||||
return len - 3;
|
||||
|
||||
if (len > 4 && (endsWith(s, len, "κρεασ")
|
||||
|| endsWith(s, len, "περασ")
|
||||
|| endsWith(s, len, "τερασ")))
|
||||
return len - 2;
|
||||
|
||||
if (len > 3 && endsWith(s, len, "φωτα"))
|
||||
return len - 2;
|
||||
|
||||
if (len > 2 && endsWith(s, len, "φωσ"))
|
||||
return len - 1;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule1(char s[], int len) {
|
||||
if (len > 4 && (endsWith(s, len, "αδεσ") || endsWith(s, len, "αδων"))) {
|
||||
len -= 4;
|
||||
if (!(endsWith(s, len, "οκ") ||
|
||||
endsWith(s, len, "μαμ") ||
|
||||
endsWith(s, len, "μαν") ||
|
||||
endsWith(s, len, "μπαμπ") ||
|
||||
endsWith(s, len, "πατερ") ||
|
||||
endsWith(s, len, "γιαγι") ||
|
||||
endsWith(s, len, "νταντ") ||
|
||||
endsWith(s, len, "κυρ") ||
|
||||
endsWith(s, len, "θει") ||
|
||||
endsWith(s, len, "πεθερ")))
|
||||
len += 2; // add back -αδ
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule2(char s[], int len) {
|
||||
if (len > 4 && (endsWith(s, len, "εδεσ") || endsWith(s, len, "εδων"))) {
|
||||
len -= 4;
|
||||
if (endsWith(s, len, "οπ") ||
|
||||
endsWith(s, len, "ιπ") ||
|
||||
endsWith(s, len, "εμπ") ||
|
||||
endsWith(s, len, "υπ") ||
|
||||
endsWith(s, len, "γηπ") ||
|
||||
endsWith(s, len, "δαπ") ||
|
||||
endsWith(s, len, "κρασπ") ||
|
||||
endsWith(s, len, "μιλ"))
|
||||
len += 2; // add back -εδ
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule3(char s[], int len) {
|
||||
if (len > 5 && (endsWith(s, len, "ουδεσ") || endsWith(s, len, "ουδων"))) {
|
||||
len -= 5;
|
||||
if (endsWith(s, len, "αρκ") ||
|
||||
endsWith(s, len, "καλιακ") ||
|
||||
endsWith(s, len, "πεταλ") ||
|
||||
endsWith(s, len, "λιχ") ||
|
||||
endsWith(s, len, "πλεξ") ||
|
||||
endsWith(s, len, "σκ") ||
|
||||
endsWith(s, len, "σ") ||
|
||||
endsWith(s, len, "φλ") ||
|
||||
endsWith(s, len, "φρ") ||
|
||||
endsWith(s, len, "βελ") ||
|
||||
endsWith(s, len, "λουλ") ||
|
||||
endsWith(s, len, "χν") ||
|
||||
endsWith(s, len, "σπ") ||
|
||||
endsWith(s, len, "τραγ") ||
|
||||
endsWith(s, len, "φε"))
|
||||
len += 3; // add back -ουδ
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
|
||||
false);
|
||||
|
||||
private int rule4(char s[], int len) {
|
||||
if (len > 3 && (endsWith(s, len, "εωσ") || endsWith(s, len, "εων"))) {
|
||||
len -= 3;
|
||||
if (exc4.contains(s, 0, len))
|
||||
len++; // add back -ε
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule5(char s[], int len) {
|
||||
if (len > 2 && endsWith(s, len, "ια")) {
|
||||
len -= 2;
|
||||
if (endsWithVowel(s, len))
|
||||
len++; // add back -ι
|
||||
} else if (len > 3 && (endsWith(s, len, "ιου") || endsWith(s, len, "ιων"))) {
|
||||
len -= 3;
|
||||
if (endsWithVowel(s, len))
|
||||
len++; // add back -ι
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
|
||||
"αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
|
||||
"μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
|
||||
"πετσ", "πιτσ", "πικαντ", "πλιατσ", "ποστελν", "πρωτοδ", "σερτ",
|
||||
"συναδ", "τσαμ", "υποδ", "φιλον", "φυλοδ", "χασ"),
|
||||
false);
|
||||
|
||||
private int rule6(char s[], int len) {
|
||||
boolean removed = false;
|
||||
if (len > 3 && (endsWith(s, len, "ικα") || endsWith(s, len, "ικο"))) {
|
||||
len -= 3;
|
||||
removed = true;
|
||||
} else if (len > 4 && (endsWith(s, len, "ικου") || endsWith(s, len, "ικων"))) {
|
||||
len -= 4;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed) {
|
||||
if (endsWithVowel(s, len) || exc6.contains(s, 0, len))
|
||||
len += 2; // add back -ικ
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
|
||||
"πεθ", "πικρ", "ποτ", "σιχ", "χ"),
|
||||
false);
|
||||
|
||||
private int rule7(char s[], int len) {
|
||||
if (len == 5 && endsWith(s, len, "αγαμε"))
|
||||
return len - 1;
|
||||
|
||||
if (len > 7 && endsWith(s, len, "ηθηκαμε"))
|
||||
len -= 7;
|
||||
else if (len > 6 && endsWith(s, len, "ουσαμε"))
|
||||
len -= 6;
|
||||
else if (len > 5 && (endsWith(s, len, "αγαμε") ||
|
||||
endsWith(s, len, "ησαμε") ||
|
||||
endsWith(s, len, "ηκαμε")))
|
||||
len -= 5;
|
||||
|
||||
if (len > 3 && endsWith(s, len, "αμε")) {
|
||||
len -= 3;
|
||||
if (exc7.contains(s, 0, len))
|
||||
len += 2; // add back -αμ
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("τρ", "τσ"),
|
||||
false);
|
||||
|
||||
private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
|
||||
"καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
|
||||
"π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
|
||||
"τσαρλατ", "ορφ", "τσιγγ", "τσοπ", "φωτοστεφ", "χ", "ψυχοπλ", "αγ",
|
||||
"ορφ", "γαλ", "γερ", "δεκ", "διπλ", "αμερικαν", "ουρ", "πιθ",
|
||||
"πουριτ", "σ", "ζωντ", "ικ", "καστ", "κοπ", "λιχ", "λουθηρ", "μαιντ",
|
||||
"μελ", "σιγ", "σπ", "στεγ", "τραγ", "τσαγ", "φ", "ερ", "αδαπ",
|
||||
"αθιγγ", "αμηχ", "ανικ", "ανοργ", "απηγ", "απιθ", "ατσιγγ", "βασ",
|
||||
"βασκ", "βαθυγαλ", "βιομηχ", "βραχυκ", "διατ", "διαφ", "ενοργ",
|
||||
"θυσ", "καπνοβιομηχ", "καταγαλ", "κλιβ", "κοιλαρφ", "λιβ",
|
||||
"μεγλοβιομηχ", "μικροβιομηχ", "νταβ", "ξηροκλιβ", "ολιγοδαμ",
|
||||
"ολογαλ", "πενταρφ", "περηφ", "περιτρ", "πλατ", "πολυδαπ", "πολυμηχ",
|
||||
"στεφ", "ταβ", "τετ", "υπερηφ", "υποκοπ", "χαμηλοδαπ", "ψηλοταβ"),
|
||||
false);
|
||||
|
||||
private int rule8(char s[], int len) {
|
||||
boolean removed = false;
|
||||
|
||||
if (len > 8 && endsWith(s, len, "ιουντανε")) {
|
||||
len -= 8;
|
||||
removed = true;
|
||||
} else if (len > 7 && endsWith(s, len, "ιοντανε") ||
|
||||
endsWith(s, len, "ουντανε") ||
|
||||
endsWith(s, len, "ηθηκανε")) {
|
||||
len -= 7;
|
||||
removed = true;
|
||||
} else if (len > 6 && endsWith(s, len, "ιοτανε") ||
|
||||
endsWith(s, len, "οντανε") ||
|
||||
endsWith(s, len, "ουσανε")) {
|
||||
len -= 6;
|
||||
removed = true;
|
||||
} else if (len > 5 && endsWith(s, len, "αγανε") ||
|
||||
endsWith(s, len, "ησανε") ||
|
||||
endsWith(s, len, "οτανε") ||
|
||||
endsWith(s, len, "ηκανε")) {
|
||||
len -= 5;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed && exc8a.contains(s, 0, len)) {
|
||||
// add -αγαν (we removed > 4 chars so its safe)
|
||||
len += 4;
|
||||
s[len - 4] = 'α';
|
||||
s[len - 3] = 'γ';
|
||||
s[len - 2] = 'α';
|
||||
s[len - 1] = 'ν';
|
||||
}
|
||||
|
||||
if (len > 3 && endsWith(s, len, "ανε")) {
|
||||
len -= 3;
|
||||
if (endsWithVowelNoY(s, len) || exc8b.contains(s, 0, len)) {
|
||||
len += 2; // add back -αν
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
|
||||
"βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
|
||||
"σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),
|
||||
false);
|
||||
|
||||
private int rule9(char s[], int len) {
|
||||
if (len > 5 && endsWith(s, len, "ησετε"))
|
||||
len -= 5;
|
||||
|
||||
if (len > 3 && endsWith(s, len, "ετε")) {
|
||||
len -= 3;
|
||||
if (exc9.contains(s, 0, len) ||
|
||||
endsWithVowelNoY(s, len) ||
|
||||
endsWith(s, len, "οδ") ||
|
||||
endsWith(s, len, "αιρ") ||
|
||||
endsWith(s, len, "φορ") ||
|
||||
endsWith(s, len, "ταθ") ||
|
||||
endsWith(s, len, "διαθ") ||
|
||||
endsWith(s, len, "σχ") ||
|
||||
endsWith(s, len, "ενδ") ||
|
||||
endsWith(s, len, "ευρ") ||
|
||||
endsWith(s, len, "τιθ") ||
|
||||
endsWith(s, len, "υπερθ") ||
|
||||
endsWith(s, len, "ραθ") ||
|
||||
endsWith(s, len, "ενθ") ||
|
||||
endsWith(s, len, "ροθ") ||
|
||||
endsWith(s, len, "σθ") ||
|
||||
endsWith(s, len, "πυρ") ||
|
||||
endsWith(s, len, "αιν") ||
|
||||
endsWith(s, len, "συνδ") ||
|
||||
endsWith(s, len, "συν") ||
|
||||
endsWith(s, len, "συνθ") ||
|
||||
endsWith(s, len, "χωρ") ||
|
||||
endsWith(s, len, "πον") ||
|
||||
endsWith(s, len, "βρ") ||
|
||||
endsWith(s, len, "καθ") ||
|
||||
endsWith(s, len, "ευθ") ||
|
||||
endsWith(s, len, "εκθ") ||
|
||||
endsWith(s, len, "νετ") ||
|
||||
endsWith(s, len, "ρον") ||
|
||||
endsWith(s, len, "αρκ") ||
|
||||
endsWith(s, len, "βαρ") ||
|
||||
endsWith(s, len, "βολ") ||
|
||||
endsWith(s, len, "ωφελ")) {
|
||||
len += 2; // add back -ετ
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule10(char s[], int len) {
|
||||
if (len > 5 && (endsWith(s, len, "οντασ") || endsWith(s, len, "ωντασ"))) {
|
||||
len -= 5;
|
||||
if (len == 3 && endsWith(s, len, "αρχ")) {
|
||||
len += 3; // add back *ντ
|
||||
s[len - 3] = 'ο';
|
||||
}
|
||||
if (endsWith(s, len, "κρε")) {
|
||||
len += 3; // add back *ντ
|
||||
s[len - 3] = 'ω';
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule11(char s[], int len) {
|
||||
if (len > 6 && endsWith(s, len, "ομαστε")) {
|
||||
len -= 6;
|
||||
if (len == 2 && endsWith(s, len, "ον")) {
|
||||
len += 5; // add back -ομαστ
|
||||
}
|
||||
} else if (len > 7 && endsWith(s, len, "ιομαστε")) {
|
||||
len -= 7;
|
||||
if (len == 2 && endsWith(s, len, "ον")) {
|
||||
len += 5;
|
||||
s[len - 5] = 'ο';
|
||||
s[len - 4] = 'μ';
|
||||
s[len - 3] = 'α';
|
||||
s[len - 2] = 'σ';
|
||||
s[len - 1] = 'τ';
|
||||
}
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
|
||||
false);
|
||||
|
||||
private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
|
||||
false);
|
||||
|
||||
private int rule12(char s[], int len) {
|
||||
if (len > 5 && endsWith(s, len, "ιεστε")) {
|
||||
len -= 5;
|
||||
if (exc12a.contains(s, 0, len))
|
||||
len += 4; // add back -ιεστ
|
||||
}
|
||||
|
||||
if (len > 4 && endsWith(s, len, "εστε")) {
|
||||
len -= 4;
|
||||
if (exc12b.contains(s, 0, len))
|
||||
len += 3; // add back -εστ
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
|
||||
false);
|
||||
|
||||
private int rule13(char s[], int len) {
|
||||
if (len > 6 && endsWith(s, len, "ηθηκεσ")) {
|
||||
len -= 6;
|
||||
} else if (len > 5 && (endsWith(s, len, "ηθηκα") || endsWith(s, len, "ηθηκε"))) {
|
||||
len -= 5;
|
||||
}
|
||||
|
||||
boolean removed = false;
|
||||
|
||||
if (len > 4 && endsWith(s, len, "ηκεσ")) {
|
||||
len -= 4;
|
||||
removed = true;
|
||||
} else if (len > 3 && (endsWith(s, len, "ηκα") || endsWith(s, len, "ηκε"))) {
|
||||
len -= 3;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed && (exc13.contains(s, 0, len)
|
||||
|| endsWith(s, len, "σκωλ")
|
||||
|| endsWith(s, len, "σκουλ")
|
||||
|| endsWith(s, len, "ναρθ")
|
||||
|| endsWith(s, len, "σφ")
|
||||
|| endsWith(s, len, "οθ")
|
||||
|| endsWith(s, len, "πιθ"))) {
|
||||
len += 2; // add back the -ηκ
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
|
||||
"λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
|
||||
"ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
|
||||
"τσα"),
|
||||
false);
|
||||
|
||||
private int rule14(char s[], int len) {
|
||||
boolean removed = false;
|
||||
|
||||
if (len > 5 && endsWith(s, len, "ουσεσ")) {
|
||||
len -= 5;
|
||||
removed = true;
|
||||
} else if (len > 4 && (endsWith(s, len, "ουσα") || endsWith(s, len, "ουσε"))) {
|
||||
len -= 4;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed && (exc14.contains(s, 0, len)
|
||||
|| endsWithVowel(s, len)
|
||||
|| endsWith(s, len, "ποδαρ")
|
||||
|| endsWith(s, len, "βλεπ")
|
||||
|| endsWith(s, len, "πανταχ")
|
||||
|| endsWith(s, len, "φρυδ")
|
||||
|| endsWith(s, len, "μαντιλ")
|
||||
|| endsWith(s, len, "μαλλ")
|
||||
|| endsWith(s, len, "κυματ")
|
||||
|| endsWith(s, len, "λαχ")
|
||||
|| endsWith(s, len, "ληγ")
|
||||
|| endsWith(s, len, "φαγ")
|
||||
|| endsWith(s, len, "ομ")
|
||||
|| endsWith(s, len, "πρωτ"))) {
|
||||
len += 3; // add back -ουσ
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
|
||||
"αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
|
||||
"ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
|
||||
"συντ", "τ", "υποτ", "χαρ", "αειπ", "αιμοστ", "ανυπ", "αποτ",
|
||||
"αρτιπ", "διατ", "εν", "επιτ", "κροκαλοπ", "σιδηροπ", "λ", "ναυ",
|
||||
"ουλαμ", "ουρ", "π", "τρ", "μ"),
|
||||
false);
|
||||
|
||||
private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("ψοφ", "ναυλοχ"),
|
||||
false);
|
||||
|
||||
private int rule15(char s[], int len) {
|
||||
boolean removed = false;
|
||||
if (len > 4 && endsWith(s, len, "αγεσ")) {
|
||||
len -= 4;
|
||||
removed = true;
|
||||
} else if (len > 3 && (endsWith(s, len, "αγα") || endsWith(s, len, "αγε"))) {
|
||||
len -= 3;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed) {
|
||||
final boolean cond1 = exc15a.contains(s, 0, len)
|
||||
|| endsWith(s, len, "οφ")
|
||||
|| endsWith(s, len, "πελ")
|
||||
|| endsWith(s, len, "χορτ")
|
||||
|| endsWith(s, len, "λλ")
|
||||
|| endsWith(s, len, "σφ")
|
||||
|| endsWith(s, len, "ρπ")
|
||||
|| endsWith(s, len, "φρ")
|
||||
|| endsWith(s, len, "πρ")
|
||||
|| endsWith(s, len, "λοχ")
|
||||
|| endsWith(s, len, "σμην");
|
||||
|
||||
final boolean cond2 = exc15b.contains(s, 0, len)
|
||||
|| endsWith(s, len, "κολλ");
|
||||
|
||||
if (cond1 && !cond2)
|
||||
len += 2; // add back -αγ
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
|
||||
false);
|
||||
|
||||
private int rule16(char s[], int len) {
|
||||
boolean removed = false;
|
||||
if (len > 4 && endsWith(s, len, "ησου")) {
|
||||
len -= 4;
|
||||
removed = true;
|
||||
} else if (len > 3 && (endsWith(s, len, "ησε") || endsWith(s, len, "ησα"))) {
|
||||
len -= 3;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed && exc16.contains(s, 0, len))
|
||||
len += 2; // add back -ησ
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
|
||||
false);
|
||||
|
||||
private int rule17(char s[], int len) {
|
||||
if (len > 4 && endsWith(s, len, "ηστε")) {
|
||||
len -= 4;
|
||||
if (exc17.contains(s, 0, len))
|
||||
len += 3; // add back the -ηστ
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
|
||||
false);
|
||||
|
||||
private int rule18(char s[], int len) {
|
||||
boolean removed = false;
|
||||
|
||||
if (len > 6 && (endsWith(s, len, "ησουνε") || endsWith(s, len, "ηθουνε"))) {
|
||||
len -= 6;
|
||||
removed = true;
|
||||
} else if (len > 4 && endsWith(s, len, "ουνε")) {
|
||||
len -= 4;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed && exc18.contains(s, 0, len)) {
|
||||
len += 3;
|
||||
s[len - 3] = 'ο';
|
||||
s[len - 2] = 'υ';
|
||||
s[len - 1] = 'ν';
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31,
|
||||
Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
|
||||
false);
|
||||
|
||||
private int rule19(char s[], int len) {
|
||||
boolean removed = false;
|
||||
|
||||
if (len > 6 && (endsWith(s, len, "ησουμε") || endsWith(s, len, "ηθουμε"))) {
|
||||
len -= 6;
|
||||
removed = true;
|
||||
} else if (len > 4 && endsWith(s, len, "ουμε")) {
|
||||
len -= 4;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed && exc19.contains(s, 0, len)) {
|
||||
len += 3;
|
||||
s[len - 3] = 'ο';
|
||||
s[len - 2] = 'υ';
|
||||
s[len - 1] = 'μ';
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule20(char s[], int len) {
|
||||
if (len > 5 && (endsWith(s, len, "ματων") || endsWith(s, len, "ματοσ")))
|
||||
len -= 3;
|
||||
else if (len > 4 && endsWith(s, len, "ματα"))
|
||||
len -= 2;
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule21(char s[], int len) {
|
||||
if (len > 9 && endsWith(s, len, "ιοντουσαν"))
|
||||
return len - 9;
|
||||
|
||||
if (len > 8 && (endsWith(s, len, "ιομασταν") ||
|
||||
endsWith(s, len, "ιοσασταν") ||
|
||||
endsWith(s, len, "ιουμαστε") ||
|
||||
endsWith(s, len, "οντουσαν")))
|
||||
return len - 8;
|
||||
|
||||
if (len > 7 && (endsWith(s, len, "ιεμαστε") ||
|
||||
endsWith(s, len, "ιεσαστε") ||
|
||||
endsWith(s, len, "ιομουνα") ||
|
||||
endsWith(s, len, "ιοσαστε") ||
|
||||
endsWith(s, len, "ιοσουνα") ||
|
||||
endsWith(s, len, "ιουνται") ||
|
||||
endsWith(s, len, "ιουνταν") ||
|
||||
endsWith(s, len, "ηθηκατε") ||
|
||||
endsWith(s, len, "ομασταν") ||
|
||||
endsWith(s, len, "οσασταν") ||
|
||||
endsWith(s, len, "ουμαστε")))
|
||||
return len - 7;
|
||||
|
||||
if (len > 6 && (endsWith(s, len, "ιομουν") ||
|
||||
endsWith(s, len, "ιονταν") ||
|
||||
endsWith(s, len, "ιοσουν") ||
|
||||
endsWith(s, len, "ηθειτε") ||
|
||||
endsWith(s, len, "ηθηκαν") ||
|
||||
endsWith(s, len, "ομουνα") ||
|
||||
endsWith(s, len, "οσαστε") ||
|
||||
endsWith(s, len, "οσουνα") ||
|
||||
endsWith(s, len, "ουνται") ||
|
||||
endsWith(s, len, "ουνταν") ||
|
||||
endsWith(s, len, "ουσατε")))
|
||||
return len - 6;
|
||||
|
||||
if (len > 5 && (endsWith(s, len, "αγατε") ||
|
||||
endsWith(s, len, "ιεμαι") ||
|
||||
endsWith(s, len, "ιεται") ||
|
||||
endsWith(s, len, "ιεσαι") ||
|
||||
endsWith(s, len, "ιοταν") ||
|
||||
endsWith(s, len, "ιουμα") ||
|
||||
endsWith(s, len, "ηθεισ") ||
|
||||
endsWith(s, len, "ηθουν") ||
|
||||
endsWith(s, len, "ηκατε") ||
|
||||
endsWith(s, len, "ησατε") ||
|
||||
endsWith(s, len, "ησουν") ||
|
||||
endsWith(s, len, "ομουν") ||
|
||||
endsWith(s, len, "ονται") ||
|
||||
endsWith(s, len, "ονταν") ||
|
||||
endsWith(s, len, "οσουν") ||
|
||||
endsWith(s, len, "ουμαι") ||
|
||||
endsWith(s, len, "ουσαν")))
|
||||
return len - 5;
|
||||
|
||||
if (len > 4 && (endsWith(s, len, "αγαν") ||
|
||||
endsWith(s, len, "αμαι") ||
|
||||
endsWith(s, len, "ασαι") ||
|
||||
endsWith(s, len, "αται") ||
|
||||
endsWith(s, len, "ειτε") ||
|
||||
endsWith(s, len, "εσαι") ||
|
||||
endsWith(s, len, "εται") ||
|
||||
endsWith(s, len, "ηδεσ") ||
|
||||
endsWith(s, len, "ηδων") ||
|
||||
endsWith(s, len, "ηθει") ||
|
||||
endsWith(s, len, "ηκαν") ||
|
||||
endsWith(s, len, "ησαν") ||
|
||||
endsWith(s, len, "ησει") ||
|
||||
endsWith(s, len, "ησεσ") ||
|
||||
endsWith(s, len, "ομαι") ||
|
||||
endsWith(s, len, "οταν")))
|
||||
return len - 4;
|
||||
|
||||
if (len > 3 && (endsWith(s, len, "αει") ||
|
||||
endsWith(s, len, "εισ") ||
|
||||
endsWith(s, len, "ηθω") ||
|
||||
endsWith(s, len, "ησω") ||
|
||||
endsWith(s, len, "ουν") ||
|
||||
endsWith(s, len, "ουσ")))
|
||||
return len - 3;
|
||||
|
||||
if (len > 2 && (endsWith(s, len, "αν") ||
|
||||
endsWith(s, len, "ασ") ||
|
||||
endsWith(s, len, "αω") ||
|
||||
endsWith(s, len, "ει") ||
|
||||
endsWith(s, len, "εσ") ||
|
||||
endsWith(s, len, "ησ") ||
|
||||
endsWith(s, len, "οι") ||
|
||||
endsWith(s, len, "οσ") ||
|
||||
endsWith(s, len, "ου") ||
|
||||
endsWith(s, len, "υσ") ||
|
||||
endsWith(s, len, "ων")))
|
||||
return len - 2;
|
||||
|
||||
if (len > 1 && endsWithVowel(s, len))
|
||||
return len - 1;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private int rule22(char s[], int len) {
|
||||
if (endsWith(s, len, "εστερ") ||
|
||||
endsWith(s, len, "εστατ"))
|
||||
return len - 5;
|
||||
|
||||
if (endsWith(s, len, "οτερ") ||
|
||||
endsWith(s, len, "οτατ") ||
|
||||
endsWith(s, len, "υτερ") ||
|
||||
endsWith(s, len, "υτατ") ||
|
||||
endsWith(s, len, "ωτερ") ||
|
||||
endsWith(s, len, "ωτατ"))
|
||||
return len - 4;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
private boolean endsWith(char s[], int len, String suffix) {
|
||||
final int suffixLen = suffix.length();
|
||||
if (suffixLen > len)
|
||||
return false;
|
||||
for (int i = suffixLen - 1; i >= 0; i--)
|
||||
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean endsWithVowel(char s[], int len) {
|
||||
if (len == 0)
|
||||
return false;
|
||||
switch(s[len - 1]) {
|
||||
case 'α':
|
||||
case 'ε':
|
||||
case 'η':
|
||||
case 'ι':
|
||||
case 'ο':
|
||||
case 'υ':
|
||||
case 'ω':
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean endsWithVowelNoY(char s[], int len) {
|
||||
if (len == 0)
|
||||
return false;
|
||||
switch(s[len - 1]) {
|
||||
case 'α':
|
||||
case 'ε':
|
||||
case 'η':
|
||||
case 'ι':
|
||||
case 'ο':
|
||||
case 'ω':
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
# Lucene Greek Stopwords list
|
||||
ο
|
||||
η
|
||||
το
|
||||
οι
|
||||
τα
|
||||
του
|
||||
τησ
|
||||
των
|
||||
τον
|
||||
την
|
||||
και
|
||||
κι
|
||||
κ
|
||||
ειμαι
|
||||
εισαι
|
||||
ειναι
|
||||
ειμαστε
|
||||
ειστε
|
||||
στο
|
||||
στον
|
||||
στη
|
||||
στην
|
||||
μα
|
||||
αλλα
|
||||
απο
|
||||
για
|
||||
προσ
|
||||
με
|
||||
σε
|
||||
ωσ
|
||||
παρα
|
||||
αντι
|
||||
κατα
|
||||
μετα
|
||||
θα
|
||||
να
|
||||
δε
|
||||
δεν
|
||||
μη
|
||||
μην
|
||||
επι
|
||||
ενω
|
||||
εαν
|
||||
αν
|
||||
τοτε
|
||||
που
|
||||
πωσ
|
||||
ποιοσ
|
||||
ποια
|
||||
ποιο
|
||||
ποιοι
|
||||
ποιεσ
|
||||
ποιων
|
||||
ποιουσ
|
||||
αυτοσ
|
||||
αυτη
|
||||
αυτο
|
||||
αυτοι
|
||||
αυτων
|
||||
αυτουσ
|
||||
αυτεσ
|
||||
αυτα
|
||||
εκεινοσ
|
||||
εκεινη
|
||||
εκεινο
|
||||
εκεινοι
|
||||
εκεινεσ
|
||||
εκεινα
|
||||
εκεινων
|
||||
εκεινουσ
|
||||
οπωσ
|
||||
ομωσ
|
||||
ισωσ
|
||||
οσο
|
||||
οτι
|
|
@ -26,42 +26,67 @@ import org.apache.lucene.util.Version;
|
|||
*/
|
||||
public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
|
||||
|
||||
/**
|
||||
* Test the analysis of various greek strings.
|
||||
*
|
||||
* @throws Exception in case an error occurs
|
||||
*/
|
||||
public void testAnalyzer() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
|
||||
// Verify the correct analysis of capitals and small accented letters, and
|
||||
// stemming
|
||||
assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
|
||||
new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
|
||||
"ελληνικ", "γλωσσ" });
|
||||
// Verify the correct analysis of small letters with diaeresis and the elimination
|
||||
// of punctuation marks
|
||||
assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
|
||||
new String[] { "προιοντ", "πολλαπλ", "αναγκ" });
|
||||
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
|
||||
// as well as the elimination of stop words
|
||||
assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
|
||||
new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the analysis of various greek strings.
|
||||
*
|
||||
* @throws Exception in case an error occurs
|
||||
* @deprecated Remove this test when support for 3.0 is no longer needed
|
||||
*/
|
||||
public void testAnalyzer() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
|
||||
@Deprecated
|
||||
public void testAnalyzerBWCompat() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
|
||||
// Verify the correct analysis of capitals and small accented letters
|
||||
assertAnalyzesTo(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
|
||||
new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
|
||||
"\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
|
||||
assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
|
||||
new String[] { "μια", "εξαιρετικα", "καλη", "πλουσια", "σειρα", "χαρακτηρων",
|
||||
"ελληνικησ", "γλωσσασ" });
|
||||
// Verify the correct analysis of small letters with diaeresis and the elimination
|
||||
// of punctuation marks
|
||||
assertAnalyzesTo(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
|
||||
// Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
|
||||
assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
|
||||
new String[] { "προιοντα", "πολλαπλεσ", "αναγκεσ" });
|
||||
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
|
||||
// as well as the elimination of stop words
|
||||
assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
|
||||
assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
|
||||
new String[] { "προυποθεσεισ", "αψογοσ", "μεστοσ", "αλλοι" });
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
|
||||
// Verify the correct analysis of capitals and small accented letters
|
||||
assertAnalyzesToReuse(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
|
||||
new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
|
||||
"\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
|
||||
// Verify the correct analysis of small letters with diaeresis and the elimination
|
||||
// of punctuation marks
|
||||
assertAnalyzesToReuse(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
|
||||
// Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
|
||||
// as well as the elimination of stop words
|
||||
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
|
||||
}
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
|
||||
// Verify the correct analysis of capitals and small accented letters, and
|
||||
// stemming
|
||||
assertAnalyzesToReuse(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
|
||||
new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
|
||||
"ελληνικ", "γλωσσ" });
|
||||
// Verify the correct analysis of small letters with diaeresis and the elimination
|
||||
// of punctuation marks
|
||||
assertAnalyzesToReuse(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
|
||||
new String[] { "προιοντ", "πολλαπλ", "αναγκ" });
|
||||
// Verify the correct analysis of capital accented letters and capital letters with diaeresis,
|
||||
// as well as the elimination of stop words
|
||||
assertAnalyzesToReuse(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
|
||||
new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Greek Analyzer didn't call standardFilter, so no normalization of acronyms.
|
||||
|
|
|
@ -0,0 +1,508 @@
|
|||
package org.apache.lucene.analysis.el;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
||||
public class TestGreekStemmer extends BaseTokenStreamTestCase {
|
||||
Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
|
||||
|
||||
public void testMasculineNouns() throws Exception {
|
||||
// -ος
|
||||
checkOneTerm(a, "άνθρωπος", "ανθρωπ");
|
||||
checkOneTerm(a, "ανθρώπου", "ανθρωπ");
|
||||
checkOneTerm(a, "άνθρωπο", "ανθρωπ");
|
||||
checkOneTerm(a, "άνθρωπε", "ανθρωπ");
|
||||
checkOneTerm(a, "άνθρωποι", "ανθρωπ");
|
||||
checkOneTerm(a, "ανθρώπων", "ανθρωπ");
|
||||
checkOneTerm(a, "ανθρώπους", "ανθρωπ");
|
||||
checkOneTerm(a, "άνθρωποι", "ανθρωπ");
|
||||
|
||||
// -ης
|
||||
checkOneTerm(a, "πελάτης", "πελατ");
|
||||
checkOneTerm(a, "πελάτη", "πελατ");
|
||||
checkOneTerm(a, "πελάτες", "πελατ");
|
||||
checkOneTerm(a, "πελατών", "πελατ");
|
||||
|
||||
// -ας/-ες
|
||||
checkOneTerm(a, "ελέφαντας", "ελεφαντ");
|
||||
checkOneTerm(a, "ελέφαντα", "ελεφαντ");
|
||||
checkOneTerm(a, "ελέφαντες", "ελεφαντ");
|
||||
checkOneTerm(a, "ελεφάντων", "ελεφαντ");
|
||||
|
||||
// -ας/-αδες
|
||||
checkOneTerm(a, "μπαμπάς", "μπαμπ");
|
||||
checkOneTerm(a, "μπαμπά", "μπαμπ");
|
||||
checkOneTerm(a, "μπαμπάδες", "μπαμπ");
|
||||
checkOneTerm(a, "μπαμπάδων", "μπαμπ");
|
||||
|
||||
// -ης/-ηδες
|
||||
checkOneTerm(a, "μπακάλης", "μπακαλ");
|
||||
checkOneTerm(a, "μπακάλη", "μπακαλ");
|
||||
checkOneTerm(a, "μπακάληδες", "μπακαλ");
|
||||
checkOneTerm(a, "μπακάληδων", "μπακαλ");
|
||||
|
||||
// -ες
|
||||
checkOneTerm(a, "καφές", "καφ");
|
||||
checkOneTerm(a, "καφέ", "καφ");
|
||||
checkOneTerm(a, "καφέδες", "καφ");
|
||||
checkOneTerm(a, "καφέδων", "καφ");
|
||||
|
||||
// -έας/είς
|
||||
checkOneTerm(a, "γραμματέας", "γραμματε");
|
||||
checkOneTerm(a, "γραμματέα", "γραμματε");
|
||||
// plural forms conflate w/ each other, not w/ the sing forms
|
||||
checkOneTerm(a, "γραμματείς", "γραμματ");
|
||||
checkOneTerm(a, "γραμματέων", "γραμματ");
|
||||
|
||||
// -ους/οι
|
||||
checkOneTerm(a, "απόπλους", "αποπλ");
|
||||
checkOneTerm(a, "απόπλου", "αποπλ");
|
||||
checkOneTerm(a, "απόπλοι", "αποπλ");
|
||||
checkOneTerm(a, "απόπλων", "αποπλ");
|
||||
|
||||
// -ους/-ουδες
|
||||
checkOneTerm(a, "παππούς", "παππ");
|
||||
checkOneTerm(a, "παππού", "παππ");
|
||||
checkOneTerm(a, "παππούδες", "παππ");
|
||||
checkOneTerm(a, "παππούδων", "παππ");
|
||||
|
||||
// -ης/-εις
|
||||
checkOneTerm(a, "λάτρης", "λατρ");
|
||||
checkOneTerm(a, "λάτρη", "λατρ");
|
||||
checkOneTerm(a, "λάτρεις", "λατρ");
|
||||
checkOneTerm(a, "λάτρεων", "λατρ");
|
||||
|
||||
// -υς
|
||||
checkOneTerm(a, "πέλεκυς", "πελεκ");
|
||||
checkOneTerm(a, "πέλεκυ", "πελεκ");
|
||||
checkOneTerm(a, "πελέκεις", "πελεκ");
|
||||
checkOneTerm(a, "πελέκεων", "πελεκ");
|
||||
|
||||
// -ωρ
|
||||
// note: nom./voc. doesn't conflate w/ the rest
|
||||
checkOneTerm(a, "μέντωρ", "μεντωρ");
|
||||
checkOneTerm(a, "μέντορος", "μεντορ");
|
||||
checkOneTerm(a, "μέντορα", "μεντορ");
|
||||
checkOneTerm(a, "μέντορες", "μεντορ");
|
||||
checkOneTerm(a, "μεντόρων", "μεντορ");
|
||||
|
||||
// -ων
|
||||
checkOneTerm(a, "αγώνας", "αγων");
|
||||
checkOneTerm(a, "αγώνος", "αγων");
|
||||
checkOneTerm(a, "αγώνα", "αγων");
|
||||
checkOneTerm(a, "αγώνα", "αγων");
|
||||
checkOneTerm(a, "αγώνες", "αγων");
|
||||
checkOneTerm(a, "αγώνων", "αγων");
|
||||
|
||||
// -ας/-ηδες
|
||||
checkOneTerm(a, "αέρας", "αερ");
|
||||
checkOneTerm(a, "αέρα", "αερ");
|
||||
checkOneTerm(a, "αέρηδες", "αερ");
|
||||
checkOneTerm(a, "αέρηδων", "αερ");
|
||||
|
||||
// -ης/-ητες
|
||||
checkOneTerm(a, "γόης", "γο");
|
||||
checkOneTerm(a, "γόη", "γοη"); // too short
|
||||
// the two plural forms conflate
|
||||
checkOneTerm(a, "γόητες", "γοητ");
|
||||
checkOneTerm(a, "γοήτων", "γοητ");
|
||||
}
|
||||
|
||||
public void testFeminineNouns() throws Exception {
|
||||
// -α/-ες,-ών
|
||||
checkOneTerm(a, "φορά", "φορ");
|
||||
checkOneTerm(a, "φοράς", "φορ");
|
||||
checkOneTerm(a, "φορές", "φορ");
|
||||
checkOneTerm(a, "φορών", "φορ");
|
||||
|
||||
// -α/-ες,-ων
|
||||
checkOneTerm(a, "αγελάδα", "αγελαδ");
|
||||
checkOneTerm(a, "αγελάδας", "αγελαδ");
|
||||
checkOneTerm(a, "αγελάδες", "αγελαδ");
|
||||
checkOneTerm(a, "αγελάδων", "αγελαδ");
|
||||
|
||||
// -η/-ες
|
||||
checkOneTerm(a, "ζάχαρη", "ζαχαρ");
|
||||
checkOneTerm(a, "ζάχαρης", "ζαχαρ");
|
||||
checkOneTerm(a, "ζάχαρες", "ζαχαρ");
|
||||
checkOneTerm(a, "ζαχάρεων", "ζαχαρ");
|
||||
|
||||
// -η/-εις
|
||||
checkOneTerm(a, "τηλεόραση", "τηλεορασ");
|
||||
checkOneTerm(a, "τηλεόρασης", "τηλεορασ");
|
||||
checkOneTerm(a, "τηλεοράσεις", "τηλεορασ");
|
||||
checkOneTerm(a, "τηλεοράσεων", "τηλεορασ");
|
||||
|
||||
// -α/-αδες
|
||||
checkOneTerm(a, "μαμά", "μαμ");
|
||||
checkOneTerm(a, "μαμάς", "μαμ");
|
||||
checkOneTerm(a, "μαμάδες", "μαμ");
|
||||
checkOneTerm(a, "μαμάδων", "μαμ");
|
||||
|
||||
// -ος
|
||||
checkOneTerm(a, "λεωφόρος", "λεωφορ");
|
||||
checkOneTerm(a, "λεωφόρου", "λεωφορ");
|
||||
checkOneTerm(a, "λεωφόρο", "λεωφορ");
|
||||
checkOneTerm(a, "λεωφόρε", "λεωφορ");
|
||||
checkOneTerm(a, "λεωφόροι", "λεωφορ");
|
||||
checkOneTerm(a, "λεωφόρων", "λεωφορ");
|
||||
checkOneTerm(a, "λεωφόρους", "λεωφορ");
|
||||
|
||||
// -ου
|
||||
checkOneTerm(a, "αλεπού", "αλεπ");
|
||||
checkOneTerm(a, "αλεπούς", "αλεπ");
|
||||
checkOneTerm(a, "αλεπούδες", "αλεπ");
|
||||
checkOneTerm(a, "αλεπούδων", "αλεπ");
|
||||
|
||||
// -έας/είς
|
||||
// note: not all forms conflate
|
||||
checkOneTerm(a, "γραμματέας", "γραμματε");
|
||||
checkOneTerm(a, "γραμματέως", "γραμματ");
|
||||
checkOneTerm(a, "γραμματέα", "γραμματε");
|
||||
checkOneTerm(a, "γραμματείς", "γραμματ");
|
||||
checkOneTerm(a, "γραμματέων", "γραμματ");
|
||||
}
|
||||
|
||||
public void testNeuterNouns() throws Exception {
|
||||
// ending with -ο
|
||||
// note: nom doesnt conflate
|
||||
checkOneTerm(a, "βιβλίο", "βιβλι");
|
||||
checkOneTerm(a, "βιβλίου", "βιβλ");
|
||||
checkOneTerm(a, "βιβλία", "βιβλ");
|
||||
checkOneTerm(a, "βιβλίων", "βιβλ");
|
||||
|
||||
// ending with -ι
|
||||
checkOneTerm(a, "πουλί", "πουλ");
|
||||
checkOneTerm(a, "πουλιού", "πουλ");
|
||||
checkOneTerm(a, "πουλιά", "πουλ");
|
||||
checkOneTerm(a, "πουλιών", "πουλ");
|
||||
|
||||
// ending with -α
|
||||
// note: nom. doesnt conflate
|
||||
checkOneTerm(a, "πρόβλημα", "προβλημ");
|
||||
checkOneTerm(a, "προβλήματος", "προβλημα");
|
||||
checkOneTerm(a, "προβλήματα", "προβλημα");
|
||||
checkOneTerm(a, "προβλημάτων", "προβλημα");
|
||||
|
||||
// ending with -ος/-ους
|
||||
checkOneTerm(a, "πέλαγος", "πελαγ");
|
||||
checkOneTerm(a, "πελάγους", "πελαγ");
|
||||
checkOneTerm(a, "πελάγη", "πελαγ");
|
||||
checkOneTerm(a, "πελάγων", "πελαγ");
|
||||
|
||||
// ending with -ός/-ότος
|
||||
checkOneTerm(a, "γεγονός", "γεγον");
|
||||
checkOneTerm(a, "γεγονότος", "γεγον");
|
||||
checkOneTerm(a, "γεγονότα", "γεγον");
|
||||
checkOneTerm(a, "γεγονότων", "γεγον");
|
||||
|
||||
// ending with -υ/-ιου
|
||||
checkOneTerm(a, "βράδυ", "βραδ");
|
||||
checkOneTerm(a, "βράδι", "βραδ");
|
||||
checkOneTerm(a, "βραδιού", "βραδ");
|
||||
checkOneTerm(a, "βράδια", "βραδ");
|
||||
checkOneTerm(a, "βραδιών", "βραδ");
|
||||
|
||||
// ending with -υ/-ατος
|
||||
// note: nom. doesnt conflate
|
||||
checkOneTerm(a, "δόρυ", "δορ");
|
||||
checkOneTerm(a, "δόρατος", "δορατ");
|
||||
checkOneTerm(a, "δόρατα", "δορατ");
|
||||
checkOneTerm(a, "δοράτων", "δορατ");
|
||||
|
||||
// ending with -ας
|
||||
checkOneTerm(a, "κρέας", "κρε");
|
||||
checkOneTerm(a, "κρέατος", "κρε");
|
||||
checkOneTerm(a, "κρέατα", "κρε");
|
||||
checkOneTerm(a, "κρεάτων", "κρε");
|
||||
|
||||
// ending with -ως
|
||||
checkOneTerm(a, "λυκόφως", "λυκοφω");
|
||||
checkOneTerm(a, "λυκόφωτος", "λυκοφω");
|
||||
checkOneTerm(a, "λυκόφωτα", "λυκοφω");
|
||||
checkOneTerm(a, "λυκοφώτων", "λυκοφω");
|
||||
|
||||
// ending with -ον/-ου
|
||||
// note: nom. doesnt conflate
|
||||
checkOneTerm(a, "μέσον", "μεσον");
|
||||
checkOneTerm(a, "μέσου", "μεσ");
|
||||
checkOneTerm(a, "μέσα", "μεσ");
|
||||
checkOneTerm(a, "μέσων", "μεσ");
|
||||
|
||||
// ending in -ον/-οντος
|
||||
// note: nom. doesnt conflate
|
||||
checkOneTerm(a, "ενδιαφέρον", "ενδιαφερον");
|
||||
checkOneTerm(a, "ενδιαφέροντος", "ενδιαφεροντ");
|
||||
checkOneTerm(a, "ενδιαφέροντα", "ενδιαφεροντ");
|
||||
checkOneTerm(a, "ενδιαφερόντων", "ενδιαφεροντ");
|
||||
|
||||
// ending with -εν/-εντος
|
||||
checkOneTerm(a, "ανακοινωθέν", "ανακοινωθεν");
|
||||
checkOneTerm(a, "ανακοινωθέντος", "ανακοινωθεντ");
|
||||
checkOneTerm(a, "ανακοινωθέντα", "ανακοινωθεντ");
|
||||
checkOneTerm(a, "ανακοινωθέντων", "ανακοινωθεντ");
|
||||
|
||||
// ending with -αν/-αντος
|
||||
checkOneTerm(a, "σύμπαν", "συμπ");
|
||||
checkOneTerm(a, "σύμπαντος", "συμπαντ");
|
||||
checkOneTerm(a, "σύμπαντα", "συμπαντ");
|
||||
checkOneTerm(a, "συμπάντων", "συμπαντ");
|
||||
|
||||
// ending with -α/-ακτος
|
||||
checkOneTerm(a, "γάλα", "γαλ");
|
||||
checkOneTerm(a, "γάλακτος", "γαλακτ");
|
||||
checkOneTerm(a, "γάλατα", "γαλατ");
|
||||
checkOneTerm(a, "γαλάκτων", "γαλακτ");
|
||||
}
|
||||
|
||||
public void testAdjectives() throws Exception {
|
||||
// ending with -ής, -ές/-είς, -ή
|
||||
checkOneTerm(a, "συνεχής", "συνεχ");
|
||||
checkOneTerm(a, "συνεχούς", "συνεχ");
|
||||
checkOneTerm(a, "συνεχή", "συνεχ");
|
||||
checkOneTerm(a, "συνεχών", "συνεχ");
|
||||
checkOneTerm(a, "συνεχείς", "συνεχ");
|
||||
checkOneTerm(a, "συνεχές", "συνεχ");
|
||||
|
||||
// ending with -ης, -ες/-εις, -η
|
||||
checkOneTerm(a, "συνήθης", "συνηθ");
|
||||
checkOneTerm(a, "συνήθους", "συνηθ");
|
||||
checkOneTerm(a, "συνήθη", "συνηθ");
|
||||
// note: doesn't conflate
|
||||
checkOneTerm(a, "συνήθεις", "συν");
|
||||
checkOneTerm(a, "συνήθων", "συνηθ");
|
||||
checkOneTerm(a, "σύνηθες", "συνηθ");
|
||||
|
||||
// ending with -υς, -υ/-εις, -ια
|
||||
checkOneTerm(a, "βαθύς", "βαθ");
|
||||
checkOneTerm(a, "βαθέος", "βαθε");
|
||||
checkOneTerm(a, "βαθύ", "βαθ");
|
||||
checkOneTerm(a, "βαθείς", "βαθ");
|
||||
checkOneTerm(a, "βαθέων", "βαθ");
|
||||
|
||||
checkOneTerm(a, "βαθιά", "βαθ");
|
||||
checkOneTerm(a, "βαθιάς", "βαθι");
|
||||
checkOneTerm(a, "βαθιές", "βαθι");
|
||||
checkOneTerm(a, "βαθιών", "βαθ");
|
||||
|
||||
checkOneTerm(a, "βαθέα", "βαθε");
|
||||
|
||||
// comparative/superlative
|
||||
checkOneTerm(a, "ψηλός", "ψηλ");
|
||||
checkOneTerm(a, "ψηλότερος", "ψηλ");
|
||||
checkOneTerm(a, "ψηλότατος", "ψηλ");
|
||||
|
||||
checkOneTerm(a, "ωραίος", "ωραι");
|
||||
checkOneTerm(a, "ωραιότερος", "ωραι");
|
||||
checkOneTerm(a, "ωραιότατος", "ωραι");
|
||||
|
||||
checkOneTerm(a, "επιεικής", "επιεικ");
|
||||
checkOneTerm(a, "επιεικέστερος", "επιεικ");
|
||||
checkOneTerm(a, "επιεικέστατος", "επιεικ");
|
||||
}
|
||||
|
||||
|
||||
public void testVerbs() throws Exception {
|
||||
// note, past/present verb stems will not conflate (from the paper)
|
||||
//-ω,-α/-.ω,-.α
|
||||
checkOneTerm(a, "ορίζω", "οριζ");
|
||||
checkOneTerm(a, "όριζα", "οριζ");
|
||||
checkOneTerm(a, "όριζε", "οριζ");
|
||||
checkOneTerm(a, "ορίζοντας", "οριζ");
|
||||
checkOneTerm(a, "ορίζομαι", "οριζ");
|
||||
checkOneTerm(a, "οριζόμουν", "οριζ");
|
||||
checkOneTerm(a, "ορίζεσαι", "οριζ");
|
||||
|
||||
checkOneTerm(a, "όρισα", "ορισ");
|
||||
checkOneTerm(a, "ορίσω", "ορισ");
|
||||
checkOneTerm(a, "όρισε", "ορισ");
|
||||
checkOneTerm(a, "ορίσει", "ορισ");
|
||||
|
||||
checkOneTerm(a, "ορίστηκα", "οριστ");
|
||||
checkOneTerm(a, "οριστώ", "οριστ");
|
||||
checkOneTerm(a, "οριστείς", "οριστ");
|
||||
checkOneTerm(a, "οριστεί", "οριστ");
|
||||
|
||||
checkOneTerm(a, "ορισμένο", "ορισμεν");
|
||||
checkOneTerm(a, "ορισμένη", "ορισμεν");
|
||||
checkOneTerm(a, "ορισμένος", "ορισμεν");
|
||||
|
||||
// -ω,-α/-ξω,-ξα
|
||||
checkOneTerm(a, "ανοίγω", "ανοιγ");
|
||||
checkOneTerm(a, "άνοιγα", "ανοιγ");
|
||||
checkOneTerm(a, "άνοιγε", "ανοιγ");
|
||||
checkOneTerm(a, "ανοίγοντας", "ανοιγ");
|
||||
checkOneTerm(a, "ανοίγομαι", "ανοιγ");
|
||||
checkOneTerm(a, "ανοιγόμουν", "ανοιγ");
|
||||
|
||||
checkOneTerm(a, "άνοιξα", "ανοιξ");
|
||||
checkOneTerm(a, "ανοίξω", "ανοιξ");
|
||||
checkOneTerm(a, "άνοιξε", "ανοιξ");
|
||||
checkOneTerm(a, "ανοίξει", "ανοιξ");
|
||||
|
||||
checkOneTerm(a, "ανοίχτηκα", "ανοιχτ");
|
||||
checkOneTerm(a, "ανοιχτώ", "ανοιχτ");
|
||||
checkOneTerm(a, "ανοίχτηκα", "ανοιχτ");
|
||||
checkOneTerm(a, "ανοιχτείς", "ανοιχτ");
|
||||
checkOneTerm(a, "ανοιχτεί", "ανοιχτ");
|
||||
|
||||
checkOneTerm(a, "ανοίξου", "ανοιξ");
|
||||
|
||||
//-ώ/-άω,-ούσα/-άσω,-ασα
|
||||
checkOneTerm(a, "περνώ", "περν");
|
||||
checkOneTerm(a, "περνάω", "περν");
|
||||
checkOneTerm(a, "περνούσα", "περν");
|
||||
checkOneTerm(a, "πέρναγα", "περν");
|
||||
checkOneTerm(a, "πέρνα", "περν");
|
||||
checkOneTerm(a, "περνώντας", "περν");
|
||||
|
||||
checkOneTerm(a, "πέρασα", "περασ");
|
||||
checkOneTerm(a, "περάσω", "περασ");
|
||||
checkOneTerm(a, "πέρασε", "περασ");
|
||||
checkOneTerm(a, "περάσει", "περασ");
|
||||
|
||||
checkOneTerm(a, "περνιέμαι", "περν");
|
||||
checkOneTerm(a, "περνιόμουν", "περν");
|
||||
|
||||
checkOneTerm(a, "περάστηκα", "περαστ");
|
||||
checkOneTerm(a, "περαστώ", "περαστ");
|
||||
checkOneTerm(a, "περαστείς", "περαστ");
|
||||
checkOneTerm(a, "περαστεί", "περαστ");
|
||||
|
||||
checkOneTerm(a, "περασμένο", "περασμεν");
|
||||
checkOneTerm(a, "περασμένη", "περασμεν");
|
||||
checkOneTerm(a, "περασμένος", "περασμεν");
|
||||
|
||||
// -ώ/-άω,-ούσα/-άξω,-αξα
|
||||
checkOneTerm(a, "πετώ", "πετ");
|
||||
checkOneTerm(a, "πετάω", "πετ");
|
||||
checkOneTerm(a, "πετούσα", "πετ");
|
||||
checkOneTerm(a, "πέταγα", "πετ");
|
||||
checkOneTerm(a, "πέτα", "πετ");
|
||||
checkOneTerm(a, "πετώντας", "πετ");
|
||||
checkOneTerm(a, "πετιέμαι", "πετ");
|
||||
checkOneTerm(a, "πετιόμουν", "πετ");
|
||||
|
||||
checkOneTerm(a, "πέταξα", "πεταξ");
|
||||
checkOneTerm(a, "πετάξω", "πεταξ");
|
||||
checkOneTerm(a, "πέταξε", "πεταξ");
|
||||
checkOneTerm(a, "πετάξει", "πεταξ");
|
||||
|
||||
checkOneTerm(a, "πετάχτηκα", "πεταχτ");
|
||||
checkOneTerm(a, "πεταχτώ", "πεταχτ");
|
||||
checkOneTerm(a, "πεταχτείς", "πεταχτ");
|
||||
checkOneTerm(a, "πεταχτεί", "πεταχτ");
|
||||
|
||||
checkOneTerm(a, "πεταμένο", "πεταμεν");
|
||||
checkOneTerm(a, "πεταμένη", "πεταμεν");
|
||||
checkOneTerm(a, "πεταμένος", "πεταμεν");
|
||||
|
||||
// -ώ/-άω,-ούσα / -έσω,-εσα
|
||||
checkOneTerm(a, "καλώ", "καλ");
|
||||
checkOneTerm(a, "καλούσα", "καλ");
|
||||
checkOneTerm(a, "καλείς", "καλ");
|
||||
checkOneTerm(a, "καλώντας", "καλ");
|
||||
|
||||
checkOneTerm(a, "καλούμαι", "καλ");
|
||||
// pass. imperfect /imp. progressive doesnt conflate
|
||||
checkOneTerm(a, "καλούμουν", "καλουμ");
|
||||
checkOneTerm(a, "καλείσαι", "καλεισα");
|
||||
|
||||
checkOneTerm(a, "καλέστηκα", "καλεστ");
|
||||
checkOneTerm(a, "καλεστώ", "καλεστ");
|
||||
checkOneTerm(a, "καλεστείς", "καλεστ");
|
||||
checkOneTerm(a, "καλεστεί", "καλεστ");
|
||||
|
||||
checkOneTerm(a, "καλεσμένο", "καλεσμεν");
|
||||
checkOneTerm(a, "καλεσμένη", "καλεσμεν");
|
||||
checkOneTerm(a, "καλεσμένος", "καλεσμεν");
|
||||
|
||||
checkOneTerm(a, "φορώ", "φορ");
|
||||
checkOneTerm(a, "φοράω", "φορ");
|
||||
checkOneTerm(a, "φορούσα", "φορ");
|
||||
checkOneTerm(a, "φόραγα", "φορ");
|
||||
checkOneTerm(a, "φόρα", "φορ");
|
||||
checkOneTerm(a, "φορώντας", "φορ");
|
||||
checkOneTerm(a, "φοριέμαι", "φορ");
|
||||
checkOneTerm(a, "φοριόμουν", "φορ");
|
||||
checkOneTerm(a, "φοριέσαι", "φορ");
|
||||
|
||||
checkOneTerm(a, "φόρεσα", "φορεσ");
|
||||
checkOneTerm(a, "φορέσω", "φορεσ");
|
||||
checkOneTerm(a, "φόρεσε", "φορεσ");
|
||||
checkOneTerm(a, "φορέσει", "φορεσ");
|
||||
|
||||
checkOneTerm(a, "φορέθηκα", "φορεθ");
|
||||
checkOneTerm(a, "φορεθώ", "φορεθ");
|
||||
checkOneTerm(a, "φορεθείς", "φορεθ");
|
||||
checkOneTerm(a, "φορεθεί", "φορεθ");
|
||||
|
||||
checkOneTerm(a, "φορεμένο", "φορεμεν");
|
||||
checkOneTerm(a, "φορεμένη", "φορεμεν");
|
||||
checkOneTerm(a, "φορεμένος", "φορεμεν");
|
||||
|
||||
// -ώ/-άω,-ούσα / -ήσω,-ησα
|
||||
checkOneTerm(a, "κρατώ", "κρατ");
|
||||
checkOneTerm(a, "κρατάω", "κρατ");
|
||||
checkOneTerm(a, "κρατούσα", "κρατ");
|
||||
checkOneTerm(a, "κράταγα", "κρατ");
|
||||
checkOneTerm(a, "κράτα", "κρατ");
|
||||
checkOneTerm(a, "κρατώντας", "κρατ");
|
||||
|
||||
checkOneTerm(a, "κράτησα", "κρατ");
|
||||
checkOneTerm(a, "κρατήσω", "κρατ");
|
||||
checkOneTerm(a, "κράτησε", "κρατ");
|
||||
checkOneTerm(a, "κρατήσει", "κρατ");
|
||||
|
||||
checkOneTerm(a, "κρατούμαι", "κρατ");
|
||||
checkOneTerm(a, "κρατιέμαι", "κρατ");
|
||||
// this imperfect form doesnt conflate
|
||||
checkOneTerm(a, "κρατούμουν", "κρατουμ");
|
||||
checkOneTerm(a, "κρατιόμουν", "κρατ");
|
||||
// this imp. prog form doesnt conflate
|
||||
checkOneTerm(a, "κρατείσαι", "κρατεισα");
|
||||
|
||||
checkOneTerm(a, "κρατήθηκα", "κρατ");
|
||||
checkOneTerm(a, "κρατηθώ", "κρατ");
|
||||
checkOneTerm(a, "κρατηθείς", "κρατ");
|
||||
checkOneTerm(a, "κρατηθεί", "κρατ");
|
||||
checkOneTerm(a, "κρατήσου", "κρατ");
|
||||
|
||||
checkOneTerm(a, "κρατημένο", "κρατημεν");
|
||||
checkOneTerm(a, "κρατημένη", "κρατημεν");
|
||||
checkOneTerm(a, "κρατημένος", "κρατημεν");
|
||||
|
||||
// -.μαι,-.μουν / -.ώ,-.ηκα
|
||||
checkOneTerm(a, "κοιμάμαι", "κοιμ");
|
||||
checkOneTerm(a, "κοιμόμουν", "κοιμ");
|
||||
checkOneTerm(a, "κοιμάσαι", "κοιμ");
|
||||
|
||||
checkOneTerm(a, "κοιμήθηκα", "κοιμ");
|
||||
checkOneTerm(a, "κοιμηθώ", "κοιμ");
|
||||
checkOneTerm(a, "κοιμήσου", "κοιμ");
|
||||
checkOneTerm(a, "κοιμηθεί", "κοιμ");
|
||||
|
||||
checkOneTerm(a, "κοιμισμένο", "κοιμισμεν");
|
||||
checkOneTerm(a, "κοιμισμένη", "κοιμισμεν");
|
||||
checkOneTerm(a, "κοιμισμένος", "κοιμισμεν");
|
||||
}
|
||||
|
||||
public void testExceptions() throws Exception {
|
||||
checkOneTerm(a, "καθεστώτα", "καθεστ");
|
||||
checkOneTerm(a, "καθεστώτος", "καθεστ");
|
||||
checkOneTerm(a, "καθεστώς", "καθεστ");
|
||||
checkOneTerm(a, "καθεστώτων", "καθεστ");
|
||||
|
||||
checkOneTerm(a, "χουμε", "χουμ");
|
||||
checkOneTerm(a, "χουμ", "χουμ");
|
||||
|
||||
checkOneTerm(a, "υποταγεσ", "υποταγ");
|
||||
checkOneTerm(a, "υποταγ", "υποταγ");
|
||||
|
||||
checkOneTerm(a, "εμετε", "εμετ");
|
||||
checkOneTerm(a, "εμετ", "εμετ");
|
||||
|
||||
checkOneTerm(a, "αρχοντασ", "αρχοντ");
|
||||
checkOneTerm(a, "αρχοντων", "αρχοντ");
|
||||
}
|
||||
}
|
|
@ -33,6 +33,7 @@ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
|
|||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
assureMatchVersion();
|
||||
if (args.containsKey("charset"))
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||
"The charset parameter is no longer supported. "
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.el.GreekStemFilter;
|
||||
|
||||
/** Factory for {@link GreekStemFilter} */
|
||||
public class GreekStemFilterFactory extends BaseTokenFilterFactory {
|
||||
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new GreekStemFilter(input);
|
||||
}
|
||||
|
||||
}
|
|
@ -31,10 +31,11 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
|
|||
/**
|
||||
* Ensure the filter actually lowercases (and a bit more) greek text.
|
||||
*/
|
||||
public void testStemming() throws Exception {
|
||||
public void testNormalization() throws Exception {
|
||||
Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||
GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
|
||||
}
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Greek stem filter factory is working.
|
||||
*/
|
||||
public class TestGreekStemFilterFactory extends BaseTokenTestCase {
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("άνθρωπος");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||
TokenStream normalized = new GreekLowerCaseFilter(DEFAULT_VERSION, tokenizer);
|
||||
GreekStemFilterFactory factory = new GreekStemFilterFactory();
|
||||
TokenStream stream = factory.create(normalized);
|
||||
assertTokenStreamContents(stream, new String[] { "ανθρωπ" });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue