diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index c1da64a3e9b..ee3595cab49 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -157,6 +157,9 @@ New features * LUCENE-2393: The HighFreqTerms tool (in misc) can now optionally also include the total termFreq. (Tom Burton-West via Mike McCandless) + * LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words + when Version is set to 3.1 or higher. (Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java index 477881b005f..4dcf341e2f2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java @@ -16,9 +16,7 @@ package org.apache.lucene.analysis.el; * limitations under the License. */ - import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; @@ -28,8 +26,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.util.Version; +import java.io.IOException; import java.io.Reader; -import java.util.Arrays; import java.util.Map; import java.util.Set; @@ -45,7 +43,7 @@ import java.util.Set; *
You must specify the required {@link Version} * compatibility when creating GreekAnalyzer: *
NOTE: This class uses the same {@link Version} * dependent settings as {@link StandardAnalyzer}.
*/ -public final class GreekAnalyzer extends StopwordAnalyzerBase -{ - /** - * List of typical Greek stopwords. - */ - private static final String[] GREEK_STOP_WORDS = { - "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και", - "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον", - "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ", - "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην", - "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο", - "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι", - "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο", - "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ", - "ισωσ", "οσο", "οτι" - }; +public final class GreekAnalyzer extends StopwordAnalyzerBase { + /** File containing default Greek stopwords. */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Returns a set of default Greek-stopwords + * @return a set of default Greek-stopwords + */ + public static final Set> getDefaultStopSet(){ + return DefaultSetHolder.DEFAULT_SET; + } + + private static class DefaultSetHolder { + private static final Set> DEFAULT_SET; - /** - * Returns a set of default Greek-stopwords - * @return a set of default Greek-stopwords - */ - public static final Set> getDefaultStopSet(){ - return DefaultSetHolder.DEFAULT_SET; + static { + try { + DEFAULT_SET = loadStopwordSet(false, GreekAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); + } catch (IOException ex) { + // default set should always be present as it is part of the + // distribution (JAR) + throw new RuntimeException("Unable to load default stopword set"); + } } - - private static class DefaultSetHolder { - private static final Set> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false)); - } - - public GreekAnalyzer(Version matchVersion) { - 
this(matchVersion, DefaultSetHolder.DEFAULT_SET); - } - - /** - * Builds an analyzer with the given stop words - * - * @param matchVersion - * lucene compatibility version - * @param stopwords - * a stopword set - */ - public GreekAnalyzer(Version matchVersion, Set> stopwords) { - super(matchVersion, stopwords); - } - - /** - * Builds an analyzer with the given stop words. - * @param stopwords Array of stopwords to use. - * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead - */ - @Deprecated - public GreekAnalyzer(Version matchVersion, String... stopwords) - { - this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords)); - } - - /** - * Builds an analyzer with the given stop words. - * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead - */ - @Deprecated - public GreekAnalyzer(Version matchVersion, Map,?> stopwords) - { - this(matchVersion, stopwords.keySet()); - } - + } + + /** + * Builds an analyzer with the default stop words. + * @param matchVersion Lucene compatibility version, + * See above + */ + public GreekAnalyzer(Version matchVersion) { + this(matchVersion, DefaultSetHolder.DEFAULT_SET); + } + + /** + * Builds an analyzer with the given stop words. + *+ * NOTE: The stopwords set should be pre-processed with the logic of + * {@link GreekLowerCaseFilter} for best results. + * + * @param matchVersion Lucene compatibility version, + * See above + * @param stopwords a stopword set + */ + public GreekAnalyzer(Version matchVersion, Set> stopwords) { + super(matchVersion, stopwords); + } + + /** + * Builds an analyzer with the given stop words. + * @param stopwords Array of stopwords to use. + * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead + */ + @Deprecated + public GreekAnalyzer(Version matchVersion, String... stopwords) { + this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords)); + } + + /** + * Builds an analyzer with the given stop words. 
+ * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead + */ + @Deprecated + public GreekAnalyzer(Version matchVersion, Map,?> stopwords) { + this(matchVersion, stopwords.keySet()); + } + /** * Creates * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} @@ -127,16 +126,19 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase * * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with - * {@link GreekLowerCaseFilter}, {@link StandardFilter} and - * {@link StopFilter} + * {@link GreekLowerCaseFilter}, {@link StandardFilter}, + * {@link StopFilter}, and {@link GreekStemFilter} */ - @Override - protected TokenStreamComponents createComponents(String fieldName, - Reader reader) { - final Tokenizer source = new StandardTokenizer(matchVersion, reader); - TokenStream result = new GreekLowerCaseFilter(source); - if (matchVersion.onOrAfter(Version.LUCENE_31)) - result = new StandardFilter(result); - return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); - } + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new StandardTokenizer(matchVersion, reader); + TokenStream result = new GreekLowerCaseFilter(matchVersion, source); + if (matchVersion.onOrAfter(Version.LUCENE_31)) + result = new StandardFilter(result); + result = new StopFilter(matchVersion, result, stopwords); + if (matchVersion.onOrAfter(Version.LUCENE_31)) + result = new GreekStemFilter(result); + return new TokenStreamComponents(source, result); + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java index 53da0b8d1f8..d93860eb7c9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java +++ 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java @@ -20,97 +20,115 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.util.Version; /** * Normalizes token text to lower case, removes some Greek diacritics, * and standardizes final sigma to sigma. - * + * + *
You must specify the required {@link Version} + * compatibility when creating GreekLowerCaseFilter: + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ *+ * NOTE: Input is expected to be casefolded for Greek (including folding of final + * sigma to sigma), and with diacritics removed. This can be achieved by using + * either {@link GreekLowerCaseFilter} or ICUFoldingFilter before GreekStemFilter. + * @lucene.experimental + */ +public final class GreekStemFilter extends TokenFilter { + private final GreekStemmer stemmer = new GreekStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public GreekStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if(!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java new file mode 100644 index 00000000000..53cef6a919d --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java @@ -0,0 +1,819 @@ +package org.apache.lucene.analysis.el; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.util.Version; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A stemmer for Greek words, according to: Development of a Stemmer for the + * Greek Language. Georgios Ntais + *
+ * NOTE: Input is expected to be casefolded for Greek (including folding of final
+ * sigma to sigma), and with diacritics removed. This can be achieved with
+ * either {@link GreekLowerCaseFilter} or ICUFoldingFilter.
+ * @lucene.experimental
+ */
+public class GreekStemmer {
+ public int stem(char s[], int len) {
+ if (len < 4) // too short
+ return len;
+
+ final int origLen = len;
+ // "short rules": if it hits one of these, it skips the "long list"
+ len = rule0(s, len);
+ len = rule1(s, len);
+ len = rule2(s, len);
+ len = rule3(s, len);
+ len = rule4(s, len);
+ len = rule5(s, len);
+ len = rule6(s, len);
+ len = rule7(s, len);
+ len = rule8(s, len);
+ len = rule9(s, len);
+ len = rule10(s, len);
+ len = rule11(s, len);
+ len = rule12(s, len);
+ len = rule13(s, len);
+ len = rule14(s, len);
+ len = rule15(s, len);
+ len = rule16(s, len);
+ len = rule17(s, len);
+ len = rule18(s, len);
+ len = rule19(s, len);
+ len = rule20(s, len);
+ // "long list"
+ if (len == origLen)
+ len = rule21(s, len);
+
+ return rule22(s, len);
+ }
+
+  /**
+   * Handles a fixed set of irregular word endings, testing the longest
+   * endings first and returning as soon as one matches.
+   * <p>
+   * Every alternative of a branch is grouped under that branch's length
+   * guard, so the guard applies uniformly to all of them.
+   *
+   * @param s buffer holding the word
+   * @param len number of valid characters in <code>s</code>
+   * @return the new length, or <code>len</code> if no ending matched
+   */
+  private int rule0(char s[], int len) {
+    if (len > 9 && (endsWith(s, len, "καθεστωτοσ")
+        || endsWith(s, len, "καθεστωτων")))
+      return len - 4;
+
+    if (len > 8 && (endsWith(s, len, "γεγονοτοσ")
+        || endsWith(s, len, "γεγονοτων")))
+      return len - 4;
+
+    if (len > 8 && endsWith(s, len, "καθεστωτα"))
+      return len - 3;
+
+    if (len > 7 && (endsWith(s, len, "τατογιου")
+        || endsWith(s, len, "τατογιων")))
+      return len - 4;
+
+    if (len > 7 && endsWith(s, len, "γεγονοτα"))
+      return len - 3;
+
+    if (len > 7 && endsWith(s, len, "καθεστωσ"))
+      return len - 2;
+
+    // the closing parenthesis now encloses the whole alternative list
+    if (len > 6 && (endsWith(s, len, "σκαγιου")
+        || endsWith(s, len, "σκαγιων")
+        || endsWith(s, len, "ολογιου")
+        || endsWith(s, len, "ολογιων")
+        || endsWith(s, len, "κρεατοσ")
+        || endsWith(s, len, "κρεατων")
+        || endsWith(s, len, "περατοσ")
+        || endsWith(s, len, "περατων")
+        || endsWith(s, len, "τερατοσ")
+        || endsWith(s, len, "τερατων")))
+      return len - 4;
+
+    if (len > 6 && endsWith(s, len, "τατογια"))
+      return len - 3;
+
+    if (len > 6 && endsWith(s, len, "γεγονοσ"))
+      return len - 2;
+
+    if (len > 5 && (endsWith(s, len, "φαγιου")
+        || endsWith(s, len, "φαγιων")
+        || endsWith(s, len, "σογιου")
+        || endsWith(s, len, "σογιων")))
+      return len - 4;
+
+    if (len > 5 && (endsWith(s, len, "σκαγια")
+        || endsWith(s, len, "ολογια")
+        || endsWith(s, len, "κρεατα")
+        || endsWith(s, len, "περατα")
+        || endsWith(s, len, "τερατα")))
+      return len - 3;
+
+    if (len > 4 && (endsWith(s, len, "φαγια")
+        || endsWith(s, len, "σογια")
+        || endsWith(s, len, "φωτοσ")
+        || endsWith(s, len, "φωτων")))
+      return len - 3;
+
+    if (len > 4 && (endsWith(s, len, "κρεασ")
+        || endsWith(s, len, "περασ")
+        || endsWith(s, len, "τερασ")))
+      return len - 2;
+
+    if (len > 3 && endsWith(s, len, "φωτα"))
+      return len - 2;
+
+    if (len > 2 && endsWith(s, len, "φωσ"))
+      return len - 1;
+
+    return len;
+  }
+
+ private int rule1(char s[], int len) {
+ if (len > 4 && (endsWith(s, len, "αδεσ") || endsWith(s, len, "αδων"))) {
+ len -= 4;
+ if (!(endsWith(s, len, "οκ") ||
+ endsWith(s, len, "μαμ") ||
+ endsWith(s, len, "μαν") ||
+ endsWith(s, len, "μπαμπ") ||
+ endsWith(s, len, "πατερ") ||
+ endsWith(s, len, "γιαγι") ||
+ endsWith(s, len, "νταντ") ||
+ endsWith(s, len, "κυρ") ||
+ endsWith(s, len, "θει") ||
+ endsWith(s, len, "πεθερ")))
+ len += 2; // add back -αδ
+ }
+ return len;
+ }
+
+ private int rule2(char s[], int len) {
+ if (len > 4 && (endsWith(s, len, "εδεσ") || endsWith(s, len, "εδων"))) {
+ len -= 4;
+ if (endsWith(s, len, "οπ") ||
+ endsWith(s, len, "ιπ") ||
+ endsWith(s, len, "εμπ") ||
+ endsWith(s, len, "υπ") ||
+ endsWith(s, len, "γηπ") ||
+ endsWith(s, len, "δαπ") ||
+ endsWith(s, len, "κρασπ") ||
+ endsWith(s, len, "μιλ"))
+ len += 2; // add back -εδ
+ }
+ return len;
+ }
+
+ private int rule3(char s[], int len) {
+ if (len > 5 && (endsWith(s, len, "ουδεσ") || endsWith(s, len, "ουδων"))) {
+ len -= 5;
+ if (endsWith(s, len, "αρκ") ||
+ endsWith(s, len, "καλιακ") ||
+ endsWith(s, len, "πεταλ") ||
+ endsWith(s, len, "λιχ") ||
+ endsWith(s, len, "πλεξ") ||
+ endsWith(s, len, "σκ") ||
+ endsWith(s, len, "σ") ||
+ endsWith(s, len, "φλ") ||
+ endsWith(s, len, "φρ") ||
+ endsWith(s, len, "βελ") ||
+ endsWith(s, len, "λουλ") ||
+ endsWith(s, len, "χν") ||
+ endsWith(s, len, "σπ") ||
+ endsWith(s, len, "τραγ") ||
+ endsWith(s, len, "φε"))
+ len += 3; // add back -ουδ
+ }
+ return len;
+ }
+
+ private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
+ false);
+
+ private int rule4(char s[], int len) {
+ if (len > 3 && (endsWith(s, len, "εωσ") || endsWith(s, len, "εων"))) {
+ len -= 3;
+ if (exc4.contains(s, 0, len))
+ len++; // add back -ε
+ }
+ return len;
+ }
+
+ private int rule5(char s[], int len) {
+ if (len > 2 && endsWith(s, len, "ια")) {
+ len -= 2;
+ if (endsWithVowel(s, len))
+ len++; // add back -ι
+ } else if (len > 3 && (endsWith(s, len, "ιου") || endsWith(s, len, "ιων"))) {
+ len -= 3;
+ if (endsWithVowel(s, len))
+ len++; // add back -ι
+ }
+ return len;
+ }
+
+ private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
+ "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
+ "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
+ "πετσ", "πιτσ", "πικαντ", "πλιατσ", "ποστελν", "πρωτοδ", "σερτ",
+ "συναδ", "τσαμ", "υποδ", "φιλον", "φυλοδ", "χασ"),
+ false);
+
+ private int rule6(char s[], int len) {
+ boolean removed = false;
+ if (len > 3 && (endsWith(s, len, "ικα") || endsWith(s, len, "ικο"))) {
+ len -= 3;
+ removed = true;
+ } else if (len > 4 && (endsWith(s, len, "ικου") || endsWith(s, len, "ικων"))) {
+ len -= 4;
+ removed = true;
+ }
+
+ if (removed) {
+ if (endsWithVowel(s, len) || exc6.contains(s, 0, len))
+ len += 2; // add back -ικ
+ }
+ return len;
+ }
+
+ private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
+ "πεθ", "πικρ", "ποτ", "σιχ", "χ"),
+ false);
+
+  /**
+   * Handles verb endings in -αμε. A word that is exactly "αγαμε" only loses
+   * its last character. Otherwise one of the long endings (-ηθηκαμε, -ουσαμε,
+   * -αγαμε, -ησαμε, -ηκαμε) is removed if present, and then a remaining -αμε
+   * is removed, keeping -αμ for stems listed in <code>exc7</code>.
+   */
+  private int rule7(char s[], int len) {
+    if (len == 5 && endsWith(s, len, "αγαμε")) {
+      return len - 1;
+    }
+
+    int cur = len;
+    if (cur > 7 && endsWith(s, cur, "ηθηκαμε")) {
+      cur -= 7;
+    } else if (cur > 6 && endsWith(s, cur, "ουσαμε")) {
+      cur -= 6;
+    } else if (cur > 5 && (endsWith(s, cur, "αγαμε")
+        || endsWith(s, cur, "ησαμε")
+        || endsWith(s, cur, "ηκαμε"))) {
+      cur -= 5;
+    }
+
+    if (cur <= 3 || !endsWith(s, cur, "αμε")) {
+      return cur;
+    }
+
+    cur -= 3;
+    // exception stems get -αμ added back
+    return exc7.contains(s, 0, cur) ? cur + 2 : cur;
+  }
+
+ private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("τρ", "τσ"),
+ false);
+
+ private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
+ "καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
+ "π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
+ "τσαρλατ", "ορφ", "τσιγγ", "τσοπ", "φωτοστεφ", "χ", "ψυχοπλ", "αγ",
+ "ορφ", "γαλ", "γερ", "δεκ", "διπλ", "αμερικαν", "ουρ", "πιθ",
+ "πουριτ", "σ", "ζωντ", "ικ", "καστ", "κοπ", "λιχ", "λουθηρ", "μαιντ",
+ "μελ", "σιγ", "σπ", "στεγ", "τραγ", "τσαγ", "φ", "ερ", "αδαπ",
+ "αθιγγ", "αμηχ", "ανικ", "ανοργ", "απηγ", "απιθ", "ατσιγγ", "βασ",
+ "βασκ", "βαθυγαλ", "βιομηχ", "βραχυκ", "διατ", "διαφ", "ενοργ",
+ "θυσ", "καπνοβιομηχ", "καταγαλ", "κλιβ", "κοιλαρφ", "λιβ",
+ "μεγλοβιομηχ", "μικροβιομηχ", "νταβ", "ξηροκλιβ", "ολιγοδαμ",
+ "ολογαλ", "πενταρφ", "περηφ", "περιτρ", "πλατ", "πολυδαπ", "πολυμηχ",
+ "στεφ", "ταβ", "τετ", "υπερηφ", "υποκοπ", "χαμηλοδαπ", "ψηλοταβ"),
+ false);
+
+  /**
+   * Handles verb endings in -ανε.
+   * <p>
+   * First one of the long endings (-ιουντανε, -ιοντανε, -ουντανε, -ηθηκανε,
+   * -ιοτανε, -οντανε, -ουσανε, -αγανε, -ησανε, -οτανε, -ηκανε) is removed if
+   * present; when the remaining stem is listed in <code>exc8a</code>, "αγαν"
+   * is written back after it. Then a remaining -ανε is removed, keeping -αν
+   * when the stem ends in a vowel other than υ or is listed in
+   * <code>exc8b</code>.
+   * <p>
+   * Each length guard is parenthesized together with all alternatives of its
+   * branch. Without the parentheses, <code>&amp;&amp;</code> binds tighter
+   * than <code>||</code>, so only the first alternative was guarded and a
+   * word consisting solely of one of the other endings was reduced to
+   * length zero.
+   *
+   * @param s buffer holding the word; may be overwritten with "αγαν"
+   * @param len number of valid characters in <code>s</code>
+   * @return the new length
+   */
+  private int rule8(char s[], int len) {
+    boolean removed = false;
+
+    if (len > 8 && endsWith(s, len, "ιουντανε")) {
+      len -= 8;
+      removed = true;
+    } else if (len > 7 && (endsWith(s, len, "ιοντανε") ||
+        endsWith(s, len, "ουντανε") ||
+        endsWith(s, len, "ηθηκανε"))) {
+      len -= 7;
+      removed = true;
+    } else if (len > 6 && (endsWith(s, len, "ιοτανε") ||
+        endsWith(s, len, "οντανε") ||
+        endsWith(s, len, "ουσανε"))) {
+      len -= 6;
+      removed = true;
+    } else if (len > 5 && (endsWith(s, len, "αγανε") ||
+        endsWith(s, len, "ησανε") ||
+        endsWith(s, len, "οτανε") ||
+        endsWith(s, len, "ηκανε"))) {
+      len -= 5;
+      removed = true;
+    }
+
+    if (removed && exc8a.contains(s, 0, len)) {
+      // add -αγαν (we removed > 4 chars so it is safe to write into s)
+      len += 4;
+      s[len - 4] = 'α';
+      s[len - 3] = 'γ';
+      s[len - 2] = 'α';
+      s[len - 1] = 'ν';
+    }
+
+    if (len > 3 && endsWith(s, len, "ανε")) {
+      len -= 3;
+      if (endsWithVowelNoY(s, len) || exc8b.contains(s, 0, len)) {
+        len += 2; // add back -αν
+      }
+    }
+
+    return len;
+  }
+
+ private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
+ "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
+ "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),
+ false);
+
+ private int rule9(char s[], int len) {
+ if (len > 5 && endsWith(s, len, "ησετε"))
+ len -= 5;
+
+ if (len > 3 && endsWith(s, len, "ετε")) {
+ len -= 3;
+ if (exc9.contains(s, 0, len) ||
+ endsWithVowelNoY(s, len) ||
+ endsWith(s, len, "οδ") ||
+ endsWith(s, len, "αιρ") ||
+ endsWith(s, len, "φορ") ||
+ endsWith(s, len, "ταθ") ||
+ endsWith(s, len, "διαθ") ||
+ endsWith(s, len, "σχ") ||
+ endsWith(s, len, "ενδ") ||
+ endsWith(s, len, "ευρ") ||
+ endsWith(s, len, "τιθ") ||
+ endsWith(s, len, "υπερθ") ||
+ endsWith(s, len, "ραθ") ||
+ endsWith(s, len, "ενθ") ||
+ endsWith(s, len, "ροθ") ||
+ endsWith(s, len, "σθ") ||
+ endsWith(s, len, "πυρ") ||
+ endsWith(s, len, "αιν") ||
+ endsWith(s, len, "συνδ") ||
+ endsWith(s, len, "συν") ||
+ endsWith(s, len, "συνθ") ||
+ endsWith(s, len, "χωρ") ||
+ endsWith(s, len, "πον") ||
+ endsWith(s, len, "βρ") ||
+ endsWith(s, len, "καθ") ||
+ endsWith(s, len, "ευθ") ||
+ endsWith(s, len, "εκθ") ||
+ endsWith(s, len, "νετ") ||
+ endsWith(s, len, "ρον") ||
+ endsWith(s, len, "αρκ") ||
+ endsWith(s, len, "βαρ") ||
+ endsWith(s, len, "βολ") ||
+ endsWith(s, len, "ωφελ")) {
+ len += 2; // add back -ετ
+ }
+ }
+
+ return len;
+ }
+
+  /**
+   * Handles the endings -οντασ and -ωντασ. The ending is removed; if the
+   * remaining stem is exactly "αρχ", three characters are restored with the
+   * first set to 'ο', and if the stem ends in "κρε", three characters are
+   * restored with the first set to 'ω'.
+   */
+  private int rule10(char s[], int len) {
+    if (len <= 5 || !(endsWith(s, len, "οντασ") || endsWith(s, len, "ωντασ"))) {
+      return len;
+    }
+
+    int stemLen = len - 5;
+    if (stemLen == 3 && endsWith(s, stemLen, "αρχ")) {
+      // add back *ντ, forcing the vowel to ο
+      s[stemLen] = 'ο';
+      stemLen += 3;
+    }
+    if (endsWith(s, stemLen, "κρε")) {
+      // add back *ντ, forcing the vowel to ω
+      s[stemLen] = 'ω';
+      stemLen += 3;
+    }
+    return stemLen;
+  }
+
+  /**
+   * Handles the endings -ιομαστε and -ομαστε. The ending is removed; when the
+   * remaining stem is exactly "ον", "ομαστ" is kept after it instead.
+   * <p>
+   * The longer ending is tested first: every word ending in -ιομαστε also
+   * ends in -ομαστε, so testing the shorter ending first would make the
+   * -ιομαστε branch unreachable and leave a stray ι on the stem.
+   *
+   * @param s buffer holding the word; may be overwritten with "ομαστ"
+   * @param len number of valid characters in <code>s</code>
+   * @return the new length
+   */
+  private int rule11(char s[], int len) {
+    if (len > 7 && endsWith(s, len, "ιομαστε")) {
+      len -= 7;
+      if (len == 2 && endsWith(s, len, "ον")) {
+        // write -ομαστ over the removed ending (the ι must not come back)
+        len += 5;
+        s[len - 5] = 'ο';
+        s[len - 4] = 'μ';
+        s[len - 3] = 'α';
+        s[len - 2] = 'σ';
+        s[len - 1] = 'τ';
+      }
+    } else if (len > 6 && endsWith(s, len, "ομαστε")) {
+      len -= 6;
+      if (len == 2 && endsWith(s, len, "ον")) {
+        len += 5; // add back -ομαστ
+      }
+    }
+    return len;
+  }
+
+ private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
+ false);
+
+ private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
+ false);
+
+ private int rule12(char s[], int len) {
+ if (len > 5 && endsWith(s, len, "ιεστε")) {
+ len -= 5;
+ if (exc12a.contains(s, 0, len))
+ len += 4; // add back -ιεστ
+ }
+
+ if (len > 4 && endsWith(s, len, "εστε")) {
+ len -= 4;
+ if (exc12b.contains(s, 0, len))
+ len += 3; // add back -εστ
+ }
+
+ return len;
+ }
+
+ private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
+ false);
+
+ private int rule13(char s[], int len) {
+ if (len > 6 && endsWith(s, len, "ηθηκεσ")) {
+ len -= 6;
+ } else if (len > 5 && (endsWith(s, len, "ηθηκα") || endsWith(s, len, "ηθηκε"))) {
+ len -= 5;
+ }
+
+ boolean removed = false;
+
+ if (len > 4 && endsWith(s, len, "ηκεσ")) {
+ len -= 4;
+ removed = true;
+ } else if (len > 3 && (endsWith(s, len, "ηκα") || endsWith(s, len, "ηκε"))) {
+ len -= 3;
+ removed = true;
+ }
+
+ if (removed && (exc13.contains(s, 0, len)
+ || endsWith(s, len, "σκωλ")
+ || endsWith(s, len, "σκουλ")
+ || endsWith(s, len, "ναρθ")
+ || endsWith(s, len, "σφ")
+ || endsWith(s, len, "οθ")
+ || endsWith(s, len, "πιθ"))) {
+ len += 2; // add back the -ηκ
+ }
+
+ return len;
+ }
+
+ private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
+ "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
+ "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
+ "τσα"),
+ false);
+
+ private int rule14(char s[], int len) {
+ boolean removed = false;
+
+ if (len > 5 && endsWith(s, len, "ουσεσ")) {
+ len -= 5;
+ removed = true;
+ } else if (len > 4 && (endsWith(s, len, "ουσα") || endsWith(s, len, "ουσε"))) {
+ len -= 4;
+ removed = true;
+ }
+
+ if (removed && (exc14.contains(s, 0, len)
+ || endsWithVowel(s, len)
+ || endsWith(s, len, "ποδαρ")
+ || endsWith(s, len, "βλεπ")
+ || endsWith(s, len, "πανταχ")
+ || endsWith(s, len, "φρυδ")
+ || endsWith(s, len, "μαντιλ")
+ || endsWith(s, len, "μαλλ")
+ || endsWith(s, len, "κυματ")
+ || endsWith(s, len, "λαχ")
+ || endsWith(s, len, "ληγ")
+ || endsWith(s, len, "φαγ")
+ || endsWith(s, len, "ομ")
+ || endsWith(s, len, "πρωτ"))) {
+ len += 3; // add back -ουσ
+ }
+
+ return len;
+ }
+
+ private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
+ "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
+ "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
+ "συντ", "τ", "υποτ", "χαρ", "αειπ", "αιμοστ", "ανυπ", "αποτ",
+ "αρτιπ", "διατ", "εν", "επιτ", "κροκαλοπ", "σιδηροπ", "λ", "ναυ",
+ "ουλαμ", "ουρ", "π", "τρ", "μ"),
+ false);
+
+ private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("ψοφ", "ναυλοχ"),
+ false);
+
+  /**
+   * Handles the endings -αγεσ, -αγα and -αγε. The ending is removed; -αγ is
+   * kept only when the stem matches the first condition (listed in
+   * <code>exc15a</code> or ending in one of the listed sequences) and does
+   * not match the second one (listed in <code>exc15b</code> or ending in
+   * "κολλ").
+   */
+  private int rule15(char s[], int len) {
+    final int stemLen;
+    if (len > 4 && endsWith(s, len, "αγεσ")) {
+      stemLen = len - 4;
+    } else if (len > 3 && (endsWith(s, len, "αγα") || endsWith(s, len, "αγε"))) {
+      stemLen = len - 3;
+    } else {
+      return len;
+    }
+
+    // the second condition overrides the first
+    final boolean vetoed = exc15b.contains(s, 0, stemLen)
+        || endsWith(s, stemLen, "κολλ");
+    if (vetoed) {
+      return stemLen;
+    }
+
+    final boolean listed = exc15a.contains(s, 0, stemLen)
+        || endsWith(s, stemLen, "οφ")
+        || endsWith(s, stemLen, "πελ")
+        || endsWith(s, stemLen, "χορτ")
+        || endsWith(s, stemLen, "λλ")
+        || endsWith(s, stemLen, "σφ")
+        || endsWith(s, stemLen, "ρπ")
+        || endsWith(s, stemLen, "φρ")
+        || endsWith(s, stemLen, "πρ")
+        || endsWith(s, stemLen, "λοχ")
+        || endsWith(s, stemLen, "σμην");
+
+    // listed stems get -αγ added back
+    return listed ? stemLen + 2 : stemLen;
+  }
+
+ private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
+ false);
+
+ private int rule16(char s[], int len) {
+ boolean removed = false;
+ if (len > 4 && endsWith(s, len, "ησου")) {
+ len -= 4;
+ removed = true;
+ } else if (len > 3 && (endsWith(s, len, "ησε") || endsWith(s, len, "ησα"))) {
+ len -= 3;
+ removed = true;
+ }
+
+ if (removed && exc16.contains(s, 0, len))
+ len += 2; // add back -ησ
+
+ return len;
+ }
+
+ private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
+ false);
+
+ private int rule17(char s[], int len) {
+ if (len > 4 && endsWith(s, len, "ηστε")) {
+ len -= 4;
+ if (exc17.contains(s, 0, len))
+ len += 3; // add back the -ηστ
+ }
+
+ return len;
+ }
+
+ private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
+ false);
+
+ private int rule18(char s[], int len) {
+ boolean removed = false;
+
+ if (len > 6 && (endsWith(s, len, "ησουνε") || endsWith(s, len, "ηθουνε"))) {
+ len -= 6;
+ removed = true;
+ } else if (len > 4 && endsWith(s, len, "ουνε")) {
+ len -= 4;
+ removed = true;
+ }
+
+ if (removed && exc18.contains(s, 0, len)) {
+ len += 3;
+ s[len - 3] = 'ο';
+ s[len - 2] = 'υ';
+ s[len - 1] = 'ν';
+ }
+ return len;
+ }
+
+ private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
+ false);
+
+ private int rule19(char s[], int len) {
+ boolean removed = false;
+
+ if (len > 6 && (endsWith(s, len, "ησουμε") || endsWith(s, len, "ηθουμε"))) {
+ len -= 6;
+ removed = true;
+ } else if (len > 4 && endsWith(s, len, "ουμε")) {
+ len -= 4;
+ removed = true;
+ }
+
+ if (removed && exc19.contains(s, 0, len)) {
+ len += 3;
+ s[len - 3] = 'ο';
+ s[len - 2] = 'υ';
+ s[len - 1] = 'μ';
+ }
+ return len;
+ }
+
+  /**
+   * Handles the endings -ματων, -ματοσ and -ματα by dropping everything after
+   * the leading "μα" of the ending.
+   */
+  private int rule20(char s[], int len) {
+    if (len > 5 && (endsWith(s, len, "ματων") || endsWith(s, len, "ματοσ"))) {
+      return len - 3;
+    }
+    if (len > 4 && endsWith(s, len, "ματα")) {
+      return len - 2;
+    }
+    return len;
+  }
+
+ /**
+  * General suffix list. Suffixes are grouped by length and tested from the
+  * longest group (nine characters) down to a single trailing vowel; the
+  * first group that matches decides the result, so the order of the groups
+  * must not be changed. Each group requires the word to be longer than the
+  * suffix, so at least one character always remains.
+  *
+  * @param s buffer holding the word
+  * @param len number of valid characters in <code>s</code>
+  * @return the length without the matched suffix, or <code>len</code> if
+  *         nothing matched
+  */
+ private int rule21(char s[], int len) {
+ if (len > 9 && endsWith(s, len, "ιοντουσαν"))
+ return len - 9;
+
+ // eight-character suffixes
+ if (len > 8 && (endsWith(s, len, "ιομασταν") ||
+ endsWith(s, len, "ιοσασταν") ||
+ endsWith(s, len, "ιουμαστε") ||
+ endsWith(s, len, "οντουσαν")))
+ return len - 8;
+
+ // seven-character suffixes
+ if (len > 7 && (endsWith(s, len, "ιεμαστε") ||
+ endsWith(s, len, "ιεσαστε") ||
+ endsWith(s, len, "ιομουνα") ||
+ endsWith(s, len, "ιοσαστε") ||
+ endsWith(s, len, "ιοσουνα") ||
+ endsWith(s, len, "ιουνται") ||
+ endsWith(s, len, "ιουνταν") ||
+ endsWith(s, len, "ηθηκατε") ||
+ endsWith(s, len, "ομασταν") ||
+ endsWith(s, len, "οσασταν") ||
+ endsWith(s, len, "ουμαστε")))
+ return len - 7;
+
+ // six-character suffixes
+ if (len > 6 && (endsWith(s, len, "ιομουν") ||
+ endsWith(s, len, "ιονταν") ||
+ endsWith(s, len, "ιοσουν") ||
+ endsWith(s, len, "ηθειτε") ||
+ endsWith(s, len, "ηθηκαν") ||
+ endsWith(s, len, "ομουνα") ||
+ endsWith(s, len, "οσαστε") ||
+ endsWith(s, len, "οσουνα") ||
+ endsWith(s, len, "ουνται") ||
+ endsWith(s, len, "ουνταν") ||
+ endsWith(s, len, "ουσατε")))
+ return len - 6;
+
+ // five-character suffixes
+ if (len > 5 && (endsWith(s, len, "αγατε") ||
+ endsWith(s, len, "ιεμαι") ||
+ endsWith(s, len, "ιεται") ||
+ endsWith(s, len, "ιεσαι") ||
+ endsWith(s, len, "ιοταν") ||
+ endsWith(s, len, "ιουμα") ||
+ endsWith(s, len, "ηθεισ") ||
+ endsWith(s, len, "ηθουν") ||
+ endsWith(s, len, "ηκατε") ||
+ endsWith(s, len, "ησατε") ||
+ endsWith(s, len, "ησουν") ||
+ endsWith(s, len, "ομουν") ||
+ endsWith(s, len, "ονται") ||
+ endsWith(s, len, "ονταν") ||
+ endsWith(s, len, "οσουν") ||
+ endsWith(s, len, "ουμαι") ||
+ endsWith(s, len, "ουσαν")))
+ return len - 5;
+
+ // four-character suffixes
+ if (len > 4 && (endsWith(s, len, "αγαν") ||
+ endsWith(s, len, "αμαι") ||
+ endsWith(s, len, "ασαι") ||
+ endsWith(s, len, "αται") ||
+ endsWith(s, len, "ειτε") ||
+ endsWith(s, len, "εσαι") ||
+ endsWith(s, len, "εται") ||
+ endsWith(s, len, "ηδεσ") ||
+ endsWith(s, len, "ηδων") ||
+ endsWith(s, len, "ηθει") ||
+ endsWith(s, len, "ηκαν") ||
+ endsWith(s, len, "ησαν") ||
+ endsWith(s, len, "ησει") ||
+ endsWith(s, len, "ησεσ") ||
+ endsWith(s, len, "ομαι") ||
+ endsWith(s, len, "οταν")))
+ return len - 4;
+
+ // three-character suffixes
+ if (len > 3 && (endsWith(s, len, "αει") ||
+ endsWith(s, len, "εισ") ||
+ endsWith(s, len, "ηθω") ||
+ endsWith(s, len, "ησω") ||
+ endsWith(s, len, "ουν") ||
+ endsWith(s, len, "ουσ")))
+ return len - 3;
+
+ // two-character suffixes
+ if (len > 2 && (endsWith(s, len, "αν") ||
+ endsWith(s, len, "ασ") ||
+ endsWith(s, len, "αω") ||
+ endsWith(s, len, "ει") ||
+ endsWith(s, len, "εσ") ||
+ endsWith(s, len, "ησ") ||
+ endsWith(s, len, "οι") ||
+ endsWith(s, len, "οσ") ||
+ endsWith(s, len, "ου") ||
+ endsWith(s, len, "υσ") ||
+ endsWith(s, len, "ων")))
+ return len - 2;
+
+ // last resort: drop a single trailing vowel
+ if (len > 1 && endsWithVowel(s, len))
+ return len - 1;
+
+ return len;
+ }
+
+  /**
+   * Removes the comparative / superlative endings -εστερ, -εστατ, -οτερ,
+   * -οτατ, -υτερ, -υτατ, -ωτερ and -ωτατ from the stem produced by the
+   * earlier rules.
+   * <p>
+   * Like every other rule, the ending is only removed when the word is longer
+   * than the ending, so the result is never an empty stem.
+   *
+   * @param s buffer holding the word
+   * @param len number of valid characters in <code>s</code>
+   * @return the length without the ending, or <code>len</code> if nothing
+   *         matched
+   */
+  private int rule22(char s[], int len) {
+    if (len > 5 && (endsWith(s, len, "εστερ") ||
+        endsWith(s, len, "εστατ")))
+      return len - 5;
+
+    if (len > 4 && (endsWith(s, len, "οτερ") ||
+        endsWith(s, len, "οτατ") ||
+        endsWith(s, len, "υτερ") ||
+        endsWith(s, len, "υτατ") ||
+        endsWith(s, len, "ωτερ") ||
+        endsWith(s, len, "ωτατ")))
+      return len - 4;
+
+    return len;
+  }
+
+  /**
+   * Tests whether the first <code>len</code> characters of <code>s</code> end
+   * with <code>suffix</code>. Characters are compared from the last one
+   * backwards.
+   *
+   * @return false if the suffix is longer than <code>len</code> or any
+   *         character differs, true otherwise
+   */
+  private boolean endsWith(char s[], int len, String suffix) {
+    int suffixPos = suffix.length() - 1;
+    if (suffixPos + 1 > len) {
+      return false;
+    }
+
+    int wordPos = len - 1;
+    while (suffixPos >= 0) {
+      if (s[wordPos] != suffix.charAt(suffixPos)) {
+        return false;
+      }
+      wordPos--;
+      suffixPos--;
+    }
+    return true;
+  }
+
+ private boolean endsWithVowel(char s[], int len) {
+ if (len == 0)
+ return false;
+ switch(s[len - 1]) {
+ case 'α':
+ case 'ε':
+ case 'η':
+ case 'ι':
+ case 'ο':
+ case 'υ':
+ case 'ω':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ private boolean endsWithVowelNoY(char s[], int len) {
+ if (len == 0)
+ return false;
+ switch(s[len - 1]) {
+ case 'α':
+ case 'ε':
+ case 'η':
+ case 'ι':
+ case 'ο':
+ case 'ω':
+ return true;
+ default:
+ return false;
+ }
+ }
+}
diff --git a/modules/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt b/modules/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt
new file mode 100644
index 00000000000..1a08d318326
--- /dev/null
+++ b/modules/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt
@@ -0,0 +1,76 @@
+# Lucene Greek Stopwords list
+ο
+η
+το
+οι
+τα
+του
+τησ
+των
+τον
+την
+και
+κι
+κ
+ειμαι
+εισαι
+ειναι
+ειμαστε
+ειστε
+στο
+στον
+στη
+στην
+μα
+αλλα
+απο
+για
+προσ
+με
+σε
+ωσ
+παρα
+αντι
+κατα
+μετα
+θα
+να
+δε
+δεν
+μη
+μην
+επι
+ενω
+εαν
+αν
+τοτε
+που
+πωσ
+ποιοσ
+ποια
+ποιο
+ποιοι
+ποιεσ
+ποιων
+ποιουσ
+αυτοσ
+αυτη
+αυτο
+αυτοι
+αυτων
+αυτουσ
+αυτεσ
+αυτα
+εκεινοσ
+εκεινη
+εκεινο
+εκεινοι
+εκεινεσ
+εκεινα
+εκεινων
+εκεινουσ
+οπωσ
+ομωσ
+ισωσ
+οσο
+οτι
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
index 6f87c1fa5e1..69dbf0105b3 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
@@ -26,42 +26,67 @@ import org.apache.lucene.util.Version;
*/
public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
+ /**
+  * Checks the stemmed output of the analyzer for three Greek sample strings.
+  *
+  * @throws Exception in case an error occurs
+  */
+ public void testAnalyzer() throws Exception {
+   final Analyzer analyzer = new GreekAnalyzer(TEST_VERSION_CURRENT);
+   // Capitals and small accented letters must be normalized and the terms
+   // stemmed
+   final String[] stemmedSentence = { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
+       "ελληνικ", "γλωσσ" };
+   assertAnalyzesTo(analyzer, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας", stemmedSentence);
+   // Small letters with diaeresis must be normalized and punctuation marks
+   // eliminated
+   final String[] withoutPunctuation = { "προιοντ", "πολλαπλ", "αναγκ" };
+   assertAnalyzesTo(analyzer, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ", withoutPunctuation);
+   // Capital accented letters and capital letters with diaeresis must be
+   // normalized and stop words eliminated
+   final String[] withoutStopWords = { "προυποθεσ", "αψογ", "μεστ", "αλλ" };
+   assertAnalyzesTo(analyzer, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι", withoutStopWords);
+ }
+
/**
* Test the analysis of various greek strings.
*
* @throws Exception in case an error occurs
+ * @deprecated Remove this test when support for 3.0 is no longer needed
*/
- public void testAnalyzer() throws Exception {
- Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
+ @Deprecated
+ public void testAnalyzerBWCompat() throws Exception {
+ Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
// Verify the correct analysis of capitals and small accented letters
- assertAnalyzesTo(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
- new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
- "\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
+ assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
+ new String[] { "μια", "εξαιρετικα", "καλη", "πλουσια", "σειρα", "χαρακτηρων",
+ "ελληνικησ", "γλωσσασ" });
// Verify the correct analysis of small letters with diaeresis and the elimination
// of punctuation marks
- assertAnalyzesTo(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
- new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
- // Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
+ assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
+ new String[] { "προιοντα", "πολλαπλεσ", "αναγκεσ" });
+ // Verify the correct analysis of capital accented letters and capital letters with diaeresis,
// as well as the elimination of stop words
- assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
- new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
+ assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
+ new String[] { "προυποθεσεισ", "αψογοσ", "μεστοσ", "αλλοι" });
}
- public void testReusableTokenStream() throws Exception {
- Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
- // Verify the correct analysis of capitals and small accented letters
- assertAnalyzesToReuse(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
- new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
- "\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
- // Verify the correct analysis of small letters with diaeresis and the elimination
- // of punctuation marks
- assertAnalyzesToReuse(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
- new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
- // Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
- // as well as the elimination of stop words
- assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
- new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
- }
+ /** Runs three Greek samples through assertAnalyzesToReuse with one shared analyzer. */
+ public void testReusableTokenStream() throws Exception {
+   final Analyzer analyzer = new GreekAnalyzer(TEST_VERSION_CURRENT);
+   // Capitals and small accented letters must be normalized and the terms
+   // stemmed
+   final String[] stemmedSentence = { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ",
+       "ελληνικ", "γλωσσ" };
+   assertAnalyzesToReuse(analyzer, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας", stemmedSentence);
+   // Small letters with diaeresis must be normalized and punctuation marks
+   // eliminated
+   final String[] withoutPunctuation = { "προιοντ", "πολλαπλ", "αναγκ" };
+   assertAnalyzesToReuse(analyzer, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ", withoutPunctuation);
+   // Capital accented letters and capitals with diaeresis are normalized, stop words eliminated
+   final String[] withoutStopWords = { "προυποθεσ", "αψογ", "μεστ", "αλλ" };
+   assertAnalyzesToReuse(analyzer, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι", withoutStopWords);
+ }
/**
* Greek Analyzer didn't call standardFilter, so no normalization of acronyms.
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java
new file mode 100644
index 00000000000..1b95c29b31a
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java
@@ -0,0 +1,508 @@
+package org.apache.lucene.analysis.el;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestGreekStemmer extends BaseTokenStreamTestCase {
+ Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
+
+ public void testMasculineNouns() throws Exception {
+ // Masculine noun paradigms, grouped by ending; first group: -ος
+ checkOneTerm(a, "άνθρωπος", "ανθρωπ");
+ checkOneTerm(a, "ανθρώπου", "ανθρωπ");
+ checkOneTerm(a, "άνθρωπο", "ανθρωπ");
+ checkOneTerm(a, "άνθρωπε", "ανθρωπ");
+ checkOneTerm(a, "άνθρωποι", "ανθρωπ");
+ checkOneTerm(a, "ανθρώπων", "ανθρωπ");
+ checkOneTerm(a, "ανθρώπους", "ανθρωπ");
+ checkOneTerm(a, "άνθρωποι", "ανθρωπ");
+
+ // -ης
+ checkOneTerm(a, "πελάτης", "πελατ");
+ checkOneTerm(a, "πελάτη", "πελατ");
+ checkOneTerm(a, "πελάτες", "πελατ");
+ checkOneTerm(a, "πελατών", "πελατ");
+
+ // -ας/-ες
+ checkOneTerm(a, "ελέφαντας", "ελεφαντ");
+ checkOneTerm(a, "ελέφαντα", "ελεφαντ");
+ checkOneTerm(a, "ελέφαντες", "ελεφαντ");
+ checkOneTerm(a, "ελεφάντων", "ελεφαντ");
+
+ // -ας/-αδες
+ checkOneTerm(a, "μπαμπάς", "μπαμπ");
+ checkOneTerm(a, "μπαμπά", "μπαμπ");
+ checkOneTerm(a, "μπαμπάδες", "μπαμπ");
+ checkOneTerm(a, "μπαμπάδων", "μπαμπ");
+
+ // -ης/-ηδες
+ checkOneTerm(a, "μπακάλης", "μπακαλ");
+ checkOneTerm(a, "μπακάλη", "μπακαλ");
+ checkOneTerm(a, "μπακάληδες", "μπακαλ");
+ checkOneTerm(a, "μπακάληδων", "μπακαλ");
+
+ // -ες
+ checkOneTerm(a, "καφές", "καφ");
+ checkOneTerm(a, "καφέ", "καφ");
+ checkOneTerm(a, "καφέδες", "καφ");
+ checkOneTerm(a, "καφέδων", "καφ");
+
+ // -έας/είς
+ checkOneTerm(a, "γραμματέας", "γραμματε");
+ checkOneTerm(a, "γραμματέα", "γραμματε");
+ // plural forms conflate with each other, not with the singular forms
+ checkOneTerm(a, "γραμματείς", "γραμματ");
+ checkOneTerm(a, "γραμματέων", "γραμματ");
+
+ // -ους/οι
+ checkOneTerm(a, "απόπλους", "αποπλ");
+ checkOneTerm(a, "απόπλου", "αποπλ");
+ checkOneTerm(a, "απόπλοι", "αποπλ");
+ checkOneTerm(a, "απόπλων", "αποπλ");
+
+ // -ους/-ουδες
+ checkOneTerm(a, "παππούς", "παππ");
+ checkOneTerm(a, "παππού", "παππ");
+ checkOneTerm(a, "παππούδες", "παππ");
+ checkOneTerm(a, "παππούδων", "παππ");
+
+ // -ης/-εις
+ checkOneTerm(a, "λάτρης", "λατρ");
+ checkOneTerm(a, "λάτρη", "λατρ");
+ checkOneTerm(a, "λάτρεις", "λατρ");
+ checkOneTerm(a, "λάτρεων", "λατρ");
+
+ // -υς
+ checkOneTerm(a, "πέλεκυς", "πελεκ");
+ checkOneTerm(a, "πέλεκυ", "πελεκ");
+ checkOneTerm(a, "πελέκεις", "πελεκ");
+ checkOneTerm(a, "πελέκεων", "πελεκ");
+
+ // -ωρ
+ // note: nom./voc. doesn't conflate with the rest
+ checkOneTerm(a, "μέντωρ", "μεντωρ");
+ checkOneTerm(a, "μέντορος", "μεντορ");
+ checkOneTerm(a, "μέντορα", "μεντορ");
+ checkOneTerm(a, "μέντορες", "μεντορ");
+ checkOneTerm(a, "μεντόρων", "μεντορ");
+
+ // -ων
+ checkOneTerm(a, "αγώνας", "αγων");
+ checkOneTerm(a, "αγώνος", "αγων");
+ checkOneTerm(a, "αγώνα", "αγων");
+ checkOneTerm(a, "αγώνα", "αγων");
+ checkOneTerm(a, "αγώνες", "αγων");
+ checkOneTerm(a, "αγώνων", "αγων");
+
+ // -ας/-ηδες
+ checkOneTerm(a, "αέρας", "αερ");
+ checkOneTerm(a, "αέρα", "αερ");
+ checkOneTerm(a, "αέρηδες", "αερ");
+ checkOneTerm(a, "αέρηδων", "αερ");
+
+ // -ης/-ητες
+ checkOneTerm(a, "γόης", "γο");
+ checkOneTerm(a, "γόη", "γοη"); // too short: the expected term is the normalized input, unstemmed
+ // the two plural forms conflate
+ checkOneTerm(a, "γόητες", "γοητ");
+ checkOneTerm(a, "γοήτων", "γοητ");
+ }
+
+ public void testFeminineNouns() throws Exception {
+ // Feminine noun paradigms, grouped by ending; first group: -α/-ες,-ών
+ checkOneTerm(a, "φορά", "φορ");
+ checkOneTerm(a, "φοράς", "φορ");
+ checkOneTerm(a, "φορές", "φορ");
+ checkOneTerm(a, "φορών", "φορ");
+
+ // -α/-ες,-ων
+ checkOneTerm(a, "αγελάδα", "αγελαδ");
+ checkOneTerm(a, "αγελάδας", "αγελαδ");
+ checkOneTerm(a, "αγελάδες", "αγελαδ");
+ checkOneTerm(a, "αγελάδων", "αγελαδ");
+
+ // -η/-ες
+ checkOneTerm(a, "ζάχαρη", "ζαχαρ");
+ checkOneTerm(a, "ζάχαρης", "ζαχαρ");
+ checkOneTerm(a, "ζάχαρες", "ζαχαρ");
+ checkOneTerm(a, "ζαχάρεων", "ζαχαρ");
+
+ // -η/-εις
+ checkOneTerm(a, "τηλεόραση", "τηλεορασ");
+ checkOneTerm(a, "τηλεόρασης", "τηλεορασ");
+ checkOneTerm(a, "τηλεοράσεις", "τηλεορασ");
+ checkOneTerm(a, "τηλεοράσεων", "τηλεορασ");
+
+ // -α/-αδες
+ checkOneTerm(a, "μαμά", "μαμ");
+ checkOneTerm(a, "μαμάς", "μαμ");
+ checkOneTerm(a, "μαμάδες", "μαμ");
+ checkOneTerm(a, "μαμάδων", "μαμ");
+
+ // -ος
+ checkOneTerm(a, "λεωφόρος", "λεωφορ");
+ checkOneTerm(a, "λεωφόρου", "λεωφορ");
+ checkOneTerm(a, "λεωφόρο", "λεωφορ");
+ checkOneTerm(a, "λεωφόρε", "λεωφορ");
+ checkOneTerm(a, "λεωφόροι", "λεωφορ");
+ checkOneTerm(a, "λεωφόρων", "λεωφορ");
+ checkOneTerm(a, "λεωφόρους", "λεωφορ");
+
+ // -ου
+ checkOneTerm(a, "αλεπού", "αλεπ");
+ checkOneTerm(a, "αλεπούς", "αλεπ");
+ checkOneTerm(a, "αλεπούδες", "αλεπ");
+ checkOneTerm(a, "αλεπούδων", "αλεπ");
+
+ // -έας/είς
+ // note: not all forms conflate (two distinct stems are expected below)
+ checkOneTerm(a, "γραμματέας", "γραμματε");
+ checkOneTerm(a, "γραμματέως", "γραμματ");
+ checkOneTerm(a, "γραμματέα", "γραμματε");
+ checkOneTerm(a, "γραμματείς", "γραμματ");
+ checkOneTerm(a, "γραμματέων", "γραμματ");
+ }
+
+ public void testNeuterNouns() throws Exception {
+ // Neuter noun paradigms, grouped by ending; first group: ending with -ο
+ // note: nom. doesn't conflate
+ checkOneTerm(a, "βιβλίο", "βιβλι");
+ checkOneTerm(a, "βιβλίου", "βιβλ");
+ checkOneTerm(a, "βιβλία", "βιβλ");
+ checkOneTerm(a, "βιβλίων", "βιβλ");
+
+ // ending with -ι
+ checkOneTerm(a, "πουλί", "πουλ");
+ checkOneTerm(a, "πουλιού", "πουλ");
+ checkOneTerm(a, "πουλιά", "πουλ");
+ checkOneTerm(a, "πουλιών", "πουλ");
+
+ // ending with -α
+ // note: nom. doesn't conflate
+ checkOneTerm(a, "πρόβλημα", "προβλημ");
+ checkOneTerm(a, "προβλήματος", "προβλημα");
+ checkOneTerm(a, "προβλήματα", "προβλημα");
+ checkOneTerm(a, "προβλημάτων", "προβλημα");
+
+ // ending with -ος/-ους
+ checkOneTerm(a, "πέλαγος", "πελαγ");
+ checkOneTerm(a, "πελάγους", "πελαγ");
+ checkOneTerm(a, "πελάγη", "πελαγ");
+ checkOneTerm(a, "πελάγων", "πελαγ");
+
+ // ending with -ός/-ότος
+ checkOneTerm(a, "γεγονός", "γεγον");
+ checkOneTerm(a, "γεγονότος", "γεγον");
+ checkOneTerm(a, "γεγονότα", "γεγον");
+ checkOneTerm(a, "γεγονότων", "γεγον");
+
+ // ending with -υ/-ιου
+ checkOneTerm(a, "βράδυ", "βραδ");
+ checkOneTerm(a, "βράδι", "βραδ");
+ checkOneTerm(a, "βραδιού", "βραδ");
+ checkOneTerm(a, "βράδια", "βραδ");
+ checkOneTerm(a, "βραδιών", "βραδ");
+
+ // ending with -υ/-ατος
+ // note: nom. doesn't conflate
+ checkOneTerm(a, "δόρυ", "δορ");
+ checkOneTerm(a, "δόρατος", "δορατ");
+ checkOneTerm(a, "δόρατα", "δορατ");
+ checkOneTerm(a, "δοράτων", "δορατ");
+
+ // ending with -ας
+ checkOneTerm(a, "κρέας", "κρε");
+ checkOneTerm(a, "κρέατος", "κρε");
+ checkOneTerm(a, "κρέατα", "κρε");
+ checkOneTerm(a, "κρεάτων", "κρε");
+
+ // ending with -ως
+ checkOneTerm(a, "λυκόφως", "λυκοφω");
+ checkOneTerm(a, "λυκόφωτος", "λυκοφω");
+ checkOneTerm(a, "λυκόφωτα", "λυκοφω");
+ checkOneTerm(a, "λυκοφώτων", "λυκοφω");
+
+ // ending with -ον/-ου
+ // note: nom. doesn't conflate
+ checkOneTerm(a, "μέσον", "μεσον");
+ checkOneTerm(a, "μέσου", "μεσ");
+ checkOneTerm(a, "μέσα", "μεσ");
+ checkOneTerm(a, "μέσων", "μεσ");
+
+ // ending in -ον/-οντος
+ // note: nom. doesn't conflate
+ checkOneTerm(a, "ενδιαφέρον", "ενδιαφερον");
+ checkOneTerm(a, "ενδιαφέροντος", "ενδιαφεροντ");
+ checkOneTerm(a, "ενδιαφέροντα", "ενδιαφεροντ");
+ checkOneTerm(a, "ενδιαφερόντων", "ενδιαφεροντ");
+
+ // ending with -εν/-εντος
+ checkOneTerm(a, "ανακοινωθέν", "ανακοινωθεν");
+ checkOneTerm(a, "ανακοινωθέντος", "ανακοινωθεντ");
+ checkOneTerm(a, "ανακοινωθέντα", "ανακοινωθεντ");
+ checkOneTerm(a, "ανακοινωθέντων", "ανακοινωθεντ");
+
+ // ending with -αν/-αντος
+ checkOneTerm(a, "σύμπαν", "συμπ");
+ checkOneTerm(a, "σύμπαντος", "συμπαντ");
+ checkOneTerm(a, "σύμπαντα", "συμπαντ");
+ checkOneTerm(a, "συμπάντων", "συμπαντ");
+
+ // ending with -α/-ακτος
+ checkOneTerm(a, "γάλα", "γαλ");
+ checkOneTerm(a, "γάλακτος", "γαλακτ");
+ checkOneTerm(a, "γάλατα", "γαλατ");
+ checkOneTerm(a, "γαλάκτων", "γαλακτ");
+ }
+
+ public void testAdjectives() throws Exception {
+ // Adjective paradigms, grouped by ending; first group: -ής, -ές/-είς, -ή
+ checkOneTerm(a, "συνεχής", "συνεχ");
+ checkOneTerm(a, "συνεχούς", "συνεχ");
+ checkOneTerm(a, "συνεχή", "συνεχ");
+ checkOneTerm(a, "συνεχών", "συνεχ");
+ checkOneTerm(a, "συνεχείς", "συνεχ");
+ checkOneTerm(a, "συνεχές", "συνεχ");
+
+ // ending with -ης, -ες/-εις, -η
+ checkOneTerm(a, "συνήθης", "συνηθ");
+ checkOneTerm(a, "συνήθους", "συνηθ");
+ checkOneTerm(a, "συνήθη", "συνηθ");
+ // note: the next form doesn't conflate with the others
+ checkOneTerm(a, "συνήθεις", "συν");
+ checkOneTerm(a, "συνήθων", "συνηθ");
+ checkOneTerm(a, "σύνηθες", "συνηθ");
+
+ // ending with -υς, -υ/-εις, -ια
+ checkOneTerm(a, "βαθύς", "βαθ");
+ checkOneTerm(a, "βαθέος", "βαθε");
+ checkOneTerm(a, "βαθύ", "βαθ");
+ checkOneTerm(a, "βαθείς", "βαθ");
+ checkOneTerm(a, "βαθέων", "βαθ");
+
+ checkOneTerm(a, "βαθιά", "βαθ");
+ checkOneTerm(a, "βαθιάς", "βαθι");
+ checkOneTerm(a, "βαθιές", "βαθι");
+ checkOneTerm(a, "βαθιών", "βαθ");
+
+ checkOneTerm(a, "βαθέα", "βαθε");
+
+ // comparative/superlative forms are expected to share the stem of the positive form
+ checkOneTerm(a, "ψηλός", "ψηλ");
+ checkOneTerm(a, "ψηλότερος", "ψηλ");
+ checkOneTerm(a, "ψηλότατος", "ψηλ");
+
+ checkOneTerm(a, "ωραίος", "ωραι");
+ checkOneTerm(a, "ωραιότερος", "ωραι");
+ checkOneTerm(a, "ωραιότατος", "ωραι");
+
+ checkOneTerm(a, "επιεικής", "επιεικ");
+ checkOneTerm(a, "επιεικέστερος", "επιεικ");
+ checkOneTerm(a, "επιεικέστατος", "επιεικ");
+ }
+
+
+ public void testVerbs() throws Exception {
+ // Verb conjugations, grouped by pattern. Note: past/present verb stems will not conflate (from the paper)
+ //-ω,-α/-.ω,-.α
+ checkOneTerm(a, "ορίζω", "οριζ");
+ checkOneTerm(a, "όριζα", "οριζ");
+ checkOneTerm(a, "όριζε", "οριζ");
+ checkOneTerm(a, "ορίζοντας", "οριζ");
+ checkOneTerm(a, "ορίζομαι", "οριζ");
+ checkOneTerm(a, "οριζόμουν", "οριζ");
+ checkOneTerm(a, "ορίζεσαι", "οριζ");
+
+ checkOneTerm(a, "όρισα", "ορισ");
+ checkOneTerm(a, "ορίσω", "ορισ");
+ checkOneTerm(a, "όρισε", "ορισ");
+ checkOneTerm(a, "ορίσει", "ορισ");
+
+ checkOneTerm(a, "ορίστηκα", "οριστ");
+ checkOneTerm(a, "οριστώ", "οριστ");
+ checkOneTerm(a, "οριστείς", "οριστ");
+ checkOneTerm(a, "οριστεί", "οριστ");
+
+ checkOneTerm(a, "ορισμένο", "ορισμεν");
+ checkOneTerm(a, "ορισμένη", "ορισμεν");
+ checkOneTerm(a, "ορισμένος", "ορισμεν");
+
+ // -ω,-α/-ξω,-ξα
+ checkOneTerm(a, "ανοίγω", "ανοιγ");
+ checkOneTerm(a, "άνοιγα", "ανοιγ");
+ checkOneTerm(a, "άνοιγε", "ανοιγ");
+ checkOneTerm(a, "ανοίγοντας", "ανοιγ");
+ checkOneTerm(a, "ανοίγομαι", "ανοιγ");
+ checkOneTerm(a, "ανοιγόμουν", "ανοιγ");
+
+ checkOneTerm(a, "άνοιξα", "ανοιξ");
+ checkOneTerm(a, "ανοίξω", "ανοιξ");
+ checkOneTerm(a, "άνοιξε", "ανοιξ");
+ checkOneTerm(a, "ανοίξει", "ανοιξ");
+
+ checkOneTerm(a, "ανοίχτηκα", "ανοιχτ");
+ checkOneTerm(a, "ανοιχτώ", "ανοιχτ");
+ checkOneTerm(a, "ανοίχτηκα", "ανοιχτ");
+ checkOneTerm(a, "ανοιχτείς", "ανοιχτ");
+ checkOneTerm(a, "ανοιχτεί", "ανοιχτ");
+
+ checkOneTerm(a, "ανοίξου", "ανοιξ");
+
+ //-ώ/-άω,-ούσα/-άσω,-ασα
+ checkOneTerm(a, "περνώ", "περν");
+ checkOneTerm(a, "περνάω", "περν");
+ checkOneTerm(a, "περνούσα", "περν");
+ checkOneTerm(a, "πέρναγα", "περν");
+ checkOneTerm(a, "πέρνα", "περν");
+ checkOneTerm(a, "περνώντας", "περν");
+
+ checkOneTerm(a, "πέρασα", "περασ");
+ checkOneTerm(a, "περάσω", "περασ");
+ checkOneTerm(a, "πέρασε", "περασ");
+ checkOneTerm(a, "περάσει", "περασ");
+
+ checkOneTerm(a, "περνιέμαι", "περν");
+ checkOneTerm(a, "περνιόμουν", "περν");
+
+ checkOneTerm(a, "περάστηκα", "περαστ");
+ checkOneTerm(a, "περαστώ", "περαστ");
+ checkOneTerm(a, "περαστείς", "περαστ");
+ checkOneTerm(a, "περαστεί", "περαστ");
+
+ checkOneTerm(a, "περασμένο", "περασμεν");
+ checkOneTerm(a, "περασμένη", "περασμεν");
+ checkOneTerm(a, "περασμένος", "περασμεν");
+
+ // -ώ/-άω,-ούσα/-άξω,-αξα
+ checkOneTerm(a, "πετώ", "πετ");
+ checkOneTerm(a, "πετάω", "πετ");
+ checkOneTerm(a, "πετούσα", "πετ");
+ checkOneTerm(a, "πέταγα", "πετ");
+ checkOneTerm(a, "πέτα", "πετ");
+ checkOneTerm(a, "πετώντας", "πετ");
+ checkOneTerm(a, "πετιέμαι", "πετ");
+ checkOneTerm(a, "πετιόμουν", "πετ");
+
+ checkOneTerm(a, "πέταξα", "πεταξ");
+ checkOneTerm(a, "πετάξω", "πεταξ");
+ checkOneTerm(a, "πέταξε", "πεταξ");
+ checkOneTerm(a, "πετάξει", "πεταξ");
+
+ checkOneTerm(a, "πετάχτηκα", "πεταχτ");
+ checkOneTerm(a, "πεταχτώ", "πεταχτ");
+ checkOneTerm(a, "πεταχτείς", "πεταχτ");
+ checkOneTerm(a, "πεταχτεί", "πεταχτ");
+
+ checkOneTerm(a, "πεταμένο", "πεταμεν");
+ checkOneTerm(a, "πεταμένη", "πεταμεν");
+ checkOneTerm(a, "πεταμένος", "πεταμεν");
+
+ // -ώ/-άω,-ούσα / -έσω,-εσα
+ checkOneTerm(a, "καλώ", "καλ");
+ checkOneTerm(a, "καλούσα", "καλ");
+ checkOneTerm(a, "καλείς", "καλ");
+ checkOneTerm(a, "καλώντας", "καλ");
+
+ checkOneTerm(a, "καλούμαι", "καλ");
+ // pass. imperfect / imp. progressive doesn't conflate
+ checkOneTerm(a, "καλούμουν", "καλουμ");
+ checkOneTerm(a, "καλείσαι", "καλεισα");
+
+ checkOneTerm(a, "καλέστηκα", "καλεστ");
+ checkOneTerm(a, "καλεστώ", "καλεστ");
+ checkOneTerm(a, "καλεστείς", "καλεστ");
+ checkOneTerm(a, "καλεστεί", "καλεστ");
+
+ checkOneTerm(a, "καλεσμένο", "καλεσμεν");
+ checkOneTerm(a, "καλεσμένη", "καλεσμεν");
+ checkOneTerm(a, "καλεσμένος", "καλεσμεν");
+
+ checkOneTerm(a, "φορώ", "φορ");
+ checkOneTerm(a, "φοράω", "φορ");
+ checkOneTerm(a, "φορούσα", "φορ");
+ checkOneTerm(a, "φόραγα", "φορ");
+ checkOneTerm(a, "φόρα", "φορ");
+ checkOneTerm(a, "φορώντας", "φορ");
+ checkOneTerm(a, "φοριέμαι", "φορ");
+ checkOneTerm(a, "φοριόμουν", "φορ");
+ checkOneTerm(a, "φοριέσαι", "φορ");
+
+ checkOneTerm(a, "φόρεσα", "φορεσ");
+ checkOneTerm(a, "φορέσω", "φορεσ");
+ checkOneTerm(a, "φόρεσε", "φορεσ");
+ checkOneTerm(a, "φορέσει", "φορεσ");
+
+ checkOneTerm(a, "φορέθηκα", "φορεθ");
+ checkOneTerm(a, "φορεθώ", "φορεθ");
+ checkOneTerm(a, "φορεθείς", "φορεθ");
+ checkOneTerm(a, "φορεθεί", "φορεθ");
+
+ checkOneTerm(a, "φορεμένο", "φορεμεν");
+ checkOneTerm(a, "φορεμένη", "φορεμεν");
+ checkOneTerm(a, "φορεμένος", "φορεμεν");
+
+ // -ώ/-άω,-ούσα / -ήσω,-ησα
+ checkOneTerm(a, "κρατώ", "κρατ");
+ checkOneTerm(a, "κρατάω", "κρατ");
+ checkOneTerm(a, "κρατούσα", "κρατ");
+ checkOneTerm(a, "κράταγα", "κρατ");
+ checkOneTerm(a, "κράτα", "κρατ");
+ checkOneTerm(a, "κρατώντας", "κρατ");
+
+ checkOneTerm(a, "κράτησα", "κρατ");
+ checkOneTerm(a, "κρατήσω", "κρατ");
+ checkOneTerm(a, "κράτησε", "κρατ");
+ checkOneTerm(a, "κρατήσει", "κρατ");
+
+ checkOneTerm(a, "κρατούμαι", "κρατ");
+ checkOneTerm(a, "κρατιέμαι", "κρατ");
+ // this imperfect form doesn't conflate
+ checkOneTerm(a, "κρατούμουν", "κρατουμ");
+ checkOneTerm(a, "κρατιόμουν", "κρατ");
+ // this imp. prog form doesn't conflate
+ checkOneTerm(a, "κρατείσαι", "κρατεισα");
+
+ checkOneTerm(a, "κρατήθηκα", "κρατ");
+ checkOneTerm(a, "κρατηθώ", "κρατ");
+ checkOneTerm(a, "κρατηθείς", "κρατ");
+ checkOneTerm(a, "κρατηθεί", "κρατ");
+ checkOneTerm(a, "κρατήσου", "κρατ");
+
+ checkOneTerm(a, "κρατημένο", "κρατημεν");
+ checkOneTerm(a, "κρατημένη", "κρατημεν");
+ checkOneTerm(a, "κρατημένος", "κρατημεν");
+
+ // -.μαι,-.μουν / -.ώ,-.ηκα
+ checkOneTerm(a, "κοιμάμαι", "κοιμ");
+ checkOneTerm(a, "κοιμόμουν", "κοιμ");
+ checkOneTerm(a, "κοιμάσαι", "κοιμ");
+
+ checkOneTerm(a, "κοιμήθηκα", "κοιμ");
+ checkOneTerm(a, "κοιμηθώ", "κοιμ");
+ checkOneTerm(a, "κοιμήσου", "κοιμ");
+ checkOneTerm(a, "κοιμηθεί", "κοιμ");
+
+ checkOneTerm(a, "κοιμισμένο", "κοιμισμεν");
+ checkOneTerm(a, "κοιμισμένη", "κοιμισμεν");
+ checkOneTerm(a, "κοιμισμένος", "κοιμισμεν");
+ }
+
+ public void testExceptions() throws Exception { // NOTE(review): presumably exercises the stemmer's exception word lists — confirm
+ checkOneTerm(a, "καθεστώτα", "καθεστ");
+ checkOneTerm(a, "καθεστώτος", "καθεστ");
+ checkOneTerm(a, "καθεστώς", "καθεστ");
+ checkOneTerm(a, "καθεστώτων", "καθεστ");
+ // both members of each pair below must yield the same stem
+ checkOneTerm(a, "χουμε", "χουμ");
+ checkOneTerm(a, "χουμ", "χουμ");
+
+ checkOneTerm(a, "υποταγεσ", "υποταγ");
+ checkOneTerm(a, "υποταγ", "υποταγ");
+
+ checkOneTerm(a, "εμετε", "εμετ");
+ checkOneTerm(a, "εμετ", "εμετ");
+
+ checkOneTerm(a, "αρχοντασ", "αρχοντ");
+ checkOneTerm(a, "αρχοντων", "αρχοντ");
+ }
+}
diff --git a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java
index 61fc2c06ad1..c9dd4101021 100644
--- a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java
@@ -33,6 +33,7 @@ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory
@Override
public void init(Map