diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index c1da64a3e9b..ee3595cab49 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -157,6 +157,9 @@ New features * LUCENE-2393: The HighFreqTerms tool (in misc) can now optionally also include the total termFreq. (Tom Burton-West via Mike McCandless) + * LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words + when Version is set to 3.1 or higher. (Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java index 477881b005f..4dcf341e2f2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java @@ -16,9 +16,7 @@ package org.apache.lucene.analysis.el; * limitations under the License. */ - import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; @@ -28,8 +26,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.util.Version; +import java.io.IOException; import java.io.Reader; -import java.util.Arrays; import java.util.Map; import java.util.Set; @@ -45,7 +43,7 @@ import java.util.Set; *

You must specify the required {@link Version} * compatibility when creating GreekAnalyzer: *

@@ -53,73 +51,74 @@ import java.util.Set; *

NOTE: This class uses the same {@link Version} * dependent settings as {@link StandardAnalyzer}.

*/ -public final class GreekAnalyzer extends StopwordAnalyzerBase -{ - /** - * List of typical Greek stopwords. - */ - private static final String[] GREEK_STOP_WORDS = { - "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον", "την", "και", - "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε", "στο", "στον", - "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με", "σε", "ωσ", - "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν", "μη", "μην", - "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ", "ποια", "ποιο", - "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη", "αυτο", "αυτοι", - "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη", "εκεινο", - "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ", - "ισωσ", "οσο", "οτι" - }; +public final class GreekAnalyzer extends StopwordAnalyzerBase { + /** File containing default Greek stopwords. */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Returns a set of default Greek-stopwords + * @return a set of default Greek-stopwords + */ + public static final Set getDefaultStopSet(){ + return DefaultSetHolder.DEFAULT_SET; + } + + private static class DefaultSetHolder { + private static final Set DEFAULT_SET; - /** - * Returns a set of default Greek-stopwords - * @return a set of default Greek-stopwords - */ - public static final Set getDefaultStopSet(){ - return DefaultSetHolder.DEFAULT_SET; + static { + try { + DEFAULT_SET = loadStopwordSet(false, GreekAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); + } catch (IOException ex) { + // default set should always be present as it is part of the + // distribution (JAR); keep the IOException as the cause so a failure can be diagnosed + throw new RuntimeException("Unable to load default stopword set", ex); + } } - - private static class DefaultSetHolder { - private static final Set DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet( - Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false)); - } - - public GreekAnalyzer(Version matchVersion) { - 
this(matchVersion, DefaultSetHolder.DEFAULT_SET); - } - - /** - * Builds an analyzer with the given stop words - * - * @param matchVersion - * lucene compatibility version - * @param stopwords - * a stopword set - */ - public GreekAnalyzer(Version matchVersion, Set stopwords) { - super(matchVersion, stopwords); - } - - /** - * Builds an analyzer with the given stop words. - * @param stopwords Array of stopwords to use. - * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead - */ - @Deprecated - public GreekAnalyzer(Version matchVersion, String... stopwords) - { - this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords)); - } - - /** - * Builds an analyzer with the given stop words. - * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead - */ - @Deprecated - public GreekAnalyzer(Version matchVersion, Map stopwords) - { - this(matchVersion, stopwords.keySet()); - } - + } + + /** + * Builds an analyzer with the default stop words. + * @param matchVersion Lucene compatibility version, + * See above + */ + public GreekAnalyzer(Version matchVersion) { + this(matchVersion, DefaultSetHolder.DEFAULT_SET); + } + + /** + * Builds an analyzer with the given stop words. + *

+ * NOTE: The stopwords set should be pre-processed with the logic of + * {@link GreekLowerCaseFilter} for best results. + * + * @param matchVersion Lucene compatibility version, + * See above + * @param stopwords a stopword set + */ + public GreekAnalyzer(Version matchVersion, Set stopwords) { + super(matchVersion, stopwords); + } + + /** + * Builds an analyzer with the given stop words. + * @param stopwords Array of stopwords to use. + * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead + */ + @Deprecated + public GreekAnalyzer(Version matchVersion, String... stopwords) { + this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords)); + } + + /** + * Builds an analyzer with the given stop words. + * @deprecated use {@link #GreekAnalyzer(Version, Set)} instead + */ + @Deprecated + public GreekAnalyzer(Version matchVersion, Map stopwords) { + this(matchVersion, stopwords.keySet()); + } + /** * Creates * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} @@ -127,16 +126,19 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase * * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with - * {@link GreekLowerCaseFilter}, {@link StandardFilter} and - * {@link StopFilter} + * {@link GreekLowerCaseFilter}, {@link StandardFilter}, + * {@link StopFilter}, and {@link GreekStemFilter} */ - @Override - protected TokenStreamComponents createComponents(String fieldName, - Reader reader) { - final Tokenizer source = new StandardTokenizer(matchVersion, reader); - TokenStream result = new GreekLowerCaseFilter(source); - if (matchVersion.onOrAfter(Version.LUCENE_31)) - result = new StandardFilter(result); - return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); - } + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new 
StandardTokenizer(matchVersion, reader); + TokenStream result = new GreekLowerCaseFilter(matchVersion, source); + if (matchVersion.onOrAfter(Version.LUCENE_31)) + result = new StandardFilter(result); + result = new StopFilter(matchVersion, result, stopwords); + if (matchVersion.onOrAfter(Version.LUCENE_31)) + result = new GreekStemFilter(result); + return new TokenStreamComponents(source, result); + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java index 53da0b8d1f8..d93860eb7c9 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java @@ -20,97 +20,115 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.util.Version; /** * Normalizes token text to lower case, removes some Greek diacritics, * and standardizes final sigma to sigma. - * + * + *

You must specify the required {@link Version} + * compatibility when creating GreekLowerCaseFilter: + *

*/ -public final class GreekLowerCaseFilter extends TokenFilter -{ - private TermAttribute termAtt; - - public GreekLowerCaseFilter(TokenStream in) - { - super(in); - termAtt = addAttribute(TermAttribute.class); - } +public final class GreekLowerCaseFilter extends TokenFilter { + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final CharacterUtils charUtils; - @Override - public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - char[] chArray = termAtt.termBuffer(); - int chLen = termAtt.termLength(); - // TODO: iterate codepoints to support supp. characters - for (int i = 0; i < chLen; i++) - { - chArray[i] = (char) lowerCase(chArray[i]); - } - return true; - } else { - return false; - } + /** @deprecated Use {@link #GreekLowerCaseFilter(Version, TokenStream)} instead. */ + @Deprecated + public GreekLowerCaseFilter(TokenStream in) { + this(Version.LUCENE_30, in); + } + + /** + * Create a GreekLowerCaseFilter that normalizes Greek token text. 
+ * + * @param matchVersion Lucene compatibility version, + * See above + * @param in TokenStream to filter + */ + public GreekLowerCaseFilter(Version matchVersion, TokenStream in) { + super(in); + this.charUtils = CharacterUtils.getInstance(matchVersion); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] chArray = termAtt.buffer(); + int chLen = termAtt.length(); + for (int i = 0; i < chLen;) { + i += Character.toChars( + lowerCase(charUtils.codePointAt(chArray, i)), chArray, i); + } + return true; + } else { + return false; } - - private int lowerCase(int codepoint) { - switch(codepoint) { - /* There are two lowercase forms of sigma: - * U+03C2: small final sigma (end of word) - * U+03C3: small sigma (otherwise) - * - * Standardize both to U+03C3 - */ - case '\u03C2': /* small final sigma */ - return '\u03C3'; /* small sigma */ + } + + private int lowerCase(int codepoint) { + switch(codepoint) { + /* There are two lowercase forms of sigma: + * U+03C2: small final sigma (end of word) + * U+03C3: small sigma (otherwise) + * + * Standardize both to U+03C3 + */ + case '\u03C2': /* small final sigma */ + return '\u03C3'; /* small sigma */ - /* Some greek characters contain diacritics. - * This filter removes these, converting to the lowercase base form. - */ + /* Some greek characters contain diacritics. + * This filter removes these, converting to the lowercase base form. 
+ */ - case '\u0386': /* capital alpha with tonos */ - case '\u03AC': /* small alpha with tonos */ - return '\u03B1'; /* small alpha */ - - case '\u0388': /* capital epsilon with tonos */ - case '\u03AD': /* small epsilon with tonos */ - return '\u03B5'; /* small epsilon */ - - case '\u0389': /* capital eta with tonos */ - case '\u03AE': /* small eta with tonos */ - return '\u03B7'; /* small eta */ + case '\u0386': /* capital alpha with tonos */ + case '\u03AC': /* small alpha with tonos */ + return '\u03B1'; /* small alpha */ - case '\u038A': /* capital iota with tonos */ - case '\u03AA': /* capital iota with dialytika */ - case '\u03AF': /* small iota with tonos */ - case '\u03CA': /* small iota with dialytika */ - case '\u0390': /* small iota with dialytika and tonos */ - return '\u03B9'; /* small iota */ - - case '\u038E': /* capital upsilon with tonos */ - case '\u03AB': /* capital upsilon with dialytika */ - case '\u03CD': /* small upsilon with tonos */ - case '\u03CB': /* small upsilon with dialytika */ - case '\u03B0': /* small upsilon with dialytika and tonos */ - return '\u03C5'; /* small upsilon */ - - case '\u038C': /* capital omicron with tonos */ - case '\u03CC': /* small omicron with tonos */ - return '\u03BF'; /* small omicron */ - - case '\u038F': /* capital omega with tonos */ - case '\u03CE': /* small omega with tonos */ - return '\u03C9'; /* small omega */ - - /* The previous implementation did the conversion below. - * Only implemented for backwards compatibility with old indexes. 
- */ - - case '\u03A2': /* reserved */ - return '\u03C2'; /* small final sigma */ - - default: - return Character.toLowerCase(codepoint); - } + case '\u0388': /* capital epsilon with tonos */ + case '\u03AD': /* small epsilon with tonos */ + return '\u03B5'; /* small epsilon */ + + case '\u0389': /* capital eta with tonos */ + case '\u03AE': /* small eta with tonos */ + return '\u03B7'; /* small eta */ + + case '\u038A': /* capital iota with tonos */ + case '\u03AA': /* capital iota with dialytika */ + case '\u03AF': /* small iota with tonos */ + case '\u03CA': /* small iota with dialytika */ + case '\u0390': /* small iota with dialytika and tonos */ + return '\u03B9'; /* small iota */ + + case '\u038E': /* capital upsilon with tonos */ + case '\u03AB': /* capital upsilon with dialytika */ + case '\u03CD': /* small upsilon with tonos */ + case '\u03CB': /* small upsilon with dialytika */ + case '\u03B0': /* small upsilon with dialytika and tonos */ + return '\u03C5'; /* small upsilon */ + + case '\u038C': /* capital omicron with tonos */ + case '\u03CC': /* small omicron with tonos */ + return '\u03BF'; /* small omicron */ + + case '\u038F': /* capital omega with tonos */ + case '\u03CE': /* small omega with tonos */ + return '\u03C9'; /* small omega */ + + /* The previous implementation did the conversion below. + * Only implemented for backwards compatibility with old indexes. 
+ */ + + case '\u03A2': /* reserved */ + return '\u03C2'; /* small final sigma */ + + default: + return Character.toLowerCase(codepoint); } + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilter.java new file mode 100644 index 00000000000..4aef18df85b --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilter.java @@ -0,0 +1,63 @@ +package org.apache.lucene.analysis.el; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.KeywordMarkerFilter; // for javadoc +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * A {@link TokenFilter} that applies {@link GreekStemmer} to stem Greek + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ *

+ * NOTE: Input is expected to be casefolded for Greek (including folding of final + * sigma to sigma), and with diacritics removed. This can be achieved by using + * either {@link GreekLowerCaseFilter} or ICUFoldingFilter before GreekStemFilter. + * @lucene.experimental + */ +public final class GreekStemFilter extends TokenFilter { + private final GreekStemmer stemmer = new GreekStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public GreekStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if(!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java new file mode 100644 index 00000000000..53cef6a919d --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java @@ -0,0 +1,819 @@ +package org.apache.lucene.analysis.el; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.util.Version; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A stemmer for Greek words, according to: Development of a Stemmer for the + * Greek Language. Georgios Ntais + *

+ * NOTE: Input is expected to be casefolded for Greek (including folding of final + * sigma to sigma), and with diacritics removed. This can be achieved with + * either {@link GreekLowerCaseFilter} or ICUFoldingFilter. + * @lucene.experimental + */ +public class GreekStemmer { + public int stem(char s[], int len) { + if (len < 4) // too short + return len; + + final int origLen = len; + // "short rules": if it hits one of these, it skips the "long list" + len = rule0(s, len); + len = rule1(s, len); + len = rule2(s, len); + len = rule3(s, len); + len = rule4(s, len); + len = rule5(s, len); + len = rule6(s, len); + len = rule7(s, len); + len = rule8(s, len); + len = rule9(s, len); + len = rule10(s, len); + len = rule11(s, len); + len = rule12(s, len); + len = rule13(s, len); + len = rule14(s, len); + len = rule15(s, len); + len = rule16(s, len); + len = rule17(s, len); + len = rule18(s, len); + len = rule19(s, len); + len = rule20(s, len); + // "long list" + if (len == origLen) + len = rule21(s, len); + + return rule22(s, len); + } + + private int rule0(char s[], int len) { + if (len > 9 && (endsWith(s, len, "καθεστωτοσ") + || endsWith(s, len, "καθεστωτων"))) + return len - 4; + + if (len > 8 && (endsWith(s, len, "γεγονοτοσ") + || endsWith(s, len, "γεγονοτων"))) + return len - 4; + + if (len > 8 && endsWith(s, len, "καθεστωτα")) + return len - 3; + + if (len > 7 && (endsWith(s, len, "τατογιου") + || endsWith(s, len, "τατογιων"))) + return len - 4; + + if (len > 7 && endsWith(s, len, "γεγονοτα")) + return len - 3; + + if (len > 7 && endsWith(s, len, "καθεστωσ")) + return len - 2; + + if (len > 6 && (endsWith(s, len, "σκαγιου") // length guard applies to the whole group, as in the sibling rules + || endsWith(s, len, "σκαγιων") + || endsWith(s, len, "ολογιου") + || endsWith(s, len, "ολογιων") + || endsWith(s, len, "κρεατοσ") + || endsWith(s, len, "κρεατων") + || endsWith(s, len, "περατοσ") + || endsWith(s, len, "περατων") + || endsWith(s, len, "τερατοσ") + || endsWith(s, len, "τερατων"))) + return len - 4; + + if (len > 6 && 
endsWith(s, len, "τατογια")) + return len - 3; + + if (len > 6 && endsWith(s, len, "γεγονοσ")) + return len - 2; + + if (len > 5 && (endsWith(s, len, "φαγιου") + || endsWith(s, len, "φαγιων") + || endsWith(s, len, "σογιου") + || endsWith(s, len, "σογιων"))) + return len - 4; + + if (len > 5 && (endsWith(s, len, "σκαγια") + || endsWith(s, len, "ολογια") + || endsWith(s, len, "κρεατα") + || endsWith(s, len, "περατα") + || endsWith(s, len, "τερατα"))) + return len - 3; + + if (len > 4 && (endsWith(s, len, "φαγια") + || endsWith(s, len, "σογια") + || endsWith(s, len, "φωτοσ") + || endsWith(s, len, "φωτων"))) + return len - 3; + + if (len > 4 && (endsWith(s, len, "κρεασ") + || endsWith(s, len, "περασ") + || endsWith(s, len, "τερασ"))) + return len - 2; + + if (len > 3 && endsWith(s, len, "φωτα")) + return len - 2; + + if (len > 2 && endsWith(s, len, "φωσ")) + return len - 1; + + return len; + } + + private int rule1(char s[], int len) { + if (len > 4 && (endsWith(s, len, "αδεσ") || endsWith(s, len, "αδων"))) { + len -= 4; + if (!(endsWith(s, len, "οκ") || + endsWith(s, len, "μαμ") || + endsWith(s, len, "μαν") || + endsWith(s, len, "μπαμπ") || + endsWith(s, len, "πατερ") || + endsWith(s, len, "γιαγι") || + endsWith(s, len, "νταντ") || + endsWith(s, len, "κυρ") || + endsWith(s, len, "θει") || + endsWith(s, len, "πεθερ"))) + len += 2; // add back -αδ + } + return len; + } + + private int rule2(char s[], int len) { + if (len > 4 && (endsWith(s, len, "εδεσ") || endsWith(s, len, "εδων"))) { + len -= 4; + if (endsWith(s, len, "οπ") || + endsWith(s, len, "ιπ") || + endsWith(s, len, "εμπ") || + endsWith(s, len, "υπ") || + endsWith(s, len, "γηπ") || + endsWith(s, len, "δαπ") || + endsWith(s, len, "κρασπ") || + endsWith(s, len, "μιλ")) + len += 2; // add back -εδ + } + return len; + } + + private int rule3(char s[], int len) { + if (len > 5 && (endsWith(s, len, "ουδεσ") || endsWith(s, len, "ουδων"))) { + len -= 5; + if (endsWith(s, len, "αρκ") || + endsWith(s, len, "καλιακ") || + 
endsWith(s, len, "πεταλ") || + endsWith(s, len, "λιχ") || + endsWith(s, len, "πλεξ") || + endsWith(s, len, "σκ") || + endsWith(s, len, "σ") || + endsWith(s, len, "φλ") || + endsWith(s, len, "φρ") || + endsWith(s, len, "βελ") || + endsWith(s, len, "λουλ") || + endsWith(s, len, "χν") || + endsWith(s, len, "σπ") || + endsWith(s, len, "τραγ") || + endsWith(s, len, "φε")) + len += 3; // add back -ουδ + } + return len; + } + + private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"), + false); + + private int rule4(char s[], int len) { + if (len > 3 && (endsWith(s, len, "εωσ") || endsWith(s, len, "εων"))) { + len -= 3; + if (exc4.contains(s, 0, len)) + len++; // add back -ε + } + return len; + } + + private int rule5(char s[], int len) { + if (len > 2 && endsWith(s, len, "ια")) { + len -= 2; + if (endsWithVowel(s, len)) + len++; // add back -ι + } else if (len > 3 && (endsWith(s, len, "ιου") || endsWith(s, len, "ιων"))) { + len -= 3; + if (endsWithVowel(s, len)) + len++; // add back -ι + } + return len; + } + + private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ", + "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ", + "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ", + "πετσ", "πιτσ", "πικαντ", "πλιατσ", "ποστελν", "πρωτοδ", "σερτ", + "συναδ", "τσαμ", "υποδ", "φιλον", "φυλοδ", "χασ"), + false); + + private int rule6(char s[], int len) { + boolean removed = false; + if (len > 3 && (endsWith(s, len, "ικα") || endsWith(s, len, "ικο"))) { + len -= 3; + removed = true; + } else if (len > 4 && (endsWith(s, len, "ικου") || endsWith(s, len, "ικων"))) { + len -= 4; + removed = true; + } + + if (removed) { + if (endsWithVowel(s, len) || exc6.contains(s, 0, len)) + len += 2; // add back -ικ + } + return len; + } + + private static final CharArraySet exc7 = new 
CharArraySet(Version.LUCENE_31, + Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ", + "πεθ", "πικρ", "ποτ", "σιχ", "χ"), + false); + + private int rule7(char s[], int len) { + if (len == 5 && endsWith(s, len, "αγαμε")) + return len - 1; + + if (len > 7 && endsWith(s, len, "ηθηκαμε")) + len -= 7; + else if (len > 6 && endsWith(s, len, "ουσαμε")) + len -= 6; + else if (len > 5 && (endsWith(s, len, "αγαμε") || + endsWith(s, len, "ησαμε") || + endsWith(s, len, "ηκαμε"))) + len -= 5; + + if (len > 3 && endsWith(s, len, "αμε")) { + len -= 3; + if (exc7.contains(s, 0, len)) + len += 2; // add back -αμ + } + + return len; + } + + private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31, + Arrays.asList("τρ", "τσ"), + false); + + private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31, + Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ", + "καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ", + "π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ", + "τσαρλατ", "ορφ", "τσιγγ", "τσοπ", "φωτοστεφ", "χ", "ψυχοπλ", "αγ", + "ορφ", "γαλ", "γερ", "δεκ", "διπλ", "αμερικαν", "ουρ", "πιθ", + "πουριτ", "σ", "ζωντ", "ικ", "καστ", "κοπ", "λιχ", "λουθηρ", "μαιντ", + "μελ", "σιγ", "σπ", "στεγ", "τραγ", "τσαγ", "φ", "ερ", "αδαπ", + "αθιγγ", "αμηχ", "ανικ", "ανοργ", "απηγ", "απιθ", "ατσιγγ", "βασ", + "βασκ", "βαθυγαλ", "βιομηχ", "βραχυκ", "διατ", "διαφ", "ενοργ", + "θυσ", "καπνοβιομηχ", "καταγαλ", "κλιβ", "κοιλαρφ", "λιβ", + "μεγλοβιομηχ", "μικροβιομηχ", "νταβ", "ξηροκλιβ", "ολιγοδαμ", + "ολογαλ", "πενταρφ", "περηφ", "περιτρ", "πλατ", "πολυδαπ", "πολυμηχ", + "στεφ", "ταβ", "τετ", "υπερηφ", "υποκοπ", "χαμηλοδαπ", "ψηλοταβ"), + false); + + private int rule8(char s[], int len) { + boolean removed = false; + + if (len > 8 && endsWith(s, len, "ιουντανε")) { + len -= 8; + removed = true; + } else if (len > 7 && (endsWith(s, len, "ιοντανε") || // && binds tighter than ||, so the alternatives must be grouped + endsWith(s, len, "ουντανε") || + endsWith(s, len, 
"ηθηκανε"))) { + len -= 7; + removed = true; + } else if (len > 6 && (endsWith(s, len, "ιοτανε") || + endsWith(s, len, "οντανε") || + endsWith(s, len, "ουσανε"))) { + len -= 6; + removed = true; + } else if (len > 5 && (endsWith(s, len, "αγανε") || + endsWith(s, len, "ησανε") || + endsWith(s, len, "οτανε") || + endsWith(s, len, "ηκανε"))) { + len -= 5; + removed = true; + } + + if (removed && exc8a.contains(s, 0, len)) { + // add -αγαν (we removed > 4 chars so it's safe) + len += 4; + s[len - 4] = 'α'; + s[len - 3] = 'γ'; + s[len - 2] = 'α'; + s[len - 1] = 'ν'; + } + + if (len > 3 && endsWith(s, len, "ανε")) { + len -= 3; + if (endsWithVowelNoY(s, len) || exc8b.contains(s, 0, len)) { + len += 2; // add back -αν + } + } + + return len; + } + + private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ", + "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ", + "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"), + false); + + private int rule9(char s[], int len) { + if (len > 5 && endsWith(s, len, "ησετε")) + len -= 5; + + if (len > 3 && endsWith(s, len, "ετε")) { + len -= 3; + if (exc9.contains(s, 0, len) || + endsWithVowelNoY(s, len) || + endsWith(s, len, "οδ") || + endsWith(s, len, "αιρ") || + endsWith(s, len, "φορ") || + endsWith(s, len, "ταθ") || + endsWith(s, len, "διαθ") || + endsWith(s, len, "σχ") || + endsWith(s, len, "ενδ") || + endsWith(s, len, "ευρ") || + endsWith(s, len, "τιθ") || + endsWith(s, len, "υπερθ") || + endsWith(s, len, "ραθ") || + endsWith(s, len, "ενθ") || + endsWith(s, len, "ροθ") || + endsWith(s, len, "σθ") || + endsWith(s, len, "πυρ") || + endsWith(s, len, "αιν") || + endsWith(s, len, "συνδ") || + endsWith(s, len, "συν") || + endsWith(s, len, "συνθ") || + endsWith(s, len, "χωρ") || + endsWith(s, len, "πον") || + endsWith(s, len, "βρ") || + endsWith(s, len, "καθ") || + endsWith(s, len, "ευθ") || + endsWith(s, len, "εκθ") || + endsWith(s, len, 
"νετ") || + endsWith(s, len, "ρον") || + endsWith(s, len, "αρκ") || + endsWith(s, len, "βαρ") || + endsWith(s, len, "βολ") || + endsWith(s, len, "ωφελ")) { + len += 2; // add back -ετ + } + } + + return len; + } + + private int rule10(char s[], int len) { + if (len > 5 && (endsWith(s, len, "οντασ") || endsWith(s, len, "ωντασ"))) { + len -= 5; + if (len == 3 && endsWith(s, len, "αρχ")) { + len += 3; // add back *ντ + s[len - 3] = 'ο'; + } + if (endsWith(s, len, "κρε")) { + len += 3; // add back *ντ + s[len - 3] = 'ω'; + } + } + + return len; + } + + private int rule11(char s[], int len) { + if (len > 7 && endsWith(s, len, "ιομαστε")) { // longer suffix first: every -ιομαστε word also ends in -ομαστε + len -= 7; + if (len == 2 && endsWith(s, len, "ον")) { + len += 5; // the buffer holds -ιομαστε here, so write -ομαστ explicitly + s[len - 5] = 'ο'; + s[len - 4] = 'μ'; + s[len - 3] = 'α'; + s[len - 2] = 'σ'; + s[len - 1] = 'τ'; + } + } else if (len > 6 && endsWith(s, len, "ομαστε")) { + len -= 6; + if (len == 2 && endsWith(s, len, "ον")) { + len += 5; // add back -ομαστ + } + } + return len; + } + + private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31, + Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"), + false); + + private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"), + false); + + private int rule12(char s[], int len) { + if (len > 5 && endsWith(s, len, "ιεστε")) { + len -= 5; + if (exc12a.contains(s, 0, len)) + len += 4; // add back -ιεστ + } + + if (len > 4 && endsWith(s, len, "εστε")) { + len -= 4; + if (exc12b.contains(s, 0, len)) + len += 3; // add back -εστ + } + + return len; + } + + private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"), + false); + + private int rule13(char s[], int len) { + if (len > 6 && endsWith(s, len, "ηθηκεσ")) { + len -= 6; + } else if (len > 5 && (endsWith(s, len, "ηθηκα") || endsWith(s, len, "ηθηκε"))) { + len -= 5; + } + + 
boolean removed = false; + + if (len > 4 && endsWith(s, len, "ηκεσ")) { + len -= 4; + removed = true; + } else if (len > 3 && (endsWith(s, len, "ηκα") || endsWith(s, len, "ηκε"))) { + len -= 3; + removed = true; + } + + if (removed && (exc13.contains(s, 0, len) + || endsWith(s, len, "σκωλ") + || endsWith(s, len, "σκουλ") + || endsWith(s, len, "ναρθ") + || endsWith(s, len, "σφ") + || endsWith(s, len, "οθ") + || endsWith(s, len, "πιθ"))) { + len += 2; // add back the -ηκ + } + + return len; + } + + private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ", + "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ", + "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε", + "τσα"), + false); + + private int rule14(char s[], int len) { + boolean removed = false; + + if (len > 5 && endsWith(s, len, "ουσεσ")) { + len -= 5; + removed = true; + } else if (len > 4 && (endsWith(s, len, "ουσα") || endsWith(s, len, "ουσε"))) { + len -= 4; + removed = true; + } + + if (removed && (exc14.contains(s, 0, len) + || endsWithVowel(s, len) + || endsWith(s, len, "ποδαρ") + || endsWith(s, len, "βλεπ") + || endsWith(s, len, "πανταχ") + || endsWith(s, len, "φρυδ") + || endsWith(s, len, "μαντιλ") + || endsWith(s, len, "μαλλ") + || endsWith(s, len, "κυματ") + || endsWith(s, len, "λαχ") + || endsWith(s, len, "ληγ") + || endsWith(s, len, "φαγ") + || endsWith(s, len, "ομ") + || endsWith(s, len, "πρωτ"))) { + len += 3; // add back -ουσ + } + + return len; + } + + private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ", + "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ", + "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ", + "συντ", "τ", "υποτ", "χαρ", "αειπ", "αιμοστ", "ανυπ", "αποτ", + "αρτιπ", "διατ", "εν", "επιτ", "κροκαλοπ", "σιδηροπ", "λ", "ναυ", 
+ "ουλαμ", "ουρ", "π", "τρ", "μ"), + false); + + private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ψοφ", "ναυλοχ"), + false); + + private int rule15(char s[], int len) { + boolean removed = false; + if (len > 4 && endsWith(s, len, "αγεσ")) { + len -= 4; + removed = true; + } else if (len > 3 && (endsWith(s, len, "αγα") || endsWith(s, len, "αγε"))) { + len -= 3; + removed = true; + } + + if (removed) { + final boolean cond1 = exc15a.contains(s, 0, len) + || endsWith(s, len, "οφ") + || endsWith(s, len, "πελ") + || endsWith(s, len, "χορτ") + || endsWith(s, len, "λλ") + || endsWith(s, len, "σφ") + || endsWith(s, len, "ρπ") + || endsWith(s, len, "φρ") + || endsWith(s, len, "πρ") + || endsWith(s, len, "λοχ") + || endsWith(s, len, "σμην"); + + final boolean cond2 = exc15b.contains(s, 0, len) + || endsWith(s, len, "κολλ"); + + if (cond1 && !cond2) + len += 2; // add back -αγ + } + + return len; + } + + private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"), + false); + + private int rule16(char s[], int len) { + boolean removed = false; + if (len > 4 && endsWith(s, len, "ησου")) { + len -= 4; + removed = true; + } else if (len > 3 && (endsWith(s, len, "ησε") || endsWith(s, len, "ησα"))) { + len -= 3; + removed = true; + } + + if (removed && exc16.contains(s, 0, len)) + len += 2; // add back -ησ + + return len; + } + + private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"), + false); + + private int rule17(char s[], int len) { + if (len > 4 && endsWith(s, len, "ηστε")) { + len -= 4; + if (exc17.contains(s, 0, len)) + len += 3; // add back the -ηστ + } + + return len; + } + + private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", 
"εξων"), + false); + + private int rule18(char s[], int len) { + boolean removed = false; + + if (len > 6 && (endsWith(s, len, "ησουνε") || endsWith(s, len, "ηθουνε"))) { + len -= 6; + removed = true; + } else if (len > 4 && endsWith(s, len, "ουνε")) { + len -= 4; + removed = true; + } + + if (removed && exc18.contains(s, 0, len)) { + len += 3; + s[len - 3] = 'ο'; + s[len - 2] = 'υ'; + s[len - 1] = 'ν'; + } + return len; + } + + private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"), + false); + + private int rule19(char s[], int len) { + boolean removed = false; + + if (len > 6 && (endsWith(s, len, "ησουμε") || endsWith(s, len, "ηθουμε"))) { + len -= 6; + removed = true; + } else if (len > 4 && endsWith(s, len, "ουμε")) { + len -= 4; + removed = true; + } + + if (removed && exc19.contains(s, 0, len)) { + len += 3; + s[len - 3] = 'ο'; + s[len - 2] = 'υ'; + s[len - 1] = 'μ'; + } + return len; + } + + private int rule20(char s[], int len) { + if (len > 5 && (endsWith(s, len, "ματων") || endsWith(s, len, "ματοσ"))) + len -= 3; + else if (len > 4 && endsWith(s, len, "ματα")) + len -= 2; + return len; + } + + private int rule21(char s[], int len) { + if (len > 9 && endsWith(s, len, "ιοντουσαν")) + return len - 9; + + if (len > 8 && (endsWith(s, len, "ιομασταν") || + endsWith(s, len, "ιοσασταν") || + endsWith(s, len, "ιουμαστε") || + endsWith(s, len, "οντουσαν"))) + return len - 8; + + if (len > 7 && (endsWith(s, len, "ιεμαστε") || + endsWith(s, len, "ιεσαστε") || + endsWith(s, len, "ιομουνα") || + endsWith(s, len, "ιοσαστε") || + endsWith(s, len, "ιοσουνα") || + endsWith(s, len, "ιουνται") || + endsWith(s, len, "ιουνταν") || + endsWith(s, len, "ηθηκατε") || + endsWith(s, len, "ομασταν") || + endsWith(s, len, "οσασταν") || + endsWith(s, len, "ουμαστε"))) + return len - 7; + + if (len > 6 && (endsWith(s, len, "ιομουν") || + endsWith(s, len, "ιονταν") || + endsWith(s, len, 
"ιοσουν") || + endsWith(s, len, "ηθειτε") || + endsWith(s, len, "ηθηκαν") || + endsWith(s, len, "ομουνα") || + endsWith(s, len, "οσαστε") || + endsWith(s, len, "οσουνα") || + endsWith(s, len, "ουνται") || + endsWith(s, len, "ουνταν") || + endsWith(s, len, "ουσατε"))) + return len - 6; + + if (len > 5 && (endsWith(s, len, "αγατε") || + endsWith(s, len, "ιεμαι") || + endsWith(s, len, "ιεται") || + endsWith(s, len, "ιεσαι") || + endsWith(s, len, "ιοταν") || + endsWith(s, len, "ιουμα") || + endsWith(s, len, "ηθεισ") || + endsWith(s, len, "ηθουν") || + endsWith(s, len, "ηκατε") || + endsWith(s, len, "ησατε") || + endsWith(s, len, "ησουν") || + endsWith(s, len, "ομουν") || + endsWith(s, len, "ονται") || + endsWith(s, len, "ονταν") || + endsWith(s, len, "οσουν") || + endsWith(s, len, "ουμαι") || + endsWith(s, len, "ουσαν"))) + return len - 5; + + if (len > 4 && (endsWith(s, len, "αγαν") || + endsWith(s, len, "αμαι") || + endsWith(s, len, "ασαι") || + endsWith(s, len, "αται") || + endsWith(s, len, "ειτε") || + endsWith(s, len, "εσαι") || + endsWith(s, len, "εται") || + endsWith(s, len, "ηδεσ") || + endsWith(s, len, "ηδων") || + endsWith(s, len, "ηθει") || + endsWith(s, len, "ηκαν") || + endsWith(s, len, "ησαν") || + endsWith(s, len, "ησει") || + endsWith(s, len, "ησεσ") || + endsWith(s, len, "ομαι") || + endsWith(s, len, "οταν"))) + return len - 4; + + if (len > 3 && (endsWith(s, len, "αει") || + endsWith(s, len, "εισ") || + endsWith(s, len, "ηθω") || + endsWith(s, len, "ησω") || + endsWith(s, len, "ουν") || + endsWith(s, len, "ουσ"))) + return len - 3; + + if (len > 2 && (endsWith(s, len, "αν") || + endsWith(s, len, "ασ") || + endsWith(s, len, "αω") || + endsWith(s, len, "ει") || + endsWith(s, len, "εσ") || + endsWith(s, len, "ησ") || + endsWith(s, len, "οι") || + endsWith(s, len, "οσ") || + endsWith(s, len, "ου") || + endsWith(s, len, "υσ") || + endsWith(s, len, "ων"))) + return len - 2; + + if (len > 1 && endsWithVowel(s, len)) + return len - 1; + + return len; + } + + 
private int rule22(char s[], int len) { + if (endsWith(s, len, "εστερ") || + endsWith(s, len, "εστατ")) + return len - 5; + + if (endsWith(s, len, "οτερ") || + endsWith(s, len, "οτατ") || + endsWith(s, len, "υτερ") || + endsWith(s, len, "υτατ") || + endsWith(s, len, "ωτερ") || + endsWith(s, len, "ωτατ")) + return len - 4; + + return len; + } + + private boolean endsWith(char s[], int len, String suffix) { + final int suffixLen = suffix.length(); + if (suffixLen > len) + return false; + for (int i = suffixLen - 1; i >= 0; i--) + if (s[len -(suffixLen - i)] != suffix.charAt(i)) + return false; + + return true; + } + + private boolean endsWithVowel(char s[], int len) { + if (len == 0) + return false; + switch(s[len - 1]) { + case 'α': + case 'ε': + case 'η': + case 'ι': + case 'ο': + case 'υ': + case 'ω': + return true; + default: + return false; + } + } + + private boolean endsWithVowelNoY(char s[], int len) { + if (len == 0) + return false; + switch(s[len - 1]) { + case 'α': + case 'ε': + case 'η': + case 'ι': + case 'ο': + case 'ω': + return true; + default: + return false; + } + } +} diff --git a/modules/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt b/modules/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt new file mode 100644 index 00000000000..1a08d318326 --- /dev/null +++ b/modules/analysis/common/src/resources/org/apache/lucene/analysis/el/stopwords.txt @@ -0,0 +1,76 @@ +# Lucene Greek Stopwords list +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git 
a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java index 6f87c1fa5e1..69dbf0105b3 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java @@ -26,42 +26,67 @@ import org.apache.lucene.util.Version; */ public class GreekAnalyzerTest extends BaseTokenStreamTestCase { + /** + * Test the analysis of various greek strings. + * + * @throws Exception in case an error occurs + */ + public void testAnalyzer() throws Exception { + Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT); + // Verify the correct analysis of capitals and small accented letters, and + // stemming + assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας", + new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ", + "ελληνικ", "γλωσσ" }); + // Verify the correct analysis of small letters with diaeresis and the elimination + // of punctuation marks + assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ", + new String[] { "προιοντ", "πολλαπλ", "αναγκ" }); + // Verify the correct analysis of capital accented letters and capital letters with diaeresis, + // as well as the elimination of stop words + assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι", + new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" }); + } + /** * Test the analysis of various greek strings. 
* * @throws Exception in case an error occurs + * @deprecated Remove this test when support for 3.0 is no longer needed */ - public void testAnalyzer() throws Exception { - Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT); + @Deprecated + public void testAnalyzerBWCompat() throws Exception { + Analyzer a = new GreekAnalyzer(Version.LUCENE_30); // Verify the correct analysis of capitals and small accented letters - assertAnalyzesTo(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2", - new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd", - "\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" }); + assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας", + new String[] { "μια", "εξαιρετικα", "καλη", "πλουσια", "σειρα", "χαρακτηρων", + "ελληνικησ", "γλωσσασ" }); // Verify the correct analysis of small letters with diaeresis and the elimination // of punctuation marks - assertAnalyzesTo(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3", - new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" }); - // Verify the correct analysis of capital accented letters and capitalletters with diaeresis, + 
assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ", + new String[] { "προιοντα", "πολλαπλεσ", "αναγκεσ" }); + // Verify the correct analysis of capital accented letters and capital letters with diaeresis, // as well as the elimination of stop words - assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9", - new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" }); + assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι", + new String[] { "προυποθεσεισ", "αψογοσ", "μεστοσ", "αλλοι" }); } - public void testReusableTokenStream() throws Exception { - Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT); - // Verify the correct analysis of capitals and small accented letters - assertAnalyzesToReuse(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2", - new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd", - "\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" }); - // Verify the correct analysis of small letters with diaeresis and the elimination - // of punctuation marks - assertAnalyzesToReuse(a, 
"\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3", - new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" }); - // Verify the correct analysis of capital accented letters and capitalletters with diaeresis, - // as well as the elimination of stop words - assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9", - new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" }); - } + public void testReusableTokenStream() throws Exception { + Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT); + // Verify the correct analysis of capitals and small accented letters, and + // stemming + assertAnalyzesToReuse(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας", + new String[] { "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ", + "ελληνικ", "γλωσσ" }); + // Verify the correct analysis of small letters with diaeresis and the elimination + // of punctuation marks + assertAnalyzesToReuse(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ", + new String[] { "προιοντ", "πολλαπλ", "αναγκ" }); + // Verify the correct analysis of capital accented letters and capital letters with diaeresis, + // as well as the elimination of stop words + assertAnalyzesToReuse(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι", + new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" }); + } /** * Greek Analyzer didn't call standardFilter, so no normalization of acronyms. 
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java new file mode 100644 index 00000000000..1b95c29b31a --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemmer.java @@ -0,0 +1,508 @@ +package org.apache.lucene.analysis.el; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; + +public class TestGreekStemmer extends BaseTokenStreamTestCase { + Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT); + + public void testMasculineNouns() throws Exception { + // -ος + checkOneTerm(a, "άνθρωπος", "ανθρωπ"); + checkOneTerm(a, "ανθρώπου", "ανθρωπ"); + checkOneTerm(a, "άνθρωπο", "ανθρωπ"); + checkOneTerm(a, "άνθρωπε", "ανθρωπ"); + checkOneTerm(a, "άνθρωποι", "ανθρωπ"); + checkOneTerm(a, "ανθρώπων", "ανθρωπ"); + checkOneTerm(a, "ανθρώπους", "ανθρωπ"); + checkOneTerm(a, "άνθρωποι", "ανθρωπ"); + + // -ης + checkOneTerm(a, "πελάτης", "πελατ"); + checkOneTerm(a, "πελάτη", "πελατ"); + checkOneTerm(a, "πελάτες", "πελατ"); + checkOneTerm(a, "πελατών", "πελατ"); + + // -ας/-ες + checkOneTerm(a, "ελέφαντας", "ελεφαντ"); + checkOneTerm(a, "ελέφαντα", "ελεφαντ"); + checkOneTerm(a, "ελέφαντες", "ελεφαντ"); + checkOneTerm(a, "ελεφάντων", "ελεφαντ"); + + // -ας/-αδες + checkOneTerm(a, "μπαμπάς", "μπαμπ"); + checkOneTerm(a, "μπαμπά", "μπαμπ"); + checkOneTerm(a, "μπαμπάδες", "μπαμπ"); + checkOneTerm(a, "μπαμπάδων", "μπαμπ"); + + // -ης/-ηδες + checkOneTerm(a, "μπακάλης", "μπακαλ"); + checkOneTerm(a, "μπακάλη", "μπακαλ"); + checkOneTerm(a, "μπακάληδες", "μπακαλ"); + checkOneTerm(a, "μπακάληδων", "μπακαλ"); + + // -ες + checkOneTerm(a, "καφές", "καφ"); + checkOneTerm(a, "καφέ", "καφ"); + checkOneTerm(a, "καφέδες", "καφ"); + checkOneTerm(a, "καφέδων", "καφ"); + + // -έας/είς + checkOneTerm(a, "γραμματέας", "γραμματε"); + checkOneTerm(a, "γραμματέα", "γραμματε"); + // 
plural forms conflate w/ each other, not w/ the sing forms + checkOneTerm(a, "γραμματείς", "γραμματ"); + checkOneTerm(a, "γραμματέων", "γραμματ"); + + // -ους/οι + checkOneTerm(a, "απόπλους", "αποπλ"); + checkOneTerm(a, "απόπλου", "αποπλ"); + checkOneTerm(a, "απόπλοι", "αποπλ"); + checkOneTerm(a, "απόπλων", "αποπλ"); + + // -ους/-ουδες + checkOneTerm(a, "παππούς", "παππ"); + checkOneTerm(a, "παππού", "παππ"); + checkOneTerm(a, "παππούδες", "παππ"); + checkOneTerm(a, "παππούδων", "παππ"); + + // -ης/-εις + checkOneTerm(a, "λάτρης", "λατρ"); + checkOneTerm(a, "λάτρη", "λατρ"); + checkOneTerm(a, "λάτρεις", "λατρ"); + checkOneTerm(a, "λάτρεων", "λατρ"); + + // -υς + checkOneTerm(a, "πέλεκυς", "πελεκ"); + checkOneTerm(a, "πέλεκυ", "πελεκ"); + checkOneTerm(a, "πελέκεις", "πελεκ"); + checkOneTerm(a, "πελέκεων", "πελεκ"); + + // -ωρ + // note: nom./voc. doesn't conflate w/ the rest + checkOneTerm(a, "μέντωρ", "μεντωρ"); + checkOneTerm(a, "μέντορος", "μεντορ"); + checkOneTerm(a, "μέντορα", "μεντορ"); + checkOneTerm(a, "μέντορες", "μεντορ"); + checkOneTerm(a, "μεντόρων", "μεντορ"); + + // -ων + checkOneTerm(a, "αγώνας", "αγων"); + checkOneTerm(a, "αγώνος", "αγων"); + checkOneTerm(a, "αγώνα", "αγων"); + checkOneTerm(a, "αγώνα", "αγων"); + checkOneTerm(a, "αγώνες", "αγων"); + checkOneTerm(a, "αγώνων", "αγων"); + + // -ας/-ηδες + checkOneTerm(a, "αέρας", "αερ"); + checkOneTerm(a, "αέρα", "αερ"); + checkOneTerm(a, "αέρηδες", "αερ"); + checkOneTerm(a, "αέρηδων", "αερ"); + + // -ης/-ητες + checkOneTerm(a, "γόης", "γο"); + checkOneTerm(a, "γόη", "γοη"); // too short + // the two plural forms conflate + checkOneTerm(a, "γόητες", "γοητ"); + checkOneTerm(a, "γοήτων", "γοητ"); + } + + public void testFeminineNouns() throws Exception { + // -α/-ες,-ών + checkOneTerm(a, "φορά", "φορ"); + checkOneTerm(a, "φοράς", "φορ"); + checkOneTerm(a, "φορές", "φορ"); + checkOneTerm(a, "φορών", "φορ"); + + // -α/-ες,-ων + checkOneTerm(a, "αγελάδα", "αγελαδ"); + checkOneTerm(a, "αγελάδας", "αγελαδ"); + 
checkOneTerm(a, "αγελάδες", "αγελαδ"); + checkOneTerm(a, "αγελάδων", "αγελαδ"); + + // -η/-ες + checkOneTerm(a, "ζάχαρη", "ζαχαρ"); + checkOneTerm(a, "ζάχαρης", "ζαχαρ"); + checkOneTerm(a, "ζάχαρες", "ζαχαρ"); + checkOneTerm(a, "ζαχάρεων", "ζαχαρ"); + + // -η/-εις + checkOneTerm(a, "τηλεόραση", "τηλεορασ"); + checkOneTerm(a, "τηλεόρασης", "τηλεορασ"); + checkOneTerm(a, "τηλεοράσεις", "τηλεορασ"); + checkOneTerm(a, "τηλεοράσεων", "τηλεορασ"); + + // -α/-αδες + checkOneTerm(a, "μαμά", "μαμ"); + checkOneTerm(a, "μαμάς", "μαμ"); + checkOneTerm(a, "μαμάδες", "μαμ"); + checkOneTerm(a, "μαμάδων", "μαμ"); + + // -ος + checkOneTerm(a, "λεωφόρος", "λεωφορ"); + checkOneTerm(a, "λεωφόρου", "λεωφορ"); + checkOneTerm(a, "λεωφόρο", "λεωφορ"); + checkOneTerm(a, "λεωφόρε", "λεωφορ"); + checkOneTerm(a, "λεωφόροι", "λεωφορ"); + checkOneTerm(a, "λεωφόρων", "λεωφορ"); + checkOneTerm(a, "λεωφόρους", "λεωφορ"); + + // -ου + checkOneTerm(a, "αλεπού", "αλεπ"); + checkOneTerm(a, "αλεπούς", "αλεπ"); + checkOneTerm(a, "αλεπούδες", "αλεπ"); + checkOneTerm(a, "αλεπούδων", "αλεπ"); + + // -έας/είς + // note: not all forms conflate + checkOneTerm(a, "γραμματέας", "γραμματε"); + checkOneTerm(a, "γραμματέως", "γραμματ"); + checkOneTerm(a, "γραμματέα", "γραμματε"); + checkOneTerm(a, "γραμματείς", "γραμματ"); + checkOneTerm(a, "γραμματέων", "γραμματ"); + } + + public void testNeuterNouns() throws Exception { + // ending with -ο + // note: nom doesnt conflate + checkOneTerm(a, "βιβλίο", "βιβλι"); + checkOneTerm(a, "βιβλίου", "βιβλ"); + checkOneTerm(a, "βιβλία", "βιβλ"); + checkOneTerm(a, "βιβλίων", "βιβλ"); + + // ending with -ι + checkOneTerm(a, "πουλί", "πουλ"); + checkOneTerm(a, "πουλιού", "πουλ"); + checkOneTerm(a, "πουλιά", "πουλ"); + checkOneTerm(a, "πουλιών", "πουλ"); + + // ending with -α + // note: nom. 
doesnt conflate + checkOneTerm(a, "πρόβλημα", "προβλημ"); + checkOneTerm(a, "προβλήματος", "προβλημα"); + checkOneTerm(a, "προβλήματα", "προβλημα"); + checkOneTerm(a, "προβλημάτων", "προβλημα"); + + // ending with -ος/-ους + checkOneTerm(a, "πέλαγος", "πελαγ"); + checkOneTerm(a, "πελάγους", "πελαγ"); + checkOneTerm(a, "πελάγη", "πελαγ"); + checkOneTerm(a, "πελάγων", "πελαγ"); + + // ending with -ός/-ότος + checkOneTerm(a, "γεγονός", "γεγον"); + checkOneTerm(a, "γεγονότος", "γεγον"); + checkOneTerm(a, "γεγονότα", "γεγον"); + checkOneTerm(a, "γεγονότων", "γεγον"); + + // ending with -υ/-ιου + checkOneTerm(a, "βράδυ", "βραδ"); + checkOneTerm(a, "βράδι", "βραδ"); + checkOneTerm(a, "βραδιού", "βραδ"); + checkOneTerm(a, "βράδια", "βραδ"); + checkOneTerm(a, "βραδιών", "βραδ"); + + // ending with -υ/-ατος + // note: nom. doesnt conflate + checkOneTerm(a, "δόρυ", "δορ"); + checkOneTerm(a, "δόρατος", "δορατ"); + checkOneTerm(a, "δόρατα", "δορατ"); + checkOneTerm(a, "δοράτων", "δορατ"); + + // ending with -ας + checkOneTerm(a, "κρέας", "κρε"); + checkOneTerm(a, "κρέατος", "κρε"); + checkOneTerm(a, "κρέατα", "κρε"); + checkOneTerm(a, "κρεάτων", "κρε"); + + // ending with -ως + checkOneTerm(a, "λυκόφως", "λυκοφω"); + checkOneTerm(a, "λυκόφωτος", "λυκοφω"); + checkOneTerm(a, "λυκόφωτα", "λυκοφω"); + checkOneTerm(a, "λυκοφώτων", "λυκοφω"); + + // ending with -ον/-ου + // note: nom. doesnt conflate + checkOneTerm(a, "μέσον", "μεσον"); + checkOneTerm(a, "μέσου", "μεσ"); + checkOneTerm(a, "μέσα", "μεσ"); + checkOneTerm(a, "μέσων", "μεσ"); + + // ending in -ον/-οντος + // note: nom. 
doesnt conflate + checkOneTerm(a, "ενδιαφέρον", "ενδιαφερον"); + checkOneTerm(a, "ενδιαφέροντος", "ενδιαφεροντ"); + checkOneTerm(a, "ενδιαφέροντα", "ενδιαφεροντ"); + checkOneTerm(a, "ενδιαφερόντων", "ενδιαφεροντ"); + + // ending with -εν/-εντος + checkOneTerm(a, "ανακοινωθέν", "ανακοινωθεν"); + checkOneTerm(a, "ανακοινωθέντος", "ανακοινωθεντ"); + checkOneTerm(a, "ανακοινωθέντα", "ανακοινωθεντ"); + checkOneTerm(a, "ανακοινωθέντων", "ανακοινωθεντ"); + + // ending with -αν/-αντος + checkOneTerm(a, "σύμπαν", "συμπ"); + checkOneTerm(a, "σύμπαντος", "συμπαντ"); + checkOneTerm(a, "σύμπαντα", "συμπαντ"); + checkOneTerm(a, "συμπάντων", "συμπαντ"); + + // ending with -α/-ακτος + checkOneTerm(a, "γάλα", "γαλ"); + checkOneTerm(a, "γάλακτος", "γαλακτ"); + checkOneTerm(a, "γάλατα", "γαλατ"); + checkOneTerm(a, "γαλάκτων", "γαλακτ"); + } + + public void testAdjectives() throws Exception { + // ending with -ής, -ές/-είς, -ή + checkOneTerm(a, "συνεχής", "συνεχ"); + checkOneTerm(a, "συνεχούς", "συνεχ"); + checkOneTerm(a, "συνεχή", "συνεχ"); + checkOneTerm(a, "συνεχών", "συνεχ"); + checkOneTerm(a, "συνεχείς", "συνεχ"); + checkOneTerm(a, "συνεχές", "συνεχ"); + + // ending with -ης, -ες/-εις, -η + checkOneTerm(a, "συνήθης", "συνηθ"); + checkOneTerm(a, "συνήθους", "συνηθ"); + checkOneTerm(a, "συνήθη", "συνηθ"); + // note: doesn't conflate + checkOneTerm(a, "συνήθεις", "συν"); + checkOneTerm(a, "συνήθων", "συνηθ"); + checkOneTerm(a, "σύνηθες", "συνηθ"); + + // ending with -υς, -υ/-εις, -ια + checkOneTerm(a, "βαθύς", "βαθ"); + checkOneTerm(a, "βαθέος", "βαθε"); + checkOneTerm(a, "βαθύ", "βαθ"); + checkOneTerm(a, "βαθείς", "βαθ"); + checkOneTerm(a, "βαθέων", "βαθ"); + + checkOneTerm(a, "βαθιά", "βαθ"); + checkOneTerm(a, "βαθιάς", "βαθι"); + checkOneTerm(a, "βαθιές", "βαθι"); + checkOneTerm(a, "βαθιών", "βαθ"); + + checkOneTerm(a, "βαθέα", "βαθε"); + + // comparative/superlative + checkOneTerm(a, "ψηλός", "ψηλ"); + checkOneTerm(a, "ψηλότερος", "ψηλ"); + checkOneTerm(a, "ψηλότατος", "ψηλ"); + 
+ checkOneTerm(a, "ωραίος", "ωραι"); + checkOneTerm(a, "ωραιότερος", "ωραι"); + checkOneTerm(a, "ωραιότατος", "ωραι"); + + checkOneTerm(a, "επιεικής", "επιεικ"); + checkOneTerm(a, "επιεικέστερος", "επιεικ"); + checkOneTerm(a, "επιεικέστατος", "επιεικ"); + } + + + public void testVerbs() throws Exception { + // note, past/present verb stems will not conflate (from the paper) + //-ω,-α/-.ω,-.α + checkOneTerm(a, "ορίζω", "οριζ"); + checkOneTerm(a, "όριζα", "οριζ"); + checkOneTerm(a, "όριζε", "οριζ"); + checkOneTerm(a, "ορίζοντας", "οριζ"); + checkOneTerm(a, "ορίζομαι", "οριζ"); + checkOneTerm(a, "οριζόμουν", "οριζ"); + checkOneTerm(a, "ορίζεσαι", "οριζ"); + + checkOneTerm(a, "όρισα", "ορισ"); + checkOneTerm(a, "ορίσω", "ορισ"); + checkOneTerm(a, "όρισε", "ορισ"); + checkOneTerm(a, "ορίσει", "ορισ"); + + checkOneTerm(a, "ορίστηκα", "οριστ"); + checkOneTerm(a, "οριστώ", "οριστ"); + checkOneTerm(a, "οριστείς", "οριστ"); + checkOneTerm(a, "οριστεί", "οριστ"); + + checkOneTerm(a, "ορισμένο", "ορισμεν"); + checkOneTerm(a, "ορισμένη", "ορισμεν"); + checkOneTerm(a, "ορισμένος", "ορισμεν"); + + // -ω,-α/-ξω,-ξα + checkOneTerm(a, "ανοίγω", "ανοιγ"); + checkOneTerm(a, "άνοιγα", "ανοιγ"); + checkOneTerm(a, "άνοιγε", "ανοιγ"); + checkOneTerm(a, "ανοίγοντας", "ανοιγ"); + checkOneTerm(a, "ανοίγομαι", "ανοιγ"); + checkOneTerm(a, "ανοιγόμουν", "ανοιγ"); + + checkOneTerm(a, "άνοιξα", "ανοιξ"); + checkOneTerm(a, "ανοίξω", "ανοιξ"); + checkOneTerm(a, "άνοιξε", "ανοιξ"); + checkOneTerm(a, "ανοίξει", "ανοιξ"); + + checkOneTerm(a, "ανοίχτηκα", "ανοιχτ"); + checkOneTerm(a, "ανοιχτώ", "ανοιχτ"); + checkOneTerm(a, "ανοίχτηκα", "ανοιχτ"); + checkOneTerm(a, "ανοιχτείς", "ανοιχτ"); + checkOneTerm(a, "ανοιχτεί", "ανοιχτ"); + + checkOneTerm(a, "ανοίξου", "ανοιξ"); + + //-ώ/-άω,-ούσα/-άσω,-ασα + checkOneTerm(a, "περνώ", "περν"); + checkOneTerm(a, "περνάω", "περν"); + checkOneTerm(a, "περνούσα", "περν"); + checkOneTerm(a, "πέρναγα", "περν"); + checkOneTerm(a, "πέρνα", "περν"); + checkOneTerm(a, 
"περνώντας", "περν"); + + checkOneTerm(a, "πέρασα", "περασ"); + checkOneTerm(a, "περάσω", "περασ"); + checkOneTerm(a, "πέρασε", "περασ"); + checkOneTerm(a, "περάσει", "περασ"); + + checkOneTerm(a, "περνιέμαι", "περν"); + checkOneTerm(a, "περνιόμουν", "περν"); + + checkOneTerm(a, "περάστηκα", "περαστ"); + checkOneTerm(a, "περαστώ", "περαστ"); + checkOneTerm(a, "περαστείς", "περαστ"); + checkOneTerm(a, "περαστεί", "περαστ"); + + checkOneTerm(a, "περασμένο", "περασμεν"); + checkOneTerm(a, "περασμένη", "περασμεν"); + checkOneTerm(a, "περασμένος", "περασμεν"); + + // -ώ/-άω,-ούσα/-άξω,-αξα + checkOneTerm(a, "πετώ", "πετ"); + checkOneTerm(a, "πετάω", "πετ"); + checkOneTerm(a, "πετούσα", "πετ"); + checkOneTerm(a, "πέταγα", "πετ"); + checkOneTerm(a, "πέτα", "πετ"); + checkOneTerm(a, "πετώντας", "πετ"); + checkOneTerm(a, "πετιέμαι", "πετ"); + checkOneTerm(a, "πετιόμουν", "πετ"); + + checkOneTerm(a, "πέταξα", "πεταξ"); + checkOneTerm(a, "πετάξω", "πεταξ"); + checkOneTerm(a, "πέταξε", "πεταξ"); + checkOneTerm(a, "πετάξει", "πεταξ"); + + checkOneTerm(a, "πετάχτηκα", "πεταχτ"); + checkOneTerm(a, "πεταχτώ", "πεταχτ"); + checkOneTerm(a, "πεταχτείς", "πεταχτ"); + checkOneTerm(a, "πεταχτεί", "πεταχτ"); + + checkOneTerm(a, "πεταμένο", "πεταμεν"); + checkOneTerm(a, "πεταμένη", "πεταμεν"); + checkOneTerm(a, "πεταμένος", "πεταμεν"); + + // -ώ/-άω,-ούσα / -έσω,-εσα + checkOneTerm(a, "καλώ", "καλ"); + checkOneTerm(a, "καλούσα", "καλ"); + checkOneTerm(a, "καλείς", "καλ"); + checkOneTerm(a, "καλώντας", "καλ"); + + checkOneTerm(a, "καλούμαι", "καλ"); + // pass. imperfect /imp. 
progressive doesnt conflate + checkOneTerm(a, "καλούμουν", "καλουμ"); + checkOneTerm(a, "καλείσαι", "καλεισα"); + + checkOneTerm(a, "καλέστηκα", "καλεστ"); + checkOneTerm(a, "καλεστώ", "καλεστ"); + checkOneTerm(a, "καλεστείς", "καλεστ"); + checkOneTerm(a, "καλεστεί", "καλεστ"); + + checkOneTerm(a, "καλεσμένο", "καλεσμεν"); + checkOneTerm(a, "καλεσμένη", "καλεσμεν"); + checkOneTerm(a, "καλεσμένος", "καλεσμεν"); + + checkOneTerm(a, "φορώ", "φορ"); + checkOneTerm(a, "φοράω", "φορ"); + checkOneTerm(a, "φορούσα", "φορ"); + checkOneTerm(a, "φόραγα", "φορ"); + checkOneTerm(a, "φόρα", "φορ"); + checkOneTerm(a, "φορώντας", "φορ"); + checkOneTerm(a, "φοριέμαι", "φορ"); + checkOneTerm(a, "φοριόμουν", "φορ"); + checkOneTerm(a, "φοριέσαι", "φορ"); + + checkOneTerm(a, "φόρεσα", "φορεσ"); + checkOneTerm(a, "φορέσω", "φορεσ"); + checkOneTerm(a, "φόρεσε", "φορεσ"); + checkOneTerm(a, "φορέσει", "φορεσ"); + + checkOneTerm(a, "φορέθηκα", "φορεθ"); + checkOneTerm(a, "φορεθώ", "φορεθ"); + checkOneTerm(a, "φορεθείς", "φορεθ"); + checkOneTerm(a, "φορεθεί", "φορεθ"); + + checkOneTerm(a, "φορεμένο", "φορεμεν"); + checkOneTerm(a, "φορεμένη", "φορεμεν"); + checkOneTerm(a, "φορεμένος", "φορεμεν"); + + // -ώ/-άω,-ούσα / -ήσω,-ησα + checkOneTerm(a, "κρατώ", "κρατ"); + checkOneTerm(a, "κρατάω", "κρατ"); + checkOneTerm(a, "κρατούσα", "κρατ"); + checkOneTerm(a, "κράταγα", "κρατ"); + checkOneTerm(a, "κράτα", "κρατ"); + checkOneTerm(a, "κρατώντας", "κρατ"); + + checkOneTerm(a, "κράτησα", "κρατ"); + checkOneTerm(a, "κρατήσω", "κρατ"); + checkOneTerm(a, "κράτησε", "κρατ"); + checkOneTerm(a, "κρατήσει", "κρατ"); + + checkOneTerm(a, "κρατούμαι", "κρατ"); + checkOneTerm(a, "κρατιέμαι", "κρατ"); + // this imperfect form doesnt conflate + checkOneTerm(a, "κρατούμουν", "κρατουμ"); + checkOneTerm(a, "κρατιόμουν", "κρατ"); + // this imp. 
prog form doesnt conflate + checkOneTerm(a, "κρατείσαι", "κρατεισα"); + + checkOneTerm(a, "κρατήθηκα", "κρατ"); + checkOneTerm(a, "κρατηθώ", "κρατ"); + checkOneTerm(a, "κρατηθείς", "κρατ"); + checkOneTerm(a, "κρατηθεί", "κρατ"); + checkOneTerm(a, "κρατήσου", "κρατ"); + + checkOneTerm(a, "κρατημένο", "κρατημεν"); + checkOneTerm(a, "κρατημένη", "κρατημεν"); + checkOneTerm(a, "κρατημένος", "κρατημεν"); + + // -.μαι,-.μουν / -.ώ,-.ηκα + checkOneTerm(a, "κοιμάμαι", "κοιμ"); + checkOneTerm(a, "κοιμόμουν", "κοιμ"); + checkOneTerm(a, "κοιμάσαι", "κοιμ"); + + checkOneTerm(a, "κοιμήθηκα", "κοιμ"); + checkOneTerm(a, "κοιμηθώ", "κοιμ"); + checkOneTerm(a, "κοιμήσου", "κοιμ"); + checkOneTerm(a, "κοιμηθεί", "κοιμ"); + + checkOneTerm(a, "κοιμισμένο", "κοιμισμεν"); + checkOneTerm(a, "κοιμισμένη", "κοιμισμεν"); + checkOneTerm(a, "κοιμισμένος", "κοιμισμεν"); + } + + public void testExceptions() throws Exception { + checkOneTerm(a, "καθεστώτα", "καθεστ"); + checkOneTerm(a, "καθεστώτος", "καθεστ"); + checkOneTerm(a, "καθεστώς", "καθεστ"); + checkOneTerm(a, "καθεστώτων", "καθεστ"); + + checkOneTerm(a, "χουμε", "χουμ"); + checkOneTerm(a, "χουμ", "χουμ"); + + checkOneTerm(a, "υποταγεσ", "υποταγ"); + checkOneTerm(a, "υποταγ", "υποταγ"); + + checkOneTerm(a, "εμετε", "εμετ"); + checkOneTerm(a, "εμετ", "εμετ"); + + checkOneTerm(a, "αρχοντασ", "αρχοντ"); + checkOneTerm(a, "αρχοντων", "αρχοντ"); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java index 61fc2c06ad1..c9dd4101021 100644 --- a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java @@ -33,6 +33,7 @@ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory @Override public void init(Map args) { super.init(args); + assureMatchVersion(); if (args.containsKey("charset")) throw new 
SolrException(ErrorCode.SERVER_ERROR, "The charset parameter is no longer supported. " diff --git a/solr/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java new file mode 100644 index 00000000000..2c6f005222f --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java @@ -0,0 +1,30 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.el.GreekStemFilter; + +/** Factory for {@link GreekStemFilter} */ +public class GreekStemFilterFactory extends BaseTokenFilterFactory { + + public TokenStream create(TokenStream input) { + return new GreekStemFilter(input); + } + +} diff --git a/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java index f697020e151..929204e776e 100644 --- a/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java +++ b/solr/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java @@ -31,10 +31,11 @@ public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase { /** * Ensure the filter actually lowercases (and a bit more) greek text. */ - public void testStemming() throws Exception { + public void testNormalization() throws Exception { Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ"); Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory(); + factory.init(DEFAULT_VERSION_PARAM); TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" }); } diff --git a/solr/src/test/org/apache/solr/analysis/TestGreekStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestGreekStemFilterFactory.java new file mode 100644 index 00000000000..c2ede0dd7e0 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestGreekStemFilterFactory.java @@ -0,0 +1,40 @@ +package org.apache.solr.analysis; + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.el.GreekLowerCaseFilter; + +/** + * Licensed to the Apache Software Foundation (ASF) 
under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Simple tests to ensure the Greek stem filter factory is working. + */ +public class TestGreekStemFilterFactory extends BaseTokenTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("άνθρωπος"); + Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader); + TokenStream normalized = new GreekLowerCaseFilter(DEFAULT_VERSION, tokenizer); + GreekStemFilterFactory factory = new GreekStemFilterFactory(); + TokenStream stream = factory.create(normalized); + assertTokenStreamContents(stream, new String[] { "ανθρωπ" }); + } +}