diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index c87674ad77a..fbb69265015 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -187,6 +187,9 @@ New features * LUCENE-2464: FastVectorHighlighter: add SingleFragListBuilder to return entire field contents. (Koji Sekiguchi) + * LUCENE-2503: Added lighter stemming alternatives for European languages. + (Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/modules/analysis/NOTICE.txt b/modules/analysis/NOTICE.txt index 6abde9313c7..35aadb73f73 100644 --- a/modules/analysis/NOTICE.txt +++ b/modules/analysis/NOTICE.txt @@ -17,30 +17,29 @@ were developed by Martin Porter and Richard Boulton. The full snowball package is available from http://snowball.tartarus.org/ -The Arabic stemmer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt. +The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt See http://members.unine.ch/jacques.savoy/clef/index.html. -The Persian analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. - -The Romanian analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. 
The file resides in -common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. - -The Bulgarian analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. - -The Hindi analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java The Stempel analyzer (stempel) includes BSD-licensed software developed by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java index eaf89d259e3..c56617643b8 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ar; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Normalizer for Arabic. *
@@ -96,20 +98,4 @@ public class ArabicNormalizer { return len; } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java index 098efa04077..c52916e76ab 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java @@ -1,4 +1,6 @@ package org.apache.lucene.analysis.ar; + + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -16,6 +18,8 @@ package org.apache.lucene.analysis.ar; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Stemmer for Arabic. *
@@ -86,7 +90,7 @@ public class ArabicStemmer { */ public int stemPrefix(char s[], int len) { for (int i = 0; i < prefixes.length; i++) - if (startsWith(s, len, prefixes[i])) + if (startsWithCheckLength(s, len, prefixes[i])) return deleteN(s, 0, len, prefixes[i].length); return len; } @@ -99,7 +103,7 @@ public class ArabicStemmer { */ public int stemSuffix(char s[], int len) { for (int i = 0; i < suffixes.length; i++) - if (endsWith(s, len, suffixes[i])) + if (endsWithCheckLength(s, len, suffixes[i])) len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length); return len; } @@ -111,7 +115,7 @@ public class ArabicStemmer { * @param prefix prefix to check * @return true if the prefix matches and can be stemmed */ - boolean startsWith(char s[], int len, char prefix[]) { + boolean startsWithCheckLength(char s[], int len, char prefix[]) { if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters return false; } else if (len < prefix.length + 2) { // other prefixes require only 2. 
@@ -132,7 +136,7 @@ public class ArabicStemmer { * @param suffix suffix to check * @return true if the suffix matches and can be stemmed */ - boolean endsWith(char s[], int len, char suffix[]) { + boolean endsWithCheckLength(char s[], int len, char suffix[]) { if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming return false; } else { @@ -142,37 +146,5 @@ public class ArabicStemmer { return true; } - } - - - /** - * Delete n characters in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len Length of input buffer - * @param nChars number of characters to delete - * @return length of input buffer after deletion - */ - protected int deleteN(char s[], int pos, int len, int nChars) { - for (int i = 0; i < nChars; i++) - len = delete(s, pos, len); - return len; - } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } - + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java index 1114517cf93..2aa23cda03c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.bg; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Light Stemmer for Bulgarian. *
@@ -138,15 +140,4 @@ public class BulgarianStemmer { return len; } - - private boolean endsWith(final char s[], final int len, final String suffix) { - final int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len -(suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java index 66c42971cf4..32980cc9056 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Light Stemmer for Czech. *
@@ -166,16 +168,4 @@ public class CzechStemmer { return len; } - - private boolean endsWith(char s[], int len, String suffix) { - int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len - (suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilter.java new file mode 100644 index 00000000000..a949a7d2010 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link GermanLightStemmer} to stem German + * words. + *
+ * To prevent terms from being stemmed, use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class GermanLightStemFilter extends TokenFilter { + private final GermanLightStemmer stemmer = new GermanLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public GermanLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java new file mode 100644 index 00000000000..04d8b5858d3 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java @@ -0,0 +1,138 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Light Stemmer for German. 
+ *+ * This stemmer implements the "UniNE" algorithm in: + * Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages + * Jacques Savoy + */ +public class GermanLightStemmer { + + public int stem(char s[], int len) { + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'ä': + case 'à': + case 'á': + case 'â': s[i] = 'a'; break; + case 'ö': + case 'ò': + case 'ó': + case 'ô': s[i] = 'o'; break; + case 'ï': + case 'ì': + case 'í': + case 'î': s[i] = 'i'; break; + case 'ü': + case 'ù': + case 'ú': + case 'û': s[i] = 'u'; break; + } + + len = step1(s, len); + return step2(s, len); + } + + private boolean stEnding(char ch) { + switch(ch) { + case 'b': + case 'd': + case 'f': + case 'g': + case 'h': + case 'k': + case 'l': + case 'm': + case 'n': + case 't': return true; + default: return false; + } + } + + private int step1(char s[], int len) { + if (len > 5 && s[len-3] == 'e' && s[len-2] == 'r' && s[len-1] == 'n') + return len - 3; + + if (len > 4 && s[len-2] == 'e') + switch(s[len-1]) { + case 'm': + case 'n': + case 'r': + case 's': return len - 2; + } + + if (len > 3 && s[len-1] == 'e') + return len - 1; + + if (len > 3 && s[len-1] == 's' && stEnding(s[len-2])) + return len - 1; + + return len; + } + + private int step2(char s[], int len) { + if (len > 5 && s[len-3] == 'e' && s[len-2] == 's' && s[len-1] == 't') + return len - 3; + + if (len > 4 && s[len-2] == 'e' && (s[len-1] == 'r' || s[len-1] == 'n')) + return len - 2; + + if (len > 4 && s[len-2] == 's' && s[len-1] == 't' && stEnding(s[len-3])) + return len - 2; + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilter.java new file mode 100644 index 00000000000..9a291c3a79d --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilter.java @@ -0,0 +1,58 @@ +package 
org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link GermanMinimalStemmer} to stem German + * words. + *
+ * To prevent terms from being stemmed, use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class GermanMinimalStemFilter extends TokenFilter { + private final GermanMinimalStemmer stemmer = new GermanMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public GermanMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java new file mode 100644 index 00000000000..52fc4a04d1b --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java @@ -0,0 +1,95 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Minimal Stemmer for German. + *+ * This stemmer implements the following algorithm: + * Morphologie et recherche d'information + * Jacques Savoy. 
+ */ +public class GermanMinimalStemmer { + + public int stem(char s[], int len) { + if (len < 5) + return len; + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'ä': s[i] = 'a'; break; + case 'ö': s[i] = 'o'; break; + case 'ü': s[i] = 'u'; break; + } + + if (len > 6 && s[len-3] == 'n' && s[len-2] == 'e' && s[len-1] == 'n') + return len - 3; + + if (len > 5) + switch(s[len-1]) { + case 'n': if (s[len-2] == 'e') return len - 2; else break; + case 'e': if (s[len-2] == 's') return len - 2; else break; + case 's': if (s[len-2] == 'e') return len - 2; else break; + case 'r': if (s[len-2] == 'e') return len - 2; else break; + } + + switch(s[len-1]) { + case 'n': + case 'e': + case 's': + case 'r': return len - 1; + } + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilter.java new file mode 100644 index 00000000000..fede035487a --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.en; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link EnglishMinimalStemmer} to stem + * English words. + *
+ * To prevent terms from being stemmed, use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class EnglishMinimalStemFilter extends TokenFilter { + private final EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public EnglishMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemmer.java new file mode 100644 index 00000000000..aff2e9f5a6b --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemmer.java @@ -0,0 +1,45 @@ +package org.apache.lucene.analysis.en; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Minimal plural stemmer for English. 
+ *+ * This stemmer implements the "S-Stemmer" from + * How Effective Is Suffixing? + * Donna Harman. + */ +public class EnglishMinimalStemmer { + public int stem(char s[], int len) { + if (len < 3 || s[len-1] != 's') + return len; + + switch(s[len-2]) { + case 'u': + case 's': return len; + case 'e': + if (len > 3 && s[len-3] == 'i' && s[len-4] != 'a' && s[len-4] != 'e') { + s[len - 3] = 'y'; + return len - 2; + } + if (s[len-3] == 'i' || s[len-3] == 'a' || s[len-3] == 'o' || s[len-3] == 'e') + return len; + default: return len - 1; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilter.java new file mode 100644 index 00000000000..79ade5c16f8 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.es; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link SpanishLightStemmer} to stem Spanish + * words. + *
+ * To prevent terms from being stemmed, use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class SpanishLightStemFilter extends TokenFilter { + private final SpanishLightStemmer stemmer = new SpanishLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public SpanishLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java new file mode 100644 index 00000000000..382faa841a4 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java @@ -0,0 +1,109 @@ +package org.apache.lucene.analysis.es; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * Light Stemmer for Spanish + *+ * This stemmer implements the algorithm described in: + * Report on CLEF-2001 Experiments + * Jacques Savoy + */ +public class SpanishLightStemmer { + + public int stem(char s[], int len) { + if (len < 5) + return len; + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'à': + case 'á': + case 'â': + case 'ä': s[i] = 'a'; break; + case 'ò': + case 'ó': + case 'ô': + case 'ö': s[i] = 'o'; break; + case 'è': + case 'é': + case 'ê': + case 'ë': s[i] = 'e'; break; + case 'ù': + case 'ú': + case 'û': + case 'ü': s[i] = 'u'; break; + case 'ì': + case 'í': + case 'î': + case 'ï': s[i] = 'i'; break; + } + + switch(s[len-1]) { + case 'o': + case 'a': + case 'e': return len - 1; + case 's': + if (s[len-2] == 'e' && s[len-3] == 's' && s[len-4] == 'e') + return len-2; + if (s[len-2] == 'e' && s[len-3] == 'c') { + s[len-3] = 'z'; + return len - 2; + } + if (s[len-2] == 'o' || s[len-2] == 'a' || s[len-2] == 'e') + return len - 2; + } + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java index 8798b4e3ee0..68407d2dc9e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.fa; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Normalizer for Persian. *
@@ -82,20 +84,4 @@ public class PersianNormalizer { return len; } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilter.java new file mode 100644 index 00000000000..12f58b97cfd --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.fi; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link FinnishLightStemmer} to stem Finnish + * words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class FinnishLightStemFilter extends TokenFilter { + private final FinnishLightStemmer stemmer = new FinnishLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public FinnishLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java new file mode 100644 index 00000000000..ae43caaeb12 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java @@ -0,0 +1,259 @@ +package org.apache.lucene.analysis.fi; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Finnish. 
+ *+ * This stemmer implements the algorithm described in: + * Report on CLEF-2003 Monolingual Tracks + * Jacques Savoy + */ +public class FinnishLightStemmer { + + public int stem(char s[], int len) { + if (len < 4) + return len; + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'ä': + case 'å': s[i] = 'a'; break; + case 'ö': s[i] = 'o'; break; + } + + len = step1(s, len); + len = step2(s, len); + len = step3(s, len); + len = norm1(s, len); + len = norm2(s, len); + return len; + } + + private int step1(char s[], int len) { + if (len > 8) { + if (endsWith(s, len, "kin")) + return step1(s, len-3); + if (endsWith(s, len, "ko")) + return step1(s, len-2); + } + + if (len > 11) { + if (endsWith(s, len, "dellinen")) + return len-8; + if (endsWith(s, len, "dellisuus")) + return len-9; + } + return len; + } + + private int step2(char s[], int len) { + if (len > 5) { + if (endsWith(s, len, "lla") + || endsWith(s, len, "tse") + || endsWith(s, len, "sti")) + return len-3; + + if (endsWith(s, len, "ni")) + return len-2; + + if (endsWith(s, len, "aa")) + return len-1; // aa -> a + } + + return len; + } + + private int step3(char s[], int len) { + if (len > 8) { + if (endsWith(s, len, "nnen")) { + s[len-4] = 's'; + return len-3; + } + + if (endsWith(s, len, "ntena")) { + s[len-5] = 's'; + return len-4; + } + + if (endsWith(s, len, "tten")) + return len-4; + + if (endsWith(s, len, "eiden")) + return len-5; + } + + if (len > 6) { + if (endsWith(s, len, "neen") + || endsWith(s, len, "niin") + || endsWith(s, len, "seen") + || endsWith(s, len, "teen") + || endsWith(s, len, "inen")) + return len-4; + + if (s[len-3] == 'h' && isVowel(s[len-2]) && s[len-1] == 'n') + return len-3; + + if (endsWith(s, len, "den")) { + s[len-3] = 's'; + return len-2; + } + + if (endsWith(s, len, "ksen")) { + s[len-4] = 's'; + return len-3; + } + + if (endsWith(s, len, "ssa") + || endsWith(s, len, "sta") + || endsWith(s, len, "lla") + || endsWith(s, len, "lta") + || endsWith(s, len, "tta") + || 
endsWith(s, len, "ksi") + || endsWith(s, len, "lle")) + return len-3; + } + + if (len > 5) { + if (endsWith(s, len, "na") + || endsWith(s, len, "ne")) + return len-2; + + if (endsWith(s, len, "nei")) + return len-3; + } + + if (len > 4) { + if (endsWith(s, len, "ja") + || endsWith(s, len, "ta")) + return len-2; + + if (s[len-1] == 'a') + return len-1; + + if (s[len-1] == 'n' && isVowel(s[len-2])) + return len-2; + + if (s[len-1] == 'n') + return len-1; + } + + return len; + } + + private int norm1(char s[], int len) { + if (len > 5 && endsWith(s, len, "hde")) { + s[len-3] = 'k'; + s[len-2] = 's'; + s[len-1] = 'i'; + } + + if (len > 4) { + if (endsWith(s, len, "ei") || endsWith(s, len, "at")) + return len-2; + } + + if (len > 3) + switch(s[len-1]) { + case 't': + case 's': + case 'j': + case 'e': + case 'a': + case 'i': return len-1; + } + + return len; + } + + private int norm2(char s[], int len) { + if (len > 8) { + if (s[len-1] == 'e' + || s[len-1] == 'o' + || s[len-1] == 'u') + len--; + } + + if (len > 4) { + if (s[len-1] == 'i') + len--; + + if (len > 4) { + char ch = s[0]; + for (int i = 1; i < len; i++) { + if (s[i] == ch && + (ch == 'k' || ch == 'p' || ch == 't')) + len = delete(s, i--, len); + else + ch = s[i]; + } + } + } + + return len; + } + + private boolean isVowel(char ch) { + switch(ch) { + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + case 'y': return true; + default: return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilter.java new file mode 100644 index 00000000000..65a13b561c0 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link FrenchLightStemmer} to stem French + * words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class FrenchLightStemFilter extends TokenFilter { + private final FrenchLightStemmer stemmer = new FrenchLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public FrenchLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java new file mode 100644 index 00000000000..43e2e0625d1 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java @@ -0,0 +1,267 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for French. 
+ *+ * This stemmer implements the "UniNE" algorithm in: + * Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages + * Jacques Savoy + */ +public class FrenchLightStemmer { + + public int stem(char s[], int len) { + if (len > 5 && s[len-1] == 'x') { + if (s[len-3] == 'a' && s[len-2] == 'u' && s[len-4] != 'e') + s[len-2] = 'l'; + len--; + } + + if (len > 3 && s[len-1] == 'x') + len--; + + if (len > 3 && s[len-1] == 's') + len--; + + if (len > 9 && endsWith(s, len, "issement")) { + len -= 6; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 8 && endsWith(s, len, "issant")) { + len -= 4; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 6 && endsWith(s, len, "ement")) { + len -= 4; + if (len > 3 && endsWith(s, len, "ive")) { + len--; + s[len-1] = 'f'; + } + return norm(s, len); + } + + if (len > 11 && endsWith(s, len, "ficatrice")) { + len -= 5; + s[len-2] = 'e'; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 10 && endsWith(s, len, "ficateur")) { + len -= 4; + s[len-2] = 'e'; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 9 && endsWith(s, len, "catrice")) { + len -= 3; + s[len-4] = 'q'; + s[len-3] = 'u'; + s[len-2] = 'e'; + //s[len-1] = 'r' <-- unnecessary, already 'r'. 
+ return norm(s, len); + } + + if (len > 8 && endsWith(s, len, "cateur")) { + len -= 2; + s[len-4] = 'q'; + s[len-3] = 'u'; + s[len-2] = 'e'; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 8 && endsWith(s, len, "atrice")) { + len -= 4; + s[len-2] = 'e'; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 7 && endsWith(s, len, "ateur")) { + len -= 3; + s[len-2] = 'e'; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 6 && endsWith(s, len, "trice")) { + len--; + s[len-3] = 'e'; + s[len-2] = 'u'; + s[len-1] = 'r'; + } + + if (len > 5 && endsWith(s, len, "ième")) + return norm(s, len-4); + + if (len > 7 && endsWith(s, len, "teuse")) { + len -= 2; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 6 && endsWith(s, len, "teur")) { + len--; + s[len-1] = 'r'; + return norm(s, len); + } + + if (len > 5 && endsWith(s, len, "euse")) + return norm(s, len-2); + + if (len > 8 && endsWith(s, len, "ère")) { + len--; + s[len-2] = 'e'; + return norm(s, len); + } + + if (len > 7 && endsWith(s, len, "ive")) { + len--; + s[len-1] = 'f'; + return norm(s, len); + } + + if (len > 4 && + (endsWith(s, len, "folle") || + endsWith(s, len, "molle"))) { + len -= 2; + s[len-1] = 'u'; + return norm(s, len); + } + + if (len > 9 && endsWith(s, len, "nnelle")) + return norm(s, len-5); + + if (len > 9 && endsWith(s, len, "nnel")) + return norm(s, len-3); + + if (len > 4 && endsWith(s, len, "ète")) { + len--; + s[len-2] = 'e'; + } + + if (len > 8 && endsWith(s, len, "ique")) + len -= 4; + + if (len > 8 && endsWith(s, len, "esse")) + return norm(s, len-3); + + if (len > 7 && endsWith(s, len, "inage")) + return norm(s, len-3); + + if (len > 9 && endsWith(s, len, "isation")) { + len -= 7; + if (len > 5 && endsWith(s, len, "ual")) + s[len-2] = 'e'; + return norm(s, len); + } + + if (len > 9 && endsWith(s, len, "isateur")) + return norm(s, len-7); + + if (len > 8 && endsWith(s, len, "ation")) + return norm(s, len-5); + + if (len > 8 && endsWith(s, len, "ition")) + return 
norm(s, len-5); + + return norm(s, len); + } + + private int norm(char s[], int len) { + if (len > 4) { + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'à': + case 'á': + case 'â': s[i] = 'a'; break; + case 'ô': s[i] = 'o'; break; + case 'è': + case 'é': + case 'ê': s[i] = 'e'; break; + case 'ù': + case 'û': s[i] = 'u'; break; + case 'î': s[i] = 'i'; break; + case 'ç': s[i] = 'c'; break; + } + + char ch = s[0]; + for (int i = 1; i < len; i++) { + if (s[i] == ch) + len = delete(s, i--, len); + else + ch = s[i]; + } + } + + if (len > 4 && endsWith(s, len, "ie")) + len -= 2; + + if (len > 4) { + if (s[len-1] == 'r') len--; + if (s[len-1] == 'e') len--; + if (s[len-1] == 'e') len--; + if (s[len-1] == s[len-2]) len--; + } + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilter.java new file mode 100644 index 00000000000..3d7789b0696 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link FrenchMinimalStemmer} to stem French + * words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class FrenchMinimalStemFilter extends TokenFilter { + private final FrenchMinimalStemmer stemmer = new FrenchMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public FrenchMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java new file mode 100644 index 00000000000..788231965b4 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java @@ -0,0 +1,80 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Light Stemmer for French. + *+ * This stemmer implements the following algorithm: + * A Stemming procedure and stopword list for general French corpora. + * Jacques Savoy. 
/**
 * Minimal stemmer for French.
 * <p>
 * Implements the algorithm from
 * <i>A Stemming procedure and stopword list for general French corpora</i>
 * by Jacques Savoy.
 */
public class FrenchMinimalStemmer {

  /** Final characters removed one after another, in exactly this order. */
  private static final char[] STRIP_ORDER = {'s', 'r', 'e', 'é'};

  /**
   * Stems the word stored in {@code s[0..len)} in place.
   * <p>
   * Words shorter than six characters are returned untouched. A word ending
   * in 'x' loses that letter, with "-aux" rewritten to "-al". Any other word
   * has a final s, r, e and é removed in that order, and finally one half of
   * a doubled last character.
   *
   * @param s   buffer holding the word; may be modified
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed word
   */
  public int stem(char[] s, int len) {
    if (len < 6) {
      return len;
    }

    if (s[len - 1] == 'x') {
      final boolean auxEnding = s[len - 3] == 'a' && s[len - 2] == 'u';
      if (auxEnding) {
        s[len - 2] = 'l';
      }
      return len - 1;
    }

    int end = len;
    for (char candidate : STRIP_ORDER) {
      if (s[end - 1] == candidate) {
        end--;
      }
    }
    if (s[end - 1] == s[end - 2]) {
      end--;
    }
    return end;
  }
}
@@ -176,19 +178,4 @@ public class HindiNormalizer { return len; } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java index 255ffa2cd4e..68ef2ccf51f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hi; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Light Stemmer for Hindi. *
@@ -116,15 +118,4 @@ public class HindiStemmer { return len - 1; return len; } - - private boolean endsWith(final char s[], final int len, final String suffix) { - final int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len -(suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilter.java new file mode 100644 index 00000000000..f3f06fbbd2f --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.hu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link HungarianLightStemmer} to stem + * Hungarian words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class HungarianLightStemFilter extends TokenFilter { + private final HungarianLightStemmer stemmer = new HungarianLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public HungarianLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java new file mode 100644 index 00000000000..31b5e6fbad3 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java @@ -0,0 +1,238 @@ +package org.apache.lucene.analysis.hu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Hungarian. 
+ *
+ * This stemmer implements the "UniNE" algorithm in:
+ * Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages
+ * Jacques Savoy
+ */
+public class HungarianLightStemmer {
+  /**
+   * Stems the term held in the first {@code len} characters of {@code s}.
+   *
+   * Accented vowels are first overwritten in place with their unaccented base
+   * letter. Four stripping steps then run in a fixed order (removeCase,
+   * removePossessive, removePlural, normalize); each one only shrinks the
+   * reported length and never writes to the buffer. Every stripping rule is
+   * guarded so that at least three characters remain, so terms of three
+   * characters or fewer only get the accent folding.
+   *
+   * NOTE(review): only lower-case letters are matched anywhere in this class,
+   * so the input is presumably lower-cased by an earlier filter -- confirm.
+   *
+   * @param s buffer holding the term; modified in place by the accent folding
+   * @param len number of valid characters in {@code s}
+   * @return length of the stem, i.e. how many leading characters of {@code s} to keep
+   */
+  public int stem(char s[], int len) {
+    // Fold accented vowels to a/e/i/o/u so the suffix tables below can be
+    // written with unaccented letters only.
+    for (int i = 0; i < len; i++)
+      switch(s[i]) {
+        case 'á': s[i] = 'a'; break;
+        case 'ë':
+        case 'é': s[i] = 'e'; break;
+        case 'í': s[i] = 'i'; break;
+        case 'ó':
+        case 'ő':
+        // NOTE(review): the tilde/circumflex variants here and below are
+        // presumably look-alikes of the double-acute letters coming from
+        // mis-encoded text -- confirm against the reference implementation.
+        case 'õ':
+        case 'ö': s[i] = 'o'; break;
+        case 'ú':
+        case 'ű':
+        case 'ũ':
+        case 'û':
+        case 'ü': s[i] = 'u'; break;
+      }
+
+    // Order matters: each step works on the length left by the previous one.
+    len = removeCase(s, len);
+    len = removePossessive(s, len);
+    len = removePlural(s, len);
+    return normalize(s, len);
+  }
+
+  /**
+   * Strips at most one trailing suffix from a fixed list (going by the method
+   * name, grammatical case endings) and returns the shortened length.
+   *
+   * Rules are tried from the longest suffix to the shortest and the first
+   * match wins. The length guard on each tier ensures at least three
+   * characters are left. The buffer itself is only read.
+   *
+   * endsWith is not defined in this class; it presumably resolves to the
+   * static import of StemmerUtil at the top of the file -- confirm.
+   *
+   * @param s buffer holding the (already accent-folded) term
+   * @param len number of valid characters in {@code s}
+   * @return {@code len} minus the length of the stripped suffix, or {@code len} if no rule matched
+   */
+  private int removeCase(char s[], int len) {
+    if (len > 6 && endsWith(s, len, "kent"))
+      return len - 4;
+
+    if (len > 5) {
+      if (endsWith(s, len, "nak") ||
+          endsWith(s, len, "nek") ||
+          endsWith(s, len, "val") ||
+          endsWith(s, len, "vel") ||
+          endsWith(s, len, "ert") ||
+          endsWith(s, len, "rol") ||
+          endsWith(s, len, "ban") ||
+          endsWith(s, len, "ben") ||
+          endsWith(s, len, "bol") ||
+          endsWith(s, len, "nal") ||
+          endsWith(s, len, "nel") ||
+          endsWith(s, len, "hoz") ||
+          endsWith(s, len, "hez") ||
+          endsWith(s, len, "tol"))
+        return len - 3;
+
+      // "al"/"el" after a doubled non-vowel: drop the suffix together with one
+      // of the two doubled characters (3 chars). If the doubling test fails,
+      // control falls through to the two-character tier below.
+      if (endsWith(s, len, "al") || endsWith(s, len, "el")) {
+        if (!isVowel(s[len-3]) && s[len-3] == s[len-4])
+          return len - 3;
+      }
+    }
+
+    if (len > 4) {
+      if (endsWith(s, len, "at") ||
+          endsWith(s, len, "et") ||
+          endsWith(s, len, "ot") ||
+          endsWith(s, len, "va") ||
+          endsWith(s, len, "ve") ||
+          endsWith(s, len, "ra") ||
+          endsWith(s, len, "re") ||
+          endsWith(s, len, "ba") ||
+          endsWith(s, len, "be") ||
+          endsWith(s, len, "ul") ||
+          endsWith(s, len, "ig"))
+        return len - 2;
+
+      // "on"/"en" is only stripped when the character before it is not a vowel.
+      if ((endsWith(s, len, "on") || endsWith(s, len, "en")) && !isVowel(s[len-3]))
+        return len - 2;
+
+      // Single-character endings: a final 't' or 'n' is always dropped; a final
+      // 'a'/'e' is dropped (plus one of the doubled characters) only after a
+      // doubled non-vowel. There is no default case: anything else is kept.
+      switch(s[len-1]) {
+        case 't':
+        case 'n': return len - 1;
+        case 'a':
+        case 'e': if (s[len-2] == s[len-3] && !isVowel(s[len-2])) return len - 2;
+      }
+    }
+
+    return len;
+  }
+
+  /**
+   * Strips at most one trailing suffix from a fixed list (going by the method
+   * name, possessive endings) and returns the shortened length.
+   *
+   * Tiers run from four-character suffixes down to a single character; the
+   * first matching rule wins and each tier's length guard leaves at least
+   * three characters. Most rules also test whether the character immediately
+   * before the suffix is a vowel (see isVowel). The buffer is only read.
+   *
+   * @param s buffer holding the term
+   * @param len number of valid characters in {@code s}
+   * @return {@code len} minus the length of the stripped suffix, or {@code len} if no rule matched
+   */
+  private int removePossessive(char s[], int len) {
+    if (len > 6) {
+      // These three suffixes require a non-vowel right before them.
+      if (!isVowel(s[len-5]) &&
+         (endsWith(s, len, "atok") ||
+          endsWith(s, len, "otok") ||
+          endsWith(s, len, "etek")))
+        return len - 4;
+
+      if (endsWith(s, len, "itek") || endsWith(s, len, "itok"))
+        return len - 4;
+    }
+
+    if (len > 5) {
+      if (!isVowel(s[len-4]) &&
+        (endsWith(s, len, "unk") ||
+         endsWith(s, len, "tok") ||
+         endsWith(s, len, "tek")))
+        return len - 3;
+
+      // "juk" is the mirror case: only stripped after a vowel.
+      if (isVowel(s[len-4]) && endsWith(s, len, "juk"))
+        return len - 3;
+
+      if (endsWith(s, len, "ink"))
+        return len - 3;
+    }
+
+    if (len > 4) {
+      if (!isVowel(s[len-3]) &&
+         (endsWith(s, len, "am") ||
+          endsWith(s, len, "em") ||
+          endsWith(s, len, "om") ||
+          endsWith(s, len, "ad") ||
+          endsWith(s, len, "ed") ||
+          endsWith(s, len, "od") ||
+          endsWith(s, len, "uk")))
+        return len - 2;
+
+      if (isVowel(s[len-3]) &&
+         (endsWith(s, len, "nk") ||
+          endsWith(s, len, "ja") ||
+          endsWith(s, len, "je")))
+        return len - 2;
+
+      if (endsWith(s, len, "im") ||
+          endsWith(s, len, "id") ||
+          endsWith(s, len, "ik"))
+        return len - 2;
+    }
+
+    // Single-character endings: 'a'/'e' after a non-vowel, 'm'/'d' after a
+    // vowel, and 'i' unconditionally.
+    if (len > 3)
+      switch(s[len-1]) {
+        case 'a':
+        case 'e': if (!isVowel(s[len-2])) return len - 1; break;
+        case 'm':
+        case 'd': if (isVowel(s[len-2])) return len - 1; break;
+        case 'i': return len - 1;
+      }
+
+    return len;
+  }
+
+  /**
+   * Strips a trailing 'k' (going by the method name, the plural marker),
+   * together with a preceding 'a', 'o' or 'e' when the term is long enough.
+   *
+   * @param s buffer holding the term; only read
+   * @param len number of valid characters in {@code s}
+   * @return {@code len - 2}, {@code len - 1} or {@code len}; never less than 3 when something is stripped
+   */
+  private int removePlural(char s[], int len) {
+    if (len > 3 && s[len-1] == 'k')
+      switch(s[len-2]) {
+        case 'a':
+        case 'o':
+        // With exactly four characters, dropping vowel + 'k' would leave only
+        // two, so control falls into the default case and drops just the 'k'.
+        case 'e': if (len > 4) return len - 2; /* intentional fallthru */
+        default: return len - 1;
+      }
+    return len;
+  }
+
+  /**
+   * Final step: drops one trailing 'a', 'e', 'i' or 'o' from terms longer than
+   * three characters. Note that 'u' is not in the list.
+   *
+   * @param s buffer holding the term; only read
+   * @param len number of valid characters in {@code s}
+   * @return {@code len - 1} if a final vowel was dropped, otherwise {@code len}
+   */
+  private int normalize(char s[], int len) {
+    if (len > 3)
+      switch(s[len-1]) {
+        case 'a':
+        case 'e':
+        case 'i':
+        case 'o': return len - 1;
+      }
+    return len;
+  }
+
+  /**
+   * Returns true for the unaccented letters a, e, i, o, u and y. Accented
+   * vowels are not listed because stem() folds them before any rule runs.
+   *
+   * @param ch character to classify
+   * @return whether {@code ch} counts as a vowel for the stripping rules
+   */
+  private boolean isVowel(char ch) {
+    switch(ch) {
+      case 'a':
+      case 'e':
+      case 'i':
+      case 'o':
+      case 'u':
+      case 'y': return true;
+      default: return false;
+    }
+  }
+}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java index 82afce8434a..0b7308c48a7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.id; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Stemmer for Indonesian. *
@@ -266,39 +268,5 @@ public class IndonesianStemmer { return length - 1; } return length; - } - - private boolean startsWith(char s[], int len, String prefix) { - final int prefixLen = prefix.length(); - if (prefixLen > len) - return false; - for (int i = 0; i < prefixLen; i++) - if (s[i] != prefix.charAt(i)) - return false; - return true; - } - - private boolean endsWith(char s[], int len, String suffix) { - final int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len -(suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } - - private int deleteN(char s[], int pos, int len, int nChars) { - for (int i = 0; i < nChars; i++) - len = delete(s, pos, len); - return len; - } - - private int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java index 0f7fcf787ff..2f3c3749cd3 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.in; import java.util.BitSet; import java.util.IdentityHashMap; import static java.lang.Character.UnicodeBlock.*; +import static org.apache.lucene.analysis.util.StemmerUtil.*; /** * Normalizes the Unicode representation of text in Indian languages. 
@@ -290,14 +291,4 @@ public class IndicNormalizer { return len; } - - /** - * Delete a character in-place - */ - private int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java new file mode 100644 index 00000000000..af9625cfc05 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.it; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link ItalianLightStemmer} to stem Italian + * words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class ItalianLightStemFilter extends TokenFilter { + private final ItalianLightStemmer stemmer = new ItalianLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public ItalianLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java new file mode 100644 index 00000000000..50a80bd6af1 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java @@ -0,0 +1,117 @@ +package org.apache.lucene.analysis.it; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Light Stemmer for Italian. 
+ *+ * This stemmer implements the algorithm described in: + * Report on CLEF-2001 Experiments + * Jacques Savoy + */ +public class ItalianLightStemmer { + + public int stem(char s[], int len) { + if (len < 6) + return len; + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'à': + case 'á': + case 'â': + case 'ä': s[i] = 'a'; break; + case 'ò': + case 'ó': + case 'ô': + case 'ö': s[i] = 'o'; break; + case 'è': + case 'é': + case 'ê': + case 'ë': s[i] = 'e'; break; + case 'ù': + case 'ú': + case 'û': + case 'ü': s[i] = 'u'; break; + case 'ì': + case 'í': + case 'î': + case 'ï': s[i] = 'i'; break; + } + + switch(s[len-1]) { + case 'e': + if (s[len-2] == 'i' || s[len-2] == 'h') + return len - 2; + else + return len - 1; + case 'i': + if (s[len-2] == 'h' || s[len-2] == 'i') + return len - 2; + else + return len - 1; + case 'a': + if (s[len-2] == 'i') + return len - 2; + else + return len - 1; + case 'o': + if (s[len-2] == 'i') + return len - 2; + else + return len - 1; + } + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java new file mode 100644 index 00000000000..81d268a91aa --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.pt; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link PortugueseLightStemmer} to stem + * Portuguese words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class PortugueseLightStemFilter extends TokenFilter { + private final PortugueseLightStemmer stemmer = new PortugueseLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public PortugueseLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java new file mode 100644 index 00000000000..1baea680c0b --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java @@ -0,0 +1,202 @@ +package org.apache.lucene.analysis.pt; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Portuguese + */ +public class PortugueseLightStemmer { + + public int stem(char s[], int len) { + if (len < 4) + return len; + + len = removeSuffix(s, len); + + if (len > 3 && s[len-1] == 'a') + len = normFeminine(s, len); + + if (len > 4) + switch(s[len-1]) { + case 'e': + case 'a': + case 'o': len--; break; + } + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'à': + case 'á': + case 'â': + case 'ä': + case 'ã': s[i] = 'a'; break; + case 'ò': + case 'ó': + case 'ô': + case 'ö': + case 'õ': s[i] = 'o'; break; + case 'è': + case 'é': + case 'ê': + case 'ë': s[i] = 'e'; break; + case 'ù': + case 'ú': + case 'û': + case 'ü': s[i] = 'u'; break; + case 'ì': + case 'í': + case 'î': + case 'ï': s[i] = 'i'; break; + case 'ç': s[i] = 'c'; break; + } + + return len; + } + + private int removeSuffix(char s[], int len) { + if (len > 4 && endsWith(s, len, "es")) + switch(s[len-3]) { + case 'r': + case 's': + case 'l': + case 'z': return len - 2; + } + + if (len > 3 && endsWith(s, len, "ns")) { + s[len - 2] = 'm'; + return len - 1; + } + + if (len > 4 && (endsWith(s, len, "eis") || endsWith(s, len, "éis"))) { + s[len - 3] = 'e'; + s[len - 2] = 'l'; + return len - 1; + } + + if (len > 4 && endsWith(s, len, "ais")) { + s[len - 2] = 'l'; + return len - 1; + } + + if (len > 4 && endsWith(s, len, "óis")) { + s[len - 3] = 'o'; + s[len - 2] = 'l'; + return len - 1; + } + + if (len > 4 && endsWith(s, len, "is")) { + s[len - 1] = 'l'; + return len; + } + + if (len > 3 && + (endsWith(s, len, "ões") || + endsWith(s, len, "ães"))) { + len--; + s[len - 2] = 'ã'; + s[len - 1] = 'o'; + return len; + } + + if (len > 6 && endsWith(s, len, "mente")) + return len - 5; + + if (len > 3 && s[len-1] == 's') + return len - 1; + return len; + } + + private int normFeminine(char s[], int len) { + if (len > 7 && + (endsWith(s, len, "inha") || + endsWith(s, len, "iaca") || + endsWith(s, len, "eira"))) 
{ + s[len - 1] = 'o'; + return len; + } + + if (len > 6) { + if (endsWith(s, len, "osa") || + endsWith(s, len, "ica") || + endsWith(s, len, "ida") || + endsWith(s, len, "ada") || + endsWith(s, len, "iva") || + endsWith(s, len, "ama")) { + s[len - 1] = 'o'; + return len; + } + + if (endsWith(s, len, "ona")) { + s[len - 3] = 'ã'; + s[len - 2] = 'o'; + return len - 1; + } + + if (endsWith(s, len, "ora")) + return len - 1; + + if (endsWith(s, len, "esa")) { + s[len - 3] = 'ê'; + return len - 1; + } + + if (endsWith(s, len, "na")) { + s[len - 1] = 'o'; + return len; + } + } + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java new file mode 100644 index 00000000000..aa5a3716653 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.pt; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link PortugueseMinimalStemmer} to stem + * Portuguese words. + *+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class PortugueseMinimalStemFilter extends TokenFilter { + private final PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public PortugueseMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java new file mode 100644 index 00000000000..7ce19e37445 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java @@ -0,0 +1,119 @@ +package org.apache.lucene.analysis.pt; + +import java.util.Arrays; + +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.Version; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Minimal Stemmer for Portuguese + *+ * This follows the "RSLP-S" algorithm presented in: + * A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese + * Information Retrieval (Orengo, et al) + * which is just the plural reduction step of the RSLP + * algorithm from A Stemming Algorithm for the Portuguese Language, + * Orengo et al. + */ +public class PortugueseMinimalStemmer { + + private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31, + Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois", + "depois","dois","leis"), + false); + + private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31, + Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos", + "férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés", + "através", "convés", "ês", "país", "após", "ambas", "ambos", + "messias", "depois"), + false); + + public int stem(char s[], int len) { + if (len < 3 || s[len-1] != 's') + return len; + + if (s[len-2] == 'n') { + len--; + s[len-1] = 'm'; + return len; + } + + if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') { + len--; + s[len-2] = 'ã'; + s[len-1] = 'o'; + return len; + } + + if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e') + if (!(len == 4 && s[0] == 'm')) { + len--; + s[len-1] = 'o'; + return len; + } + + if (len >= 4 && s[len-2] == 'i') { + if (s[len-3] == 'a') + if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) { + len--; + s[len-1] = 'l'; + return len; + } + + if (len >= 5 && s[len-3] == 'é') { + len--; + s[len-2] = 'e'; + s[len-1] = 'l'; + return len; + } + + if (len >= 5 && s[len-3] == 'e') { + len--; + s[len-1] = 'l'; + return len; + } + + if (len >= 5 && s[len-3] == 'ó') { + len--; + s[len-2] = 'o'; + s[len-1] = 'l'; + return len; + } + + if (!excIS.contains(s, 0, len)) { + s[len-1] = 'l'; + return len; + } + } + + if (len >= 6 && s[len-3] ==
'l' && s[len-2] == 'e') + return len - 2; + + if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e') + if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o')) + return len - 2; + + if (excS.contains(s, 0, len)) + return len; + else + return len-1; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java new file mode 100644 index 00000000000..826b22dc13c --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.ru; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link RussianLightStemmer} to stem Russian + * words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class RussianLightStemFilter extends TokenFilter { + private final RussianLightStemmer stemmer = new RussianLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public RussianLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java new file mode 100644 index 00000000000..e58bf38f6a9 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java @@ -0,0 +1,153 @@ +package org.apache.lucene.analysis.ru; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Russian. + *+ * This stemmer implements the following algorithm: + * Indexing and Searching Strategies for the Russian Language. + * Ljiljana Dolamic and Jacques Savoy. 
+ */ +public class RussianLightStemmer { + + public int stem(char s[], int len) { + len = removeCase(s, len); + return normalize(s, len); + } + + private int normalize(char s[], int len) { + if (len > 3) + switch(s[len-1]) { + case 'ь': + case 'и': return len - 1; + case 'н': if (s[len-2] == 'н') return len - 1; + } + return len; + } + + private int removeCase(char s[], int len) { + if (len > 6 && + (endsWith(s, len, "иями") || + endsWith(s, len, "оями"))) + return len - 4; + + if (len > 5 && + (endsWith(s, len, "иям") || + endsWith(s, len, "иях") || + endsWith(s, len, "оях") || + endsWith(s, len, "ями") || + endsWith(s, len, "оям") || + endsWith(s, len, "оьв") || + endsWith(s, len, "ами") || + endsWith(s, len, "его") || + endsWith(s, len, "ему") || + endsWith(s, len, "ери") || + endsWith(s, len, "ими") || + endsWith(s, len, "ого") || + endsWith(s, len, "ому") || + endsWith(s, len, "ыми") || + endsWith(s, len, "оев"))) + return len - 3; + + if (len > 4 && + (endsWith(s, len, "ая") || + endsWith(s, len, "яя") || + endsWith(s, len, "ях") || + endsWith(s, len, "юю") || + endsWith(s, len, "ах") || + endsWith(s, len, "ею") || + endsWith(s, len, "их") || + endsWith(s, len, "ия") || + endsWith(s, len, "ию") || + endsWith(s, len, "ьв") || + endsWith(s, len, "ою") || + endsWith(s, len, "ую") || + endsWith(s, len, "ям") || + endsWith(s, len, "ых") || + endsWith(s, len, "ея") || + endsWith(s, len, "ам") || + endsWith(s, len, "ем") || + endsWith(s, len, "ей") || + endsWith(s, len, "ём") || + endsWith(s, len, "ев") || + endsWith(s, len, "ий") || + endsWith(s, len, "им") || + endsWith(s, len, "ое") || + endsWith(s, len, "ой") || + endsWith(s, len, "ом") || + endsWith(s, len, "ов") || + endsWith(s, len, "ые") || + endsWith(s, len, "ый") || + endsWith(s, len, "ым") || + endsWith(s, len, "ми"))) + return len - 2; + + if (len > 3) + switch(s[len-1]) { + case 'а': + case 'е': + case 'и': + case 'о': + case 'у': + case 'й': + case 'ы': + case 'я': + case 'ь': return len - 1; + } + + 
return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java new file mode 100644 index 00000000000..226c974576a --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.sv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link SwedishLightStemmer} to stem Swedish + * words. + *
+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *
+ */ +public final class SwedishLightStemFilter extends TokenFilter { + private final SwedishLightStemmer stemmer = new SwedishLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public SwedishLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java new file mode 100644 index 00000000000..036ba5f4cb9 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java @@ -0,0 +1,111 @@ +package org.apache.lucene.analysis.sv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Swedish. + *
+ * This stemmer implements the algorithm described in:
+ * Report on CLEF-2003 Monolingual Tracks
+ * Jacques Savoy
+ */
+public class SwedishLightStemmer {
+
+  /** Five-letter endings; only tried on words longer than seven characters. */
+  private static final String[] ENDINGS_5 = { "elser", "heten" };
+
+  /** Four-letter endings; only tried on words longer than six characters. */
+  private static final String[] ENDINGS_4 = {
+    "arne", "erna", "ande", "else", "aste", "orna", "aren"
+  };
+
+  /** Three-letter endings; only tried on words longer than five characters. */
+  private static final String[] ENDINGS_3 = { "are", "ast", "het" };
+
+  /** Two-letter endings; only tried on words longer than four characters. */
+  private static final String[] ENDINGS_2 = {
+    "ar", "er", "or", "en", "at", "te", "et"
+  };
+
+  /**
+   * Computes the length of the stem of a word. The buffer is only read;
+   * the caller truncates the term to the returned length.
+   *
+   * @param s buffer holding the word
+   * @param len number of valid characters in <code>s</code>
+   * @return length of the stem, never greater than <code>len</code>
+   */
+  public int stem(char s[], int len) {
+    int end = len;
+
+    // a trailing 's' is dropped first, so every rule below sees the word without it
+    if (end > 4 && s[end - 1] == 's') {
+      end--;
+    }
+
+    // longest endings are tried first; the first group that matches wins
+    if (end > 7 && hasAnyEnding(s, end, ENDINGS_5)) {
+      return end - 5;
+    }
+    if (end > 6 && hasAnyEnding(s, end, ENDINGS_4)) {
+      return end - 4;
+    }
+    if (end > 5 && hasAnyEnding(s, end, ENDINGS_3)) {
+      return end - 3;
+    }
+    if (end > 4 && hasAnyEnding(s, end, ENDINGS_2)) {
+      return end - 2;
+    }
+
+    // last resort: strip a single final 't', 'a', 'e' or 'n'
+    if (end > 3) {
+      final char last = s[end - 1];
+      if (last == 't' || last == 'a' || last == 'e' || last == 'n') {
+        return end - 1;
+      }
+    }
+
+    return end;
+  }
+
+  /** Returns true if the first <code>len</code> chars of <code>s</code> end with any of the candidates. */
+  private static boolean hasAnyEnding(char s[], int len, String[] candidates) {
+    for (int i = 0; i < candidates.length; i++) {
+      if (endsWith(s, len, candidates[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
new file mode 100644
index 00000000000..883a7af8109
--- /dev/null
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Some commonly-used stemming functions */
+public class StemmerUtil {
+  /**
+   * Returns true if the character array starts with the prefix.
+   *
+   * @param s Input Buffer
+   * @param len length of input buffer
+   * @param prefix Prefix string to test
+   * @return true if <code>s</code> starts with <code>prefix</code>
+   */
+  public static boolean startsWith(char s[], int len, String prefix) {
+    final int prefixLen = prefix.length();
+    if (prefixLen > len)
+      return false;
+    for (int i = 0; i < prefixLen; i++)
+      if (s[i] != prefix.charAt(i))
+        return false;
+    return true;
+  }
+
+  /**
+   * Returns true if the character array ends with the suffix.
+   *
+   * @param s Input Buffer
+   * @param len length of input buffer
+   * @param suffix Suffix string to test
+   * @return true if <code>s</code> ends with <code>suffix</code>
+   */
+  public static boolean endsWith(char s[], int len, String suffix) {
+    final int suffixLen = suffix.length();
+    if (suffixLen > len)
+      return false;
+    // compare back to front: suffix[i] lines up with s[len - (suffixLen - i)]
+    for (int i = suffixLen - 1; i >= 0; i--)
+      if (s[len -(suffixLen - i)] != suffix.charAt(i))
+        return false;
+
+    return true;
+  }
+
+  /**
+   * Delete a character in-place
+   *
+   * @param s Input Buffer
+   * @param pos Position of character to delete
+   * @param len length of input buffer
+   * @return length of input buffer after deletion
+   */
+  public static int delete(char s[], int pos, int len) {
+    // when pos is at or past len there is nothing to shift; the length still shrinks
+    if (pos < len)
+      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+    return len - 1;
+  }
+
+  /**
+   * Delete n characters in-place. The characters following the deleted
+   * range are moved down with a single copy. Only the first
+   * <code>len - nChars</code> characters of the buffer are meaningful
+   * afterwards; whatever lies beyond the returned length is unspecified.
+   *
+   * @param s Input Buffer
+   * @param pos Position of the first character to delete
+   * @param len Length of input buffer
+   * @param nChars number of characters to delete; values &lt;= 0 delete nothing
+   * @return length of input buffer after deletion
+   */
+  public static int deleteN(char s[], int pos, int len, int nChars) {
+    if (nChars <= 0)
+      return len;
+    // only shift when something survives behind the deleted range
+    if (nChars < len - pos)
+      System.arraycopy(s, pos + nChars, s, pos, len - pos - nChars);
+    return len - nChars;
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
new file mode 100644
index 00000000000..63dfdb6c4cc
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Checks {@link GermanLightStemFilter} against recorded stemmer output.
+ */
+public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
+  // splits on whitespace and runs every token through the light stemmer
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String field, Reader input) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, input);
+      GermanLightStemFilter stemmed = new GermanLightStemFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stemmed);
+    }
+  };
+
+  /** Compares the filter output with a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(
+        analyzer,
+        getDataFile("delighttestdata.zip"),
+        "delight.txt");
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
new file mode 100644
index 00000000000..c14c7ea4076
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
@@ -0,0 +1,60 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Checks {@link GermanMinimalStemFilter} on hand-picked words and on
+ * recorded stemmer output.
+ */
+public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
+  // splits on whitespace and runs every token through the minimal stemmer
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String field, Reader input) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, input);
+      GermanMinimalStemFilter stemmed = new GermanMinimalStemFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stemmed);
+    }
+  };
+
+  /** Test some examples from the paper */
+  public void testExamples() throws IOException {
+    // { input word, expected stem }
+    final String[][] examples = {
+      { "sängerinnen", "sangerin" },
+      { "frauen", "frau" },
+      { "kenntnisse", "kenntnis" },
+      { "staates", "staat" },
+      { "bilder", "bild" },
+      { "boote", "boot" },
+      { "götter", "gott" },
+      { "äpfel", "apfel" },
+    };
+    for (String[] example : examples) {
+      checkOneTerm(analyzer, example[0], example[1]);
+    }
+  }
+
+  /** Compares the filter output with a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(
+        analyzer,
+        getDataFile("deminimaltestdata.zip"),
+        "deminimal.txt");
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
index 0376ff5bebe..d7602aa47c9 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -17,17 +17,17 @@ package org.apache.lucene.analysis.de;
* limitations under the License.
*/
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-import java.io.StringReader;
+import java.io.InputStream;
+import java.io.Reader;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
* Test the German stemmer. The stemming algorithm is known to work less
@@ -38,25 +38,18 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
public void testStemming() throws Exception {
- Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
- TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer));
- // read test cases from external file:
- InputStreamReader isr = new InputStreamReader(getClass().getResourceAsStream("data.txt"), "iso-8859-1");
- BufferedReader breader = new BufferedReader(isr);
- while(true) {
- String line = breader.readLine();
- if (line == null)
- break;
- line = line.trim();
- if (line.startsWith("#") || line.equals(""))
- continue; // ignore comments and empty lines
- String[] parts = line.split(";");
- //System.out.println(parts[0] + " -- " + parts[1]);
- tokenizer.reset(new StringReader(parts[0]));
- filter.reset();
- assertTokenStreamContents(filter, new String[] { parts[1] });
- }
- breader.close();
- isr.close();
+    // each input is kept as one token, lowercased and then stemmed
+    Analyzer analyzer = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName,
+          Reader reader) {
+        Tokenizer t = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(t,
+            new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
+      }
+    };
+
+    // the test cases are read from the data.txt resource of this class
+    InputStream vocOut = getClass().getResourceAsStream("data.txt");
+    try {
+      assertVocabulary(analyzer, vocOut);
+    } finally {
+      // close the stream even when a vocabulary assertion fails
+      vocOut.close();
+    }
}
}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt
index 520c18a1df6..5b8ce5ffe31 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt
@@ -1,48 +1,48 @@
# German special characters are replaced:
-hufig;haufig
+häufig haufig
# here the stemmer works okay, it maps related words to the same stem:
-abschlieen;abschliess
-abschlieender;abschliess
-abschlieendes;abschliess
-abschlieenden;abschliess
+abschließen abschliess
+abschließender abschliess
+abschließendes abschliess
+abschließenden abschliess
-Tisch;tisch
-Tische;tisch
-Tischen;tisch
+Tisch tisch
+Tische tisch
+Tischen tisch
-Haus;hau
-Hauses;hau
-Huser;hau
-Husern;hau
+Haus hau
+Hauses hau
+Häuser hau
+Häusern hau
# here's a case where overstemming occurs, i.e. a word is
# mapped to the same stem as unrelated words:
-hauen;hau
+hauen hau
# here's a case where understemming occurs, i.e. two related words
# are not mapped to the same stem. This is the case with basically
# all irregular forms:
-Drama;drama
-Dramen;dram
+Drama drama
+Dramen dram
-# replace "" with 'ss':
-Ausma;ausmass
+# replace "ß" with 'ss':
+Ausmaß ausmass
# fake words to test if suffixes are cut off:
-xxxxxe;xxxxx
-xxxxxs;xxxxx
-xxxxxn;xxxxx
-xxxxxt;xxxxx
-xxxxxem;xxxxx
-xxxxxer;xxxxx
-xxxxxnd;xxxxx
+xxxxxe xxxxx
+xxxxxs xxxxx
+xxxxxn xxxxx
+xxxxxt xxxxx
+xxxxxem xxxxx
+xxxxxer xxxxx
+xxxxxnd xxxxx
# the suffixes are also removed when combined:
-xxxxxetende;xxxxx
+xxxxxetende xxxxx
# words that are shorter than four charcters are not changed:
-xxe;xxe
+xxe xxe
# -em and -er are not removed from words shorter than five characters:
-xxem;xxem
-xxer;xxer
+xxem xxem
+xxer xxer
# -nd is not removed from words shorter than six characters:
-xxxnd;xxxnd
+xxxnd xxxnd
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip
new file mode 100644
index 00000000000..400db0bd66e
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip
new file mode 100644
index 00000000000..d930327386c
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
new file mode 100644
index 00000000000..8ff0303b47d
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Checks {@link EnglishMinimalStemFilter} on hand-picked words.
+ */
+public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
+  // splits on whitespace and runs every token through the minimal stemmer
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String field, Reader input) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, input);
+      EnglishMinimalStemFilter stemmed = new EnglishMinimalStemFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stemmed);
+    }
+  };
+
+  /** Test some examples from various papers about this technique */
+  public void testExamples() throws IOException {
+    // { input word, expected stem }
+    final String[][] examples = {
+      { "queries", "query" },
+      { "phrases", "phrase" },
+      { "corpus", "corpus" },
+      { "stress", "stress" },
+      { "kings", "king" },
+      { "panels", "panel" },
+      { "aerodynamics", "aerodynamic" },
+      { "congress", "congress" },
+      { "serious", "serious" },
+    };
+    for (String[] example : examples) {
+      checkOneTerm(analyzer, example[0], example[1]);
+    }
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
index b3653ed4d7c..e34829a27bd 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java
@@ -17,21 +17,22 @@ package org.apache.lucene.analysis.en;
* limitations under the License.
*/
-import java.io.BufferedReader;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.Reader;
import java.io.StringReader;
-import java.util.zip.ZipFile;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
/**
* Test the PorterStemFilter with Martin Porter's test data.
*/
@@ -41,26 +42,16 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
* The output should be the same as the string in output.txt
*/
public void testPorterStemFilter() throws Exception {
- Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
- TokenStream filter = new PorterStemFilter(tokenizer);
- ZipFile zipFile = new ZipFile(getDataFile("porterTestData.zip"));
- InputStream voc = zipFile.getInputStream(zipFile.getEntry("voc.txt"));
- InputStream out = zipFile.getInputStream(zipFile.getEntry("output.txt"));
- BufferedReader vocReader = new BufferedReader(new InputStreamReader(
- voc, "UTF-8"));
- BufferedReader outputReader = new BufferedReader(new InputStreamReader(
- out, "UTF-8"));
- String inputWord = null;
- while ((inputWord = vocReader.readLine()) != null) {
- String expectedWord = outputReader.readLine();
- assertNotNull(expectedWord);
- tokenizer.reset(new StringReader(inputWord));
- filter.reset();
- assertTokenStreamContents(filter, new String[] { expectedWord });
- }
- vocReader.close();
- outputReader.close();
- zipFile.close();
+    // each input is kept as one token and handed to the Porter stemmer
+    Analyzer porterAnalyzer = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String field, Reader input) {
+        Tokenizer keywordTokenizer = new KeywordTokenizer(input);
+        PorterStemFilter stemmed = new PorterStemFilter(keywordTokenizer);
+        return new TokenStreamComponents(keywordTokenizer, stemmed);
+      }
+    };
+
+    assertVocabulary(
+        porterAnalyzer,
+        getDataFile("porterTestData.zip"),
+        "voc.txt",
+        "output.txt");
}
public void testWithKeywordAttribute() throws IOException {
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
new file mode 100644
index 00000000000..f494bd65725
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Checks {@link SpanishLightStemFilter} against recorded stemmer output.
+ */
+public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
+  // splits on whitespace and runs every token through the light stemmer
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String field, Reader input) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, input);
+      SpanishLightStemFilter stemmed = new SpanishLightStemFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stemmed);
+    }
+  };
+
+  /** Compares the filter output with a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(
+        analyzer,
+        getDataFile("eslighttestdata.zip"),
+        "eslight.txt");
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip
new file mode 100644
index 00000000000..0e88cf28d12
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
new file mode 100644
index 00000000000..d946a20ca53
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Checks {@link FinnishLightStemFilter} against recorded stemmer output.
+ */
+public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
+  // splits on whitespace and runs every token through the light stemmer
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String field, Reader input) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, input);
+      FinnishLightStemFilter stemmed = new FinnishLightStemFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stemmed);
+    }
+  };
+
+  /** Compares the filter output with a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(
+        analyzer,
+        getDataFile("filighttestdata.zip"),
+        "filight.txt");
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip
new file mode 100644
index 00000000000..5a85453a614
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
new file mode 100644
index 00000000000..ffe8d6c22cc
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
@@ -0,0 +1,162 @@
+package org.apache.lucene.analysis.fr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Checks {@link FrenchLightStemFilter} on hand-picked words and on
+ * recorded stemmer output.
+ */
+public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
+  // splits on whitespace and runs every token through the light stemmer
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String field, Reader input) {
+      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, input);
+      FrenchLightStemFilter stemmed = new FrenchLightStemFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stemmed);
+    }
+  };
+
+  /** Test some examples from the paper */
+  public void testExamples() throws IOException {
+    // { input word, expected stem }; related forms are grouped together
+    final String[][] examples = {
+      { "chevaux", "cheval" },
+      { "cheval", "cheval" },
+
+      { "hiboux", "hibou" },
+      { "hibou", "hibou" },
+
+      { "chantés", "chant" },
+      { "chanter", "chant" },
+      { "chante", "chant" },
+      { "chant", "chant" },
+
+      { "baronnes", "baron" },
+      { "barons", "baron" },
+      { "baron", "baron" },
+
+      { "peaux", "peau" },
+      { "peau", "peau" },
+
+      { "anneaux", "aneau" },
+      { "anneau", "aneau" },
+
+      { "neveux", "neveu" },
+      { "neveu", "neveu" },
+
+      { "affreux", "afreu" },
+      { "affreuse", "afreu" },
+
+      { "investissement", "investi" },
+      { "investir", "investi" },
+
+      { "assourdissant", "asourdi" },
+      { "assourdir", "asourdi" },
+
+      { "pratiquement", "pratiqu" },
+      { "pratique", "pratiqu" },
+
+      { "administrativement", "administratif" },
+      { "administratif", "administratif" },
+
+      { "justificatrice", "justifi" },
+      { "justificateur", "justifi" },
+      { "justifier", "justifi" },
+
+      { "educatrice", "eduqu" },
+      { "eduquer", "eduqu" },
+
+      { "communicateur", "comuniqu" },
+      { "communiquer", "comuniqu" },
+
+      { "accompagnatrice", "acompagn" },
+      { "accompagnateur", "acompagn" },
+
+      { "administrateur", "administr" },
+      { "administrer", "administr" },
+
+      { "productrice", "product" },
+      { "producteur", "product" },
+
+      { "acheteuse", "achet" },
+      { "acheteur", "achet" },
+
+      { "planteur", "plant" },
+      { "plante", "plant" },
+
+      { "poreuse", "poreu" },
+      { "poreux", "poreu" },
+
+      { "plieuse", "plieu" },
+
+      { "bijoutière", "bijouti" },
+      { "bijoutier", "bijouti" },
+
+      { "caissière", "caisi" },
+      { "caissier", "caisi" },
+
+      { "abrasive", "abrasif" },
+      { "abrasif", "abrasif" },
+
+      { "folle", "fou" },
+      { "fou", "fou" },
+
+      { "personnelle", "person" },
+      { "personne", "person" },
+
+      // algo bug: too short length
+      //{ "personnel", "person" },
+
+      { "complète", "complet" },
+      { "complet", "complet" },
+
+      { "aromatique", "aromat" },
+
+      { "faiblesse", "faibl" },
+      { "faible", "faibl" },
+
+      { "patinage", "patin" },
+      { "patin", "patin" },
+
+      { "sonorisation", "sono" },
+
+      { "ritualisation", "rituel" },
+      { "rituel", "rituel" },
+
+      // algo bug: masked by rules above
+      //{ "colonisateur", "colon" },
+
+      { "nomination", "nomin" },
+
+      { "disposition", "dispos" },
+      { "dispose", "dispos" },
+    };
+    for (String[] example : examples) {
+      checkOneTerm(analyzer, example[0], example[1]);
+    }
+  }
+
+  /** Compares the filter output with a vocabulary from the reference impl */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(
+        analyzer,
+        getDataFile("frlighttestdata.zip"),
+        "frlight.txt");
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
new file mode 100644
index 00000000000..b45c5323c82
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.fr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Tests for {@link FrenchMinimalStemFilter}: a few hand-picked single terms plus a vocabulary data file.
+ */
+public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() { // analyzer under test, shared by both test methods
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new FrenchMinimalStemFilter(source)); // stem filter wraps the tokenizer directly; no other filter in the chain
+ }
+ };
+
+ /** Test some examples from the paper; each call passes an input term and the single output term expected for it */
+ public void testExamples() throws IOException {
+ checkOneTerm(analyzer, "chevaux", "cheval"); // "-aux" and "-oux" inputs
+ checkOneTerm(analyzer, "hiboux", "hibou");
+
+ checkOneTerm(analyzer, "chantés", "chant"); // three forms expected to share the output "chant"
+ checkOneTerm(analyzer, "chanter", "chant");
+ checkOneTerm(analyzer, "chante", "chant");
+
+ checkOneTerm(analyzer, "baronnes", "baron"); // three forms expected to share the output "baron"
+ checkOneTerm(analyzer, "barons", "baron");
+ checkOneTerm(analyzer, "baron", "baron");
+ }
+
+ /** Test against a vocabulary from the reference impl: assertVocabulary gets the analyzer, frminimaltestdata.zip and the name "frminimal.txt" */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
+ }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frlighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frlighttestdata.zip
new file mode 100644
index 00000000000..a036b8a991d
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frlighttestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frminimaltestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frminimaltestdata.zip
new file mode 100644
index 00000000000..07dc4e1e550
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frminimaltestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
new file mode 100644
index 00000000000..e0eaf2a0921
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.hu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Tests for {@link HungarianLightStemFilter}; the only check here is the vocabulary data file.
+ */
+public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() { // analyzer under test
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new HungarianLightStemFilter(source)); // stem filter wraps the tokenizer directly; no other filter in the chain
+ }
+ };
+
+ /** Test against a vocabulary from the reference impl: assertVocabulary gets the analyzer, hulighttestdata.zip and the name "hulight.txt" */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
+ }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/hulighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/hulighttestdata.zip
new file mode 100644
index 00000000000..e334c6940e2
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/hulighttestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java
new file mode 100644
index 00000000000..b850630086c
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Tests for {@link ItalianLightStemFilter}; the only check here is the vocabulary data file.
+ */
+public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() { // analyzer under test
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new ItalianLightStemFilter(source)); // stem filter wraps the tokenizer directly; no other filter in the chain
+ }
+ };
+
+ /** Test against a vocabulary from the reference impl: assertVocabulary gets the analyzer, itlighttestdata.zip and the name "itlight.txt" */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("itlighttestdata.zip"), "itlight.txt");
+ }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/itlighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/itlighttestdata.zip
new file mode 100644
index 00000000000..f390507ff15
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/itlighttestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
new file mode 100644
index 00000000000..6f5fdcf148d
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
@@ -0,0 +1,95 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Tests for {@link PortugueseLightStemFilter}: a full sentence, single-term examples and a vocabulary data file.
+ */
+public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() { // analyzer under test, shared by all test methods
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source); // lowercase before stemming
+ return new TokenStreamComponents(source, new PortugueseLightStemFilter(result)); // chain: StandardTokenizer -> LowerCaseFilter -> PortugueseLightStemFilter
+ }
+ };
+
+ /**
+ * Test the example from the paper "Assessing the impact of stemming accuracy
+ * on information retrieval": one sentence in, the full expected token list out.
+ */
+ public void testExamples() throws IOException {
+ assertAnalyzesTo(
+ analyzer,
+ "O debate político, pelo menos o que vem a público, parece, de modo nada "
+ + "surpreendente, restrito a temas menores. Mas há, evidentemente, "
+ + "grandes questões em jogo nas eleições que se aproximam.",
+ new String[] { // expected tokens: lowercase, no punctuation, most words shortened (e.g. "político" -> "politic")
+ "o", "debat", "politic", "pelo", "meno", "o", "que", "vem", "a",
+ "public", "parec", "de", "modo", "nada", "surpreendent", "restrit",
+ "a", "tema", "menor", "mas", "há", "evident", "grand", "questa",
+ "em", "jogo", "nas", "eleica", "que", "se", "aproximam"
+ });
+ }
+
+ /**
+ * Test examples from the c implementation; each pair feeds two forms that must yield the same output term.
+ */
+ public void testMoreExamples() throws IOException {
+ checkOneTerm(analyzer, "doutores", "doutor");
+ checkOneTerm(analyzer, "doutor", "doutor");
+
+ checkOneTerm(analyzer, "homens", "homem");
+ checkOneTerm(analyzer, "homem", "homem");
+
+ checkOneTerm(analyzer, "papéis", "papel");
+ checkOneTerm(analyzer, "papel", "papel");
+
+ checkOneTerm(analyzer, "normais", "normal");
+ checkOneTerm(analyzer, "normal", "normal");
+
+ checkOneTerm(analyzer, "lencóis", "lencol");
+ checkOneTerm(analyzer, "lencol", "lencol");
+
+ checkOneTerm(analyzer, "barris", "barril");
+ checkOneTerm(analyzer, "barril", "barril");
+
+ checkOneTerm(analyzer, "botões", "bota");
+ checkOneTerm(analyzer, "botão", "bota");
+ }
+
+ /** Test against a vocabulary from the reference impl: assertVocabulary gets the analyzer, ptlighttestdata.zip and the name "ptlight.txt" */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
+ }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
new file mode 100644
index 00000000000..64a2dd7ac51
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Tests for {@link PortugueseMinimalStemFilter}: a full sentence plus a vocabulary data file.
+ */
+public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() { // analyzer under test, shared by both test methods
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source); // lowercase before stemming
+ return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(result)); // chain: StandardTokenizer -> LowerCaseFilter -> PortugueseMinimalStemFilter
+ }
+ };
+
+ /**
+ * Test the example from the paper "Assessing the impact of stemming accuracy
+ * on information retrieval": one sentence in, the full expected token list out.
+ */
+ public void testExamples() throws IOException {
+ assertAnalyzesTo(
+ analyzer,
+ "O debate político, pelo menos o que vem a público, parece, de modo nada "
+ + "surpreendente, restrito a temas menores. Mas há, evidentemente, "
+ + "grandes questões em jogo nas eleições que se aproximam.",
+ new String[] { // expected tokens: lowercase, no punctuation; only a few words differ from the input (e.g. "temas" -> "tema")
+ "o", "debate", "político", "pelo", "menos", "o", "que", "vem", "a",
+ "público", "parece", "de", "modo", "nada", "surpreendente", "restrito",
+ "a", "tema", "menor", "mas", "há", "evidentemente", "grande", "questão",
+ "em", "jogo", "na", "eleição", "que", "se", "aproximam"
+ });
+ }
+
+ /** Test against a vocabulary from the reference impl: assertVocabulary gets the analyzer, ptminimaltestdata.zip and the name "ptminimal.txt" */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
+ }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptlighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptlighttestdata.zip
new file mode 100644
index 00000000000..eca9a46ff9d
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptlighttestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptminimaltestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptminimaltestdata.zip
new file mode 100644
index 00000000000..4169fa26b5e
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptminimaltestdata.zip differ
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
new file mode 100644
index 00000000000..b524d2a62a4
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
@@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.ru;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
+
+/**
+ * Tests for {@link RussianLightStemFilter}; the only check here is the vocabulary data file.
+ */
+public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() { // analyzer under test
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(source, new RussianLightStemFilter(source)); // stem filter wraps the tokenizer directly; no other filter in the chain
+ }
+ };
+
+ /** Test against a vocabulary from the reference impl: assertVocabulary gets the analyzer, rulighttestdata.zip and the name "rulight.txt" */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
+ }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
index 632391e88ea..0688f6d0d8f 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
@@ -17,71 +17,35 @@ package org.apache.lucene.analysis.ru;
* limitations under the License.
*/
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.LuceneTestCase;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.InputStreamReader;
-import java.io.FileInputStream;
-import java.util.ArrayList;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
* @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
*/
@Deprecated
-public class TestRussianStem extends LuceneTestCase
-{
- private ArrayList