diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index c87674ad77a..fbb69265015 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -187,6 +187,9 @@ New features * LUCENE-2464: FastVectorHighlighter: add SingleFragListBuilder to return entire field contents. (Koji Sekiguchi) + * LUCENE-2503: Added lighter stemming alternatives for European languages. + (Robert Muir) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/modules/analysis/NOTICE.txt b/modules/analysis/NOTICE.txt index 6abde9313c7..35aadb73f73 100644 --- a/modules/analysis/NOTICE.txt +++ b/modules/analysis/NOTICE.txt @@ -17,30 +17,29 @@ were developed by Martin Porter and Richard Boulton. The full snowball package is available from http://snowball.tartarus.org/ -The Arabic stemmer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt. +The Arabic, Persian, Romanian, Bulgarian, and Hindi analyzers (common) come with a default +stopword list that is BSD-licensed, created by Jacques Savoy. These files reside in: +common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt. See http://members.unine.ch/jacques.savoy/clef/index.html. -The Persian analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. - -The Romanian analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. 
The file resides in -common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. - -The Bulgarian analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. - -The Hindi analyzer (common) comes with a default -stopword list that is BSD-licensed created by Jacques Savoy. The file resides in -common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt. -See http://members.unine.ch/jacques.savoy/clef/index.html. +The German, Spanish, Finnish, French, Hungarian, Italian, Portuguese, Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java The Stempel analyzer (stempel) includes BSD-licensed software developed by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java 
b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java index eaf89d259e3..c56617643b8 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ar; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Normalizer for Arabic. *

@@ -96,20 +98,4 @@ public class ArabicNormalizer { return len; } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java index 098efa04077..c52916e76ab 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java @@ -1,4 +1,6 @@ package org.apache.lucene.analysis.ar; + + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -16,6 +18,8 @@ package org.apache.lucene.analysis.ar; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Stemmer for Arabic. *

@@ -86,7 +90,7 @@ public class ArabicStemmer { */ public int stemPrefix(char s[], int len) { for (int i = 0; i < prefixes.length; i++) - if (startsWith(s, len, prefixes[i])) + if (startsWithCheckLength(s, len, prefixes[i])) return deleteN(s, 0, len, prefixes[i].length); return len; } @@ -99,7 +103,7 @@ public class ArabicStemmer { */ public int stemSuffix(char s[], int len) { for (int i = 0; i < suffixes.length; i++) - if (endsWith(s, len, suffixes[i])) + if (endsWithCheckLength(s, len, suffixes[i])) len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length); return len; } @@ -111,7 +115,7 @@ public class ArabicStemmer { * @param prefix prefix to check * @return true if the prefix matches and can be stemmed */ - boolean startsWith(char s[], int len, char prefix[]) { + boolean startsWithCheckLength(char s[], int len, char prefix[]) { if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters return false; } else if (len < prefix.length + 2) { // other prefixes require only 2. 
@@ -132,7 +136,7 @@ public class ArabicStemmer { * @param suffix suffix to check * @return true if the suffix matches and can be stemmed */ - boolean endsWith(char s[], int len, char suffix[]) { + boolean endsWithCheckLength(char s[], int len, char suffix[]) { if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming return false; } else { @@ -142,37 +146,5 @@ public class ArabicStemmer { return true; } - } - - - /** - * Delete n characters in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len Length of input buffer - * @param nChars number of characters to delete - * @return length of input buffer after deletion - */ - protected int deleteN(char s[], int pos, int len, int nChars) { - for (int i = 0; i < nChars; i++) - len = delete(s, pos, len); - return len; - } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } - + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java index 1114517cf93..2aa23cda03c 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.bg; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Light Stemmer for Bulgarian. *

@@ -138,15 +140,4 @@ public class BulgarianStemmer { return len; } - - private boolean endsWith(final char s[], final int len, final String suffix) { - final int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len -(suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java index 66c42971cf4..32980cc9056 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Light Stemmer for Czech. *

@@ -166,16 +168,4 @@ public class CzechStemmer { return len; } - - private boolean endsWith(char s[], int len, String suffix) { - int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len - (suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilter.java new file mode 100644 index 00000000000..a949a7d2010 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link GermanLightStemmer} to stem German + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class GermanLightStemFilter extends TokenFilter { + private final GermanLightStemmer stemmer = new GermanLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public GermanLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java new file mode 100644 index 00000000000..04d8b5858d3 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java @@ -0,0 +1,138 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Light Stemmer for German. + *

+ * This stemmer implements the "UniNE" algorithm in: + * Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages + * Jacques Savoy + */ +public class GermanLightStemmer { + + public int stem(char s[], int len) { + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'ä': + case 'à': + case 'á': + case 'â': s[i] = 'a'; break; + case 'ö': + case 'ò': + case 'ó': + case 'ô': s[i] = 'o'; break; + case 'ï': + case 'ì': + case 'í': + case 'î': s[i] = 'i'; break; + case 'ü': + case 'ù': + case 'ú': + case 'û': s[i] = 'u'; break; + } + + len = step1(s, len); + return step2(s, len); + } + + private boolean stEnding(char ch) { + switch(ch) { + case 'b': + case 'd': + case 'f': + case 'g': + case 'h': + case 'k': + case 'l': + case 'm': + case 'n': + case 't': return true; + default: return false; + } + } + + private int step1(char s[], int len) { + if (len > 5 && s[len-3] == 'e' && s[len-2] == 'r' && s[len-1] == 'n') + return len - 3; + + if (len > 4 && s[len-2] == 'e') + switch(s[len-1]) { + case 'm': + case 'n': + case 'r': + case 's': return len - 2; + } + + if (len > 3 && s[len-1] == 'e') + return len - 1; + + if (len > 3 && s[len-1] == 's' && stEnding(s[len-2])) + return len - 1; + + return len; + } + + private int step2(char s[], int len) { + if (len > 5 && s[len-3] == 'e' && s[len-2] == 's' && s[len-1] == 't') + return len - 3; + + if (len > 4 && s[len-2] == 'e' && (s[len-1] == 'r' || s[len-1] == 'n')) + return len - 2; + + if (len > 4 && s[len-2] == 's' && s[len-1] == 't' && stEnding(s[len-3])) + return len - 2; + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilter.java new file mode 100644 index 00000000000..9a291c3a79d --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilter.java @@ -0,0 +1,58 @@ +package 
org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link GermanMinimalStemmer} to stem German + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class GermanMinimalStemFilter extends TokenFilter { + private final GermanMinimalStemmer stemmer = new GermanMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public GermanMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java new file mode 100644 index 00000000000..52fc4a04d1b --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java @@ -0,0 +1,95 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Minimal Stemmer for German. + *

+ * This stemmer implements the following algorithm: + * Morphologie et recherche d'information + * Jacques Savoy. + */ +public class GermanMinimalStemmer { + + public int stem(char s[], int len) { + if (len < 5) + return len; + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'ä': s[i] = 'a'; break; + case 'ö': s[i] = 'o'; break; + case 'ü': s[i] = 'u'; break; + } + + if (len > 6 && s[len-3] == 'n' && s[len-2] == 'e' && s[len-1] == 'n') + return len - 3; + + if (len > 5) + switch(s[len-1]) { + case 'n': if (s[len-2] == 'e') return len - 2; else break; + case 'e': if (s[len-2] == 's') return len - 2; else break; + case 's': if (s[len-2] == 'e') return len - 2; else break; + case 'r': if (s[len-2] == 'e') return len - 2; else break; + } + + switch(s[len-1]) { + case 'n': + case 'e': + case 's': + case 'r': return len - 1; + } + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilter.java new file mode 100644 index 00000000000..fede035487a --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.en; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link EnglishMinimalStemmer} to stem + * English words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class EnglishMinimalStemFilter extends TokenFilter { + private final EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public EnglishMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemmer.java new file mode 100644 index 00000000000..aff2e9f5a6b --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemmer.java @@ -0,0 +1,45 @@ +package org.apache.lucene.analysis.en; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Minimal plural stemmer for English. + *

+ * This stemmer implements the "S-Stemmer" from + * How Effective Is Suffixing? + * Donna Harman. + */ +public class EnglishMinimalStemmer { + public int stem(char s[], int len) { + if (len < 3 || s[len-1] != 's') + return len; + + switch(s[len-2]) { + case 'u': + case 's': return len; + case 'e': + if (len > 3 && s[len-3] == 'i' && s[len-4] != 'a' && s[len-4] != 'e') { + s[len - 3] = 'y'; + return len - 2; + } + if (s[len-3] == 'i' || s[len-3] == 'a' || s[len-3] == 'o' || s[len-3] == 'e') + return len; + default: return len - 1; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilter.java new file mode 100644 index 00000000000..79ade5c16f8 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.es; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link SpanishLightStemmer} to stem Spanish + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class SpanishLightStemFilter extends TokenFilter { + private final SpanishLightStemmer stemmer = new SpanishLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public SpanishLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java new file mode 100644 index 00000000000..382faa841a4 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java @@ -0,0 +1,109 @@ +package org.apache.lucene.analysis.es; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Light Stemmer for Spanish + *

+ * This stemmer implements the algorithm described in: + * Report on CLEF-2001 Experiments + * Jacques Savoy + */ +public class SpanishLightStemmer { + + public int stem(char s[], int len) { + if (len < 5) + return len; + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'à': + case 'á': + case 'â': + case 'ä': s[i] = 'a'; break; + case 'ò': + case 'ó': + case 'ô': + case 'ö': s[i] = 'o'; break; + case 'è': + case 'é': + case 'ê': + case 'ë': s[i] = 'e'; break; + case 'ù': + case 'ú': + case 'û': + case 'ü': s[i] = 'u'; break; + case 'ì': + case 'í': + case 'î': + case 'ï': s[i] = 'i'; break; + } + + switch(s[len-1]) { + case 'o': + case 'a': + case 'e': return len - 1; + case 's': + if (s[len-2] == 'e' && s[len-3] == 's' && s[len-4] == 'e') + return len-2; + if (s[len-2] == 'e' && s[len-3] == 'c') { + s[len-3] = 'z'; + return len - 2; + } + if (s[len-2] == 'o' || s[len-2] == 'a' || s[len-2] == 'e') + return len - 2; + } + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java index 8798b4e3ee0..68407d2dc9e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.fa; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Normalizer for Persian. *

@@ -82,20 +84,4 @@ public class PersianNormalizer { return len; } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } - } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilter.java new file mode 100644 index 00000000000..12f58b97cfd --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.fi; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link FinnishLightStemmer} to stem Finnish + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class FinnishLightStemFilter extends TokenFilter { + private final FinnishLightStemmer stemmer = new FinnishLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public FinnishLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java new file mode 100644 index 00000000000..ae43caaeb12 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java @@ -0,0 +1,259 @@ +package org.apache.lucene.analysis.fi; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Finnish. + *

/**
 * Light Stemmer for Finnish.
 * <p>
 * This stemmer implements the algorithm described in:
 * <i>Report on CLEF-2003 Monolingual Tracks</i>
 * Jacques Savoy
 * <p>
 * All work is done in place on the caller's buffer. {@code endsWith} and
 * {@code delete} are the statically imported {@code StemmerUtil} helpers.
 */
public class FinnishLightStemmer {
  
  /**
   * Stems the first {@code len} characters of {@code s} in place.
   * <p>
   * Terms shorter than 4 characters are returned untouched. Otherwise the letters
   * a-umlaut / a-ring are folded to {@code a} and o-umlaut to {@code o}, and then
   * {@code step1}, {@code step2}, {@code step3}, {@code norm1} and {@code norm2}
   * are applied in that order, each one receiving the length produced by the
   * previous one.
   *
   * @param s   term buffer, modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the stemmed term
   */
  public int stem(char s[], int len) {
    if (len < 4)
      return len;
    
    // fold the non-ASCII vowels before any suffix matching
    for (int i = 0; i < len; i++)
      switch(s[i]) {
        case 'ä':
        case 'å': s[i] = 'a'; break;
        case 'ö': s[i] = 'o'; break;
      }
    
    len = step1(s, len);
    len = step2(s, len);
    len = step3(s, len);
    len = norm1(s, len);
    len = norm2(s, len);
    return len;
  }
  
  /**
   * Strips a trailing "kin" or "ko" (terms longer than 8 chars), repeating on the
   * shortened term, and then a trailing "dellinen" / "dellisuus" (terms longer
   * than 11 chars).
   */
  private int step1(char s[], int len) {
    if (len > 8) {
      // recursion: several of these endings may be stacked on one word
      if (endsWith(s, len, "kin"))
        return step1(s, len-3);
      if (endsWith(s, len, "ko"))
        return step1(s, len-2);
    }
    
    if (len > 11) {
      if (endsWith(s, len, "dellinen"))
        return len-8;
      if (endsWith(s, len, "dellisuus"))
        return len-9;
    }
    return len;
  }
  
  /**
   * For terms longer than 5 chars: removes "lla", "tse", "sti" or "ni", or shortens
   * a final "aa" to "a". Only the first matching rule is applied.
   */
  private int step2(char s[], int len) {
    if (len > 5) {
      if (endsWith(s, len, "lla")
          || endsWith(s, len, "tse")
          || endsWith(s, len, "sti"))
        return len-3;
      
      if (endsWith(s, len, "ni"))
        return len-2;
      
      if (endsWith(s, len, "aa"))
        return len-1; // aa -> a
    }
    
    return len;
  }
  
  /**
   * Main suffix-removal step. Rules are grouped by the minimum term length they
   * require (8, 6, 5, 4) and tried from the longest group down; the first rule
   * that matches decides the result.
   */
  private int step3(char s[], int len) {
    if (len > 8) {
      if (endsWith(s, len, "nnen")) {
        // nnen -> s
        s[len-4] = 's';
        return len-3;
      }
      
      if (endsWith(s, len, "ntena")) {
        // ntena -> s
        s[len-5] = 's';
        return len-4;
      }
      
      if (endsWith(s, len, "tten"))
        return len-4;
      
      if (endsWith(s, len, "eiden"))
        return len-5;
    }
    
    if (len > 6) {
      if (endsWith(s, len, "neen")
          || endsWith(s, len, "niin")
          || endsWith(s, len, "seen")
          || endsWith(s, len, "teen")
          || endsWith(s, len, "inen"))
          return len-4;
      
      // h + vowel + n
      if (s[len-3] == 'h' && isVowel(s[len-2]) && s[len-1] == 'n')
        return len-3;
      
      if (endsWith(s, len, "den")) {
        // den -> s
        s[len-3] = 's';
        return len-2;
      }
      
      if (endsWith(s, len, "ksen")) {
        // ksen -> s
        s[len-4] = 's';
        return len-3;
      }
      
      if (endsWith(s, len, "ssa")
          || endsWith(s, len, "sta")
          || endsWith(s, len, "lla")
          || endsWith(s, len, "lta")
          || endsWith(s, len, "tta")
          || endsWith(s, len, "ksi")
          || endsWith(s, len, "lle"))
        return len-3;
    }
    
    if (len > 5) {
      if (endsWith(s, len, "na")
          || endsWith(s, len, "ne"))
        return len-2;
      
      if (endsWith(s, len, "nei"))
        return len-3;
    }
    
    if (len > 4) {
      if (endsWith(s, len, "ja")
          || endsWith(s, len, "ta"))
        return len-2;
      
      if (s[len-1] == 'a')
        return len-1;
      
      // vowel + n is removed as a pair, a bare final n on its own
      if (s[len-1] == 'n' && isVowel(s[len-2]))
        return len-2;
      
      if (s[len-1] == 'n')
        return len-1;
    }
    
    return len;
  }
  
  /**
   * First normalization pass: rewrites a final "hde" to "ksi" (no return, so the
   * rules below still see the rewritten ending), then drops "ei" / "at", or a
   * single final t, s, j, e, a or i.
   */
  private int norm1(char s[], int len) {
    if (len > 5 && endsWith(s, len, "hde")) {
        s[len-3] = 'k';
        s[len-2] = 's';
        s[len-1] = 'i';
    }
    
    if (len > 4) {
      if (endsWith(s, len, "ei") || endsWith(s, len, "at"))
        return len-2;
    }
    
    if (len > 3)
      switch(s[len-1]) {
        case 't':
        case 's':
        case 'j':
        case 'e':
        case 'a':
        case 'i': return len-1;
      }
    
    return len;
  }
  
  /**
   * Second normalization pass: drops a final e/o/u on long terms (more than 8
   * chars), then a final i, and finally collapses runs of identical k, p or t
   * into a single letter.
   */
  private int norm2(char s[], int len) {
    if (len > 8) {
      if (s[len-1] == 'e'
          || s[len-1] == 'o'
          || s[len-1] == 'u')
        len--;
    }
    
    if (len > 4) {
      if (s[len-1] == 'i')
        len--;
      
      if (len > 4) {
        // ch tracks the previous surviving character
        char ch = s[0];
        for (int i = 1; i < len; i++) {
          if (s[i] == ch &&
              (ch == 'k' || ch == 'p' || ch == 't'))
            // i-- so the character shifted into slot i is examined next
            len = delete(s, i--, len);
          else
            ch = s[i];
        }
      }
    }
    
    return len;
  }
  
  /** Returns true for a, e, i, o, u and y (accented vowels were folded in {@code stem}). */
  private boolean isVowel(char ch) {
    switch(ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
      case 'y': return true;
      default: return false;
    }
  }
}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link FrenchLightStemmer} to stem French + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class FrenchLightStemFilter extends TokenFilter { + private final FrenchLightStemmer stemmer = new FrenchLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public FrenchLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java new file mode 100644 index 00000000000..43e2e0625d1 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java @@ -0,0 +1,267 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for French. + *

/**
 * Light Stemmer for French.
 * <p>
 * This stemmer implements the "UniNE" algorithm in:
 * <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
 * Jacques Savoy
 * <p>
 * All work is done in place on the caller's buffer. {@code endsWith} and
 * {@code delete} are the statically imported {@code StemmerUtil} helpers.
 */
public class FrenchLightStemmer {
  
  /**
   * Stems the first {@code len} characters of {@code s} in place.
   * <p>
   * The method is a cascade of suffix rules, each guarded by a minimum length.
   * Most rules rewrite the ending and return through {@link #norm(char[], int)}
   * immediately; a few ("trice", the e-grave + "te" ending, "ique") only rewrite
   * and deliberately fall through so that later rules see the rewritten term.
   * Statement order is therefore significant.
   *
   * @param s   term buffer, modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the stemmed term
   */
  public int stem(char s[], int len) {
    // plural in -x; -aux becomes -al unless preceded by 'e' (i.e. not -eaux)
    if (len > 5 && s[len-1] == 'x') {
      if (s[len-3] == 'a' && s[len-2] == 'u' && s[len-4] != 'e')
        s[len-2] = 'l';
      len--;
    }
    
    if (len > 3 && s[len-1] == 'x')
      len--;
    
    // plural -s
    if (len > 3 && s[len-1] == 's')
      len--;
    
    // issement -> ir
    if (len > 9 && endsWith(s, len, "issement")) {
      len -= 6;
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // issant -> ir
    if (len > 8 && endsWith(s, len, "issant")) {
      len -= 4;
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // ement -> e, and a resulting ive -> if
    if (len > 6 && endsWith(s, len, "ement")) {
      len -= 4;
      if (len > 3 && endsWith(s, len, "ive")) {
        len--;
        s[len-1] = 'f';
      }
      return norm(s, len);
    }
    
    // ficatrice -> fier
    if (len > 11 && endsWith(s, len, "ficatrice")) {
      len -= 5;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // ficateur -> fier
    if (len > 10 && endsWith(s, len, "ficateur")) {
      len -= 4;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // catrice -> quer
    if (len > 9 && endsWith(s, len, "catrice")) {
      len -= 3;
      s[len-4] = 'q';
      s[len-3] = 'u';
      s[len-2] = 'e';
      //s[len-1] = 'r' <-- unnecessary, already 'r'.
      return norm(s, len);
    }
    
    // cateur -> quer
    if (len > 8 && endsWith(s, len, "cateur")) {
      len -= 2;
      s[len-4] = 'q';
      s[len-3] = 'u';
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // atrice -> er
    if (len > 8 && endsWith(s, len, "atrice")) {
      len -= 4;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // ateur -> er
    if (len > 7 && endsWith(s, len, "ateur")) {
      len -= 3;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // trice -> teur; no return here, so the "teur" rule below can still fire
    if (len > 6 && endsWith(s, len, "trice")) {
      len--;
      s[len-3] = 'e';
      s[len-2] = 'u';
      s[len-1] = 'r';
    }
    
    // ordinal ending (i + e-grave + me) is removed entirely
    if (len > 5 && endsWith(s, len, "ième"))
      return norm(s, len-4);
    
    // teuse -> ter
    if (len > 7 && endsWith(s, len, "teuse")) {
      len -= 2;
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // teur -> ter
    if (len > 6 && endsWith(s, len, "teur")) {
      len--;
      s[len-1] = 'r';
      return norm(s, len);
    }
    
    // euse -> eu
    if (len > 5 && endsWith(s, len, "euse"))
      return norm(s, len-2);
    
    // e-grave + "re" -> er
    if (len > 8 && endsWith(s, len, "ère")) {
      len--;
      s[len-2] = 'e';
      return norm(s, len);
    }
    
    // ive -> if
    if (len > 7 && endsWith(s, len, "ive")) {
      len--;
      s[len-1] = 'f';
      return norm(s, len);
    }
    
    // folle -> fou, molle -> mou
    if (len > 4 && 
        (endsWith(s, len, "folle") ||
         endsWith(s, len, "molle"))) {
      len -= 2;
      s[len-1] = 'u';
      return norm(s, len);
    }
    
    // nnelle -> n
    if (len > 9 && endsWith(s, len, "nnelle"))
      return norm(s, len-5);
    
    // nnel -> n
    if (len > 9 && endsWith(s, len, "nnel"))
      return norm(s, len-3);
    
    // e-grave + "te" -> et; falls through to the remaining rules
    if (len > 4 && endsWith(s, len, "ète")) {
      len--;
      s[len-2] = 'e';
    }
    
    // ique is removed; falls through to the remaining rules
    if (len > 8 && endsWith(s, len, "ique"))
      len -= 4;
    
    // esse -> e
    if (len > 8 && endsWith(s, len, "esse"))
      return norm(s, len-3);
    
    // inage -> in
    if (len > 7 && endsWith(s, len, "inage"))
      return norm(s, len-3);
    
    // isation is removed; a remaining ual becomes uel
    if (len > 9 && endsWith(s, len, "isation")) {
      len -= 7;
      if (len > 5 && endsWith(s, len, "ual"))
        s[len-2] = 'e';
      return norm(s, len);
    }
    
    if (len > 9 && endsWith(s, len, "isateur"))
      return norm(s, len-7);
    
    if (len > 8 && endsWith(s, len, "ation"))
      return norm(s, len-5);
    
    if (len > 8 && endsWith(s, len, "ition"))
      return norm(s, len-5);
    
    return norm(s, len);
  }
  
  /**
   * Final normalization applied to every stem.
   * <p>
   * For terms longer than 4 chars: folds accented letters to their plain form and
   * collapses every run of identical consecutive characters to one. Then, each
   * still guarded by {@code len > 4}: removes a final "ie", and finally strips a
   * trailing r, up to two trailing e's, and one letter of a doubled final character.
   *
   * @param s   term buffer, modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the normalized term
   */
  private int norm(char s[], int len) {
    if (len > 4) {
      for (int i = 0; i < len; i++)
        switch(s[i]) {
          case 'à':
          case 'á':
          case 'â': s[i] = 'a'; break;
          case 'ô': s[i] = 'o'; break;
          case 'è':
          case 'é':
          case 'ê': s[i] = 'e'; break;
          case 'ù':
          case 'û': s[i] = 'u'; break;
          case 'î': s[i] = 'i'; break;
          case 'ç': s[i] = 'c'; break;
        }
      
      // ch tracks the previous surviving character
      char ch = s[0];
      for (int i = 1; i < len; i++) {
        if (s[i] == ch)
          // i-- so the character shifted into slot i is examined next
          len = delete(s, i--, len);
        else
          ch = s[i];
      }
    }

    if (len > 4 && endsWith(s, len, "ie"))
      len -= 2;
    
    if (len > 4) {
        // the four checks are sequential: each sees the length left by the previous one
        if (s[len-1] == 'r') len--;
        if (s[len-1] == 'e') len--;
        if (s[len-1] == 'e') len--;
        if (s[len-1] == s[len-2]) len--;
    }
    return len;
  }
}
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link FrenchMinimalStemmer} to stem French + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class FrenchMinimalStemFilter extends TokenFilter { + private final FrenchMinimalStemmer stemmer = new FrenchMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public FrenchMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java new file mode 100644 index 00000000000..788231965b4 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java @@ -0,0 +1,80 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Minimal Stemmer for French. + *

+ * This stemmer implements the following algorithm: + * A Stemming procedure and stopword list for general French corpora. + * Jacques Savoy. + */ +public class FrenchMinimalStemmer { + public int stem(char s[], int len) { + if (len < 6) + return len; + + if (s[len-1] == 'x') { + if (s[len-3] == 'a' && s[len-2] == 'u') + s[len-2] = 'l'; + return len - 1; + } + + if (s[len-1] == 's') len--; + if (s[len-1] == 'r') len--; + if (s[len-1] == 'e') len--; + if (s[len-1] == 'é') len--; + if (s[len-1] == s[len-2]) len--; + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java index 4c614448c19..245afd93f8e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hi; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Normalizer for Hindi. *

@@ -176,19 +178,4 @@ public class HindiNormalizer { return len; } - - /** - * Delete a character in-place - * - * @param s Input Buffer - * @param pos Position of character to delete - * @param len length of input buffer - * @return length of input buffer after deletion - */ - protected int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java index 255ffa2cd4e..68ef2ccf51f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hi; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Light Stemmer for Hindi. *

@@ -116,15 +118,4 @@ public class HindiStemmer { return len - 1; return len; } - - private boolean endsWith(final char s[], final int len, final String suffix) { - final int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len -(suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilter.java new file mode 100644 index 00000000000..f3f06fbbd2f --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.hu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link HungarianLightStemmer} to stem + * Hungarian words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class HungarianLightStemFilter extends TokenFilter { + private final HungarianLightStemmer stemmer = new HungarianLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public HungarianLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java new file mode 100644 index 00000000000..31b5e6fbad3 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java @@ -0,0 +1,238 @@ +package org.apache.lucene.analysis.hu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Hungarian. + *

/**
 * Light Stemmer for Hungarian.
 * <p>
 * This stemmer implements the "UniNE" algorithm in:
 * <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
 * Jacques Savoy
 * <p>
 * All work is done in place on the caller's buffer. {@code endsWith} is the
 * statically imported {@code StemmerUtil} helper.
 */
public class HungarianLightStemmer {
  /**
   * Stems the first {@code len} characters of {@code s} in place.
   * <p>
   * Accented vowels are folded to a, e, i, o, u first (for terms of any length;
   * there is no minimum-length guard here), then case endings, possessive endings
   * and the plural marker are removed in that order, and the result is passed
   * through {@code normalize}.
   *
   * @param s   term buffer, modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the stemmed term
   */
  public int stem(char s[], int len) {
    for (int i = 0; i < len; i++)
      switch(s[i]) {
        case 'á': s[i] = 'a'; break;
        case 'ë':
        case 'é': s[i] = 'e'; break;
        case 'í': s[i] = 'i'; break;
        case 'ó':
        case 'ő':
        case 'õ':
        case 'ö': s[i] = 'o'; break;
        case 'ú':
        case 'ű':
        case 'ũ':
        case 'û':
        case 'ü': s[i] = 'u'; break;
      }
    
    len = removeCase(s, len);
    len = removePossessive(s, len);
    len = removePlural(s, len);
    return normalize(s, len);
  }
  
  /**
   * Removes one case ending. Rules are grouped by the minimum term length they
   * require (6, 5, 4) and tried longest first; the first match wins.
   */
  private int removeCase(char s[], int len) {
    if (len > 6 && endsWith(s, len, "kent"))
      return len - 4;
    
    if (len > 5) {
      if (endsWith(s, len, "nak") ||
          endsWith(s, len, "nek") ||
          endsWith(s, len, "val") ||
          endsWith(s, len, "vel") ||
          endsWith(s, len, "ert") ||
          endsWith(s, len, "rol") ||
          endsWith(s, len, "ban") ||
          endsWith(s, len, "ben") ||
          endsWith(s, len, "bol") ||
          endsWith(s, len, "nal") ||
          endsWith(s, len, "nel") ||
          endsWith(s, len, "hoz") ||
          endsWith(s, len, "hez") ||
          endsWith(s, len, "tol"))
        return len - 3;
      
      // doubled consonant + al/el: drop the ending and one of the two consonants
      if (endsWith(s, len, "al") || endsWith(s, len, "el")) {
        if (!isVowel(s[len-3]) && s[len-3] == s[len-4])
          return len - 3;
      }
    }
    
    if (len > 4) {
      if (endsWith(s, len, "at") ||
          endsWith(s, len, "et") ||
          endsWith(s, len, "ot") ||
          endsWith(s, len, "va") ||
          endsWith(s, len, "ve") ||
          endsWith(s, len, "ra") ||
          endsWith(s, len, "re") ||
          endsWith(s, len, "ba") ||
          endsWith(s, len, "be") ||
          endsWith(s, len, "ul") ||
          endsWith(s, len, "ig"))
        return len - 2;
      
      // on/en only after a consonant
      if ((endsWith(s, len, "on") || endsWith(s, len, "en")) && !isVowel(s[len-3]))
          return len - 2;
      
      switch(s[len-1]) {
        case 't':
        case 'n': return len - 1;
        case 'a':
        // doubled consonant + a/e: drop the vowel and one of the two consonants
        case 'e': if (s[len-2] == s[len-3] && !isVowel(s[len-2])) return len - 2;
      }
    }
    
    return len;
  }

  /**
   * Removes one possessive ending. Rules are grouped by the minimum term length
   * they require (6, 5, 4, 3); several depend on whether the character in front
   * of the ending is a vowel. The first match wins.
   */
  private int removePossessive(char s[], int len) {
    if (len > 6) {
      // consonant + atok/otok/etek
      if (!isVowel(s[len-5]) && 
         (endsWith(s, len, "atok") || 
          endsWith(s, len, "otok") || 
          endsWith(s, len, "etek")))
        return len - 4;
      
      if (endsWith(s, len, "itek") || endsWith(s, len, "itok"))
        return len - 4;
    }
    
    if (len > 5) {
      // consonant + unk/tok/tek
      if (!isVowel(s[len-4]) &&
        (endsWith(s, len, "unk") ||
         endsWith(s, len, "tok") ||
         endsWith(s, len, "tek")))
        return len - 3;
      
      // vowel + juk
      if (isVowel(s[len-4]) && endsWith(s, len, "juk"))
        return len - 3;
      
      if (endsWith(s, len, "ink"))
        return len - 3;
    }
    
    if (len > 4) {
      // consonant + am/em/om/ad/ed/od/uk
      if (!isVowel(s[len-3]) &&
         (endsWith(s, len, "am") ||
          endsWith(s, len, "em") ||
          endsWith(s, len, "om") ||
          endsWith(s, len, "ad") ||
          endsWith(s, len, "ed") ||
          endsWith(s, len, "od") ||
          endsWith(s, len, "uk")))
        return len - 2;
      
      // vowel + nk/ja/je
      if (isVowel(s[len-3]) &&
         (endsWith(s, len, "nk") ||
          endsWith(s, len, "ja") ||
          endsWith(s, len, "je")))
        return len - 2;
      
      if (endsWith(s, len, "im") ||
          endsWith(s, len, "id") ||
          endsWith(s, len, "ik"))
        return len - 2;
    }
    
    if (len > 3)
      switch(s[len-1]) {
        case 'a':
        case 'e': if (!isVowel(s[len-2])) return len - 1; break;
        case 'm':
        case 'd': if (isVowel(s[len-2])) return len - 1; break;
        case 'i': return len - 1;
      }
    
    return len;
  }

  /**
   * Removes a final plural "k" (terms longer than 3 chars). When the "k" follows
   * a, o or e and the term is longer than 4 chars, that vowel is removed as well;
   * otherwise only the "k" is dropped.
   */
  private int removePlural(char s[], int len) {
    if (len > 3 && s[len-1] == 'k')
      switch(s[len-2]) {
        case 'a':
        case 'o':
        case 'e': if (len > 4) return len - 2; /* intentional fallthru */
        default: return len - 1;
      }
    return len;
  }

  /** Drops a final a, e, i or o from terms longer than 3 chars. */
  private int normalize(char s[], int len) {
    if (len > 3)
      switch(s[len-1]) {
        case 'a':
        case 'e':
        case 'i':
        case 'o': return len - 1;
      }
    return len;
  }

  /** Returns true for a, e, i, o, u and y (accented vowels were folded in {@code stem}). */
  private boolean isVowel(char ch) {
    switch(ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
      case 'y': return true;
      default: return false;
    }
  }
}
b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java index 82afce8434a..0b7308c48a7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemmer.java @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.id; * limitations under the License. */ +import static org.apache.lucene.analysis.util.StemmerUtil.*; + /** * Stemmer for Indonesian. *

@@ -266,39 +268,5 @@ public class IndonesianStemmer { return length - 1; } return length; - } - - private boolean startsWith(char s[], int len, String prefix) { - final int prefixLen = prefix.length(); - if (prefixLen > len) - return false; - for (int i = 0; i < prefixLen; i++) - if (s[i] != prefix.charAt(i)) - return false; - return true; - } - - private boolean endsWith(char s[], int len, String suffix) { - final int suffixLen = suffix.length(); - if (suffixLen > len) - return false; - for (int i = suffixLen - 1; i >= 0; i--) - if (s[len -(suffixLen - i)] != suffix.charAt(i)) - return false; - - return true; - } - - private int deleteN(char s[], int pos, int len, int nChars) { - for (int i = 0; i < nChars; i++) - len = delete(s, pos, len); - return len; - } - - private int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java index 0f7fcf787ff..2f3c3749cd3 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizer.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.in; import java.util.BitSet; import java.util.IdentityHashMap; import static java.lang.Character.UnicodeBlock.*; +import static org.apache.lucene.analysis.util.StemmerUtil.*; /** * Normalizes the Unicode representation of text in Indian languages. 
@@ -290,14 +291,4 @@ public class IndicNormalizer { return len; } - - /** - * Delete a character in-place - */ - private int delete(char s[], int pos, int len) { - if (pos < len) - System.arraycopy(s, pos + 1, s, pos, len - pos - 1); - - return len - 1; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java new file mode 100644 index 00000000000..af9625cfc05 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.it; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link ItalianLightStemmer} to stem Italian + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class ItalianLightStemFilter extends TokenFilter { + private final ItalianLightStemmer stemmer = new ItalianLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public ItalianLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java new file mode 100644 index 00000000000..50a80bd6af1 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java @@ -0,0 +1,117 @@ +package org.apache.lucene.analysis.it; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * Light Stemmer for Italian. + *

/**
 * Light Stemmer for Italian.
 * <p>
 * This stemmer implements the algorithm described in:
 * <i>Report on CLEF-2001 Experiments</i>
 * Jacques Savoy
 * <p>
 * Terms shorter than six characters are returned untouched. Longer terms have
 * their accented vowels folded in place and then lose a final vowel, together
 * with a preceding 'i' (or 'h' before 'e'/'i') when present.
 */
public class ItalianLightStemmer {

  /**
   * Stems the first {@code len} characters of {@code s} in place.
   *
   * @param s   buffer holding the term; accented vowels in it are overwritten
   * @param len number of valid characters in {@code s}
   * @return the length of the stem, which occupies the start of {@code s}
   */
  public int stem(char[] s, int len) {
    // Short words are left completely alone (no accent folding either).
    if (len < 6) {
      return len;
    }

    for (int pos = 0; pos < len; pos++) {
      s[pos] = foldAccent(s[pos]);
    }

    final char last = s[len - 1];
    final char beforeLast = s[len - 2];

    if (last == 'e' || last == 'i') {
      // "-ie", "-he", "-ii", "-hi" lose both characters.
      return (beforeLast == 'i' || beforeLast == 'h') ? len - 2 : len - 1;
    }
    if (last == 'a' || last == 'o') {
      // "-ia", "-io" lose both characters.
      return (beforeLast == 'i') ? len - 2 : len - 1;
    }
    return len;
  }

  /**
   * Maps a vowel carrying a grave, acute, circumflex or diaeresis accent to the
   * plain vowel; every other character is returned unchanged.
   */
  private static char foldAccent(char ch) {
    switch (ch) {
      case '\u00E0': // a grave
      case '\u00E1': // a acute
      case '\u00E2': // a circumflex
      case '\u00E4': // a diaeresis
        return 'a';
      case '\u00F2': // o grave
      case '\u00F3': // o acute
      case '\u00F4': // o circumflex
      case '\u00F6': // o diaeresis
        return 'o';
      case '\u00E8': // e grave
      case '\u00E9': // e acute
      case '\u00EA': // e circumflex
      case '\u00EB': // e diaeresis
        return 'e';
      case '\u00F9': // u grave
      case '\u00FA': // u acute
      case '\u00FB': // u circumflex
      case '\u00FC': // u diaeresis
        return 'u';
      case '\u00EC': // i grave
      case '\u00ED': // i acute
      case '\u00EE': // i circumflex
      case '\u00EF': // i diaeresis
        return 'i';
      default:
        return ch;
    }
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link PortugueseLightStemmer} to stem + * Portuguese words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class PortugueseLightStemFilter extends TokenFilter { + private final PortugueseLightStemmer stemmer = new PortugueseLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public PortugueseLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java new file mode 100644 index 00000000000..1baea680c0b --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java @@ -0,0 +1,202 @@ +package org.apache.lucene.analysis.pt; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Portuguese + */ +public class PortugueseLightStemmer { + + public int stem(char s[], int len) { + if (len < 4) + return len; + + len = removeSuffix(s, len); + + if (len > 3 && s[len-1] == 'a') + len = normFeminine(s, len); + + if (len > 4) + switch(s[len-1]) { + case 'e': + case 'a': + case 'o': len--; break; + } + + for (int i = 0; i < len; i++) + switch(s[i]) { + case 'à': + case 'á': + case 'â': + case 'ä': + case 'ã': s[i] = 'a'; break; + case 'ò': + case 'ó': + case 'ô': + case 'ö': + case 'õ': s[i] = 'o'; break; + case 'è': + case 'é': + case 'ê': + case 'ë': s[i] = 'e'; break; + case 'ù': + case 'ú': + case 'û': + case 'ü': s[i] = 'u'; break; + case 'ì': + case 'í': + case 'î': + case 'ï': s[i] = 'i'; break; + case 'ç': s[i] = 'c'; break; + } + + return len; + } + + private int removeSuffix(char s[], int len) { + if (len > 4 && endsWith(s, len, "es")) + switch(s[len-3]) { + case 'r': + case 's': + case 'l': + case 'z': return len - 2; + } + + if (len > 3 && endsWith(s, len, "ns")) { + s[len - 2] = 'm'; + return len - 1; + } + + if (len > 4 && (endsWith(s, len, "eis") || endsWith(s, len, "éis"))) { + s[len - 3] = 'e'; + s[len - 2] = 'l'; + return len - 1; + } + + if (len > 4 && endsWith(s, len, "ais")) { + s[len - 2] = 'l'; + return len - 1; + } + + if (len > 4 && endsWith(s, len, "óis")) { + s[len - 3] = 'o'; + s[len - 2] = 'l'; + return len - 1; + } + + if (len > 4 && endsWith(s, len, "is")) { + s[len - 1] = 'l'; + return len; + } + + if (len > 3 && + (endsWith(s, len, "ões") || + endsWith(s, len, "ães"))) { + len--; + s[len - 2] = 'ã'; + s[len - 1] = 'o'; + return len; + } + + if (len > 6 && endsWith(s, len, "mente")) + return len - 5; + + if (len > 3 && s[len-1] == 's') + return len - 1; + return len; + } + + private int normFeminine(char s[], int len) { + if (len > 7 && + (endsWith(s, len, "inha") || + endsWith(s, len, "iaca") || + endsWith(s, len, "eira"))) 
{ + s[len - 1] = 'o'; + return len; + } + + if (len > 6) { + if (endsWith(s, len, "osa") || + endsWith(s, len, "ica") || + endsWith(s, len, "ida") || + endsWith(s, len, "ada") || + endsWith(s, len, "iva") || + endsWith(s, len, "ama")) { + s[len - 1] = 'o'; + return len; + } + + if (endsWith(s, len, "ona")) { + s[len - 3] = 'ã'; + s[len - 2] = 'o'; + return len - 1; + } + + if (endsWith(s, len, "ora")) + return len - 1; + + if (endsWith(s, len, "esa")) { + s[len - 3] = 'ê'; + return len - 1; + } + + if (endsWith(s, len, "na")) { + s[len - 1] = 'o'; + return len; + } + } + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java new file mode 100644 index 00000000000..aa5a3716653 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.pt; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link PortugueseMinimalStemmer} to stem + * Portuguese words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class PortugueseMinimalStemFilter extends TokenFilter { + private final PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public PortugueseMinimalStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java new file mode 100644 index 00000000000..7ce19e37445 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java @@ -0,0 +1,119 @@ +package org.apache.lucene.analysis.pt; + +import java.util.Arrays; + +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.Version; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Minimal Stemmer for Portuguese + *

+ * This follows the "RSLP-S" algorithm presented in: + * A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese + * Information Retrieval (Orengo, et al) + * which is just the plural reduction step of the RSLP + * algorithm from A Stemming Algorithmm for the Portuguese Language, + * Orengo et al. + */ +public class PortugueseMinimalStemmer { + + private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31, + Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois", + "depois","dois","leis"), + false); + + private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31, + Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos", + "férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés", + "através", "convés", "ês", "país", "após", "ambas", "ambos", + "messias", "depois"), + false); + + public int stem(char s[], int len) { + if (len < 3 || s[len-1] != 's') + return len; + + if (s[len-2] == 'n') { + len--; + s[len-1] = 'm'; + return len; + } + + if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') { + len--; + s[len-2] = 'ã'; + s[len-1] = 'o'; + return len; + } + + if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e') + if (!(len == 4 && s[0] == 'm')) { + len--; + s[len-1] = 'o'; + return len; + } + + if (len >= 4 && s[len-2] == 'i') { + if (s[len-3] == 'a') + if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) { + len--; + s[len-1] = 'l'; + return len; + } + + if (len >= 5 && s[len-3] == 'é') { + len--; + s[len-2] = 'e'; + s[len-1] = 'l'; + return len; + } + + if (len >= 5 && s[len-3] == 'e') { + len--; + s[len-1] = 'l'; + return len; + } + + if (len >= 5 && s[len-3] == 'ó') { + len--; + s[len-2] = 'o'; + s[len-1] = 'l'; + return len; + } + + if (!excIS.contains(s, 0, len)) { + s[len-1] = 'l'; + return len; + } + } + + if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e') + return len - 2; + + if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e') + if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 
'v' && s[3] == 'o')) + return len - 2; + + if (excS.contains(s, 0, len)) + return len; + else + return len-1; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java new file mode 100644 index 00000000000..826b22dc13c --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.ru; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link RussianLightStemmer} to stem Russian + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class RussianLightStemFilter extends TokenFilter { + private final RussianLightStemmer stemmer = new RussianLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public RussianLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java new file mode 100644 index 00000000000..e58bf38f6a9 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java @@ -0,0 +1,153 @@ +package org.apache.lucene.analysis.ru; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Russian. + *

+ * This stemmer implements the following algorithm: + * Indexing and Searching Strategies for the Russian Language. + * Ljiljana Dolamic and Jacques Savoy. + */ +public class RussianLightStemmer { + + public int stem(char s[], int len) { + len = removeCase(s, len); + return normalize(s, len); + } + + private int normalize(char s[], int len) { + if (len > 3) + switch(s[len-1]) { + case 'ь': + case 'и': return len - 1; + case 'н': if (s[len-2] == 'н') return len - 1; + } + return len; + } + + private int removeCase(char s[], int len) { + if (len > 6 && + (endsWith(s, len, "иями") || + endsWith(s, len, "оями"))) + return len - 4; + + if (len > 5 && + (endsWith(s, len, "иям") || + endsWith(s, len, "иях") || + endsWith(s, len, "оях") || + endsWith(s, len, "ями") || + endsWith(s, len, "оям") || + endsWith(s, len, "оьв") || + endsWith(s, len, "ами") || + endsWith(s, len, "его") || + endsWith(s, len, "ему") || + endsWith(s, len, "ери") || + endsWith(s, len, "ими") || + endsWith(s, len, "ого") || + endsWith(s, len, "ому") || + endsWith(s, len, "ыми") || + endsWith(s, len, "оев"))) + return len - 3; + + if (len > 4 && + (endsWith(s, len, "ая") || + endsWith(s, len, "яя") || + endsWith(s, len, "ях") || + endsWith(s, len, "юю") || + endsWith(s, len, "ах") || + endsWith(s, len, "ею") || + endsWith(s, len, "их") || + endsWith(s, len, "ия") || + endsWith(s, len, "ию") || + endsWith(s, len, "ьв") || + endsWith(s, len, "ою") || + endsWith(s, len, "ую") || + endsWith(s, len, "ям") || + endsWith(s, len, "ых") || + endsWith(s, len, "ея") || + endsWith(s, len, "ам") || + endsWith(s, len, "ем") || + endsWith(s, len, "ей") || + endsWith(s, len, "ём") || + endsWith(s, len, "ев") || + endsWith(s, len, "ий") || + endsWith(s, len, "им") || + endsWith(s, len, "ое") || + endsWith(s, len, "ой") || + endsWith(s, len, "ом") || + endsWith(s, len, "ов") || + endsWith(s, len, "ые") || + endsWith(s, len, "ый") || + endsWith(s, len, "ым") || + endsWith(s, len, "ми"))) + return len - 2; + + if (len > 
3) + switch(s[len-1]) { + case 'а': + case 'е': + case 'и': + case 'о': + case 'у': + case 'й': + case 'ы': + case 'я': + case 'ь': return len - 1; + } + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java new file mode 100644 index 00000000000..226c974576a --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.sv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +/** + * A {@link TokenFilter} that applies {@link SwedishLightStemmer} to stem Swedish + * words. + *

+ * To prevent terms from being stemmed use an instance of + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ */ +public final class SwedishLightStemFilter extends TokenFilter { + private final SwedishLightStemmer stemmer = new SwedishLightStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public SwedishLightStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java new file mode 100644 index 00000000000..036ba5f4cb9 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java @@ -0,0 +1,111 @@ +package org.apache.lucene.analysis.sv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This algorithm is updated based on code located at: + * http://members.unine.ch/jacques.savoy/clef/ + * + * Full copyright for that code follows: + */ + +/* + * Copyright (c) 2005, Jacques Savoy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution. Neither the name of the author nor the names + * of its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Swedish. + *

+ * This stemmer implements the algorithm described in: + * Report on CLEF-2003 Monolingual Tracks + * Jacques Savoy + */ +public class SwedishLightStemmer { + + public int stem(char s[], int len) { + if (len > 4 && s[len-1] == 's') + len--; + + if (len > 7 && + (endsWith(s, len, "elser") || + endsWith(s, len, "heten"))) + return len - 5; + + if (len > 6 && + (endsWith(s, len, "arne") || + endsWith(s, len, "erna") || + endsWith(s, len, "ande") || + endsWith(s, len, "else") || + endsWith(s, len, "aste") || + endsWith(s, len, "orna") || + endsWith(s, len, "aren"))) + return len - 4; + + if (len > 5 && + (endsWith(s, len, "are") || + endsWith(s, len, "ast") || + endsWith(s, len, "het"))) + return len - 3; + + if (len > 4 && + (endsWith(s, len, "ar") || + endsWith(s, len, "er") || + endsWith(s, len, "or") || + endsWith(s, len, "en") || + endsWith(s, len, "at") || + endsWith(s, len, "te") || + endsWith(s, len, "et"))) + return len - 2; + + if (len > 3) + switch(s[len-1]) { + case 't': + case 'a': + case 'e': + case 'n': return len - 1; + } + + return len; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java new file mode 100644 index 00000000000..883a7af8109 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java @@ -0,0 +1,89 @@ +package org.apache.lucene.analysis.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Some commonly-used stemming functions */ +public class StemmerUtil { + /** + * Returns true if the character array starts with the suffix. + * + * @param s Input Buffer + * @param len length of input buffer + * @param suffix Suffix string to test + * @return true if s starts with suffix + */ + public static boolean startsWith(char s[], int len, String prefix) { + final int prefixLen = prefix.length(); + if (prefixLen > len) + return false; + for (int i = 0; i < prefixLen; i++) + if (s[i] != prefix.charAt(i)) + return false; + return true; + } + + /** + * Returns true if the character array ends with the suffix. 
+ * + * @param s Input Buffer + * @param len length of input buffer + * @param suffix Suffix string to test + * @return true if s ends with suffix + */ + public static boolean endsWith(char s[], int len, String suffix) { + final int suffixLen = suffix.length(); + if (suffixLen > len) + return false; + for (int i = suffixLen - 1; i >= 0; i--) + if (s[len -(suffixLen - i)] != suffix.charAt(i)) + return false; + + return true; + } + + /** + * Delete a character in-place + * + * @param s Input Buffer + * @param pos Position of character to delete + * @param len length of input buffer + * @return length of input buffer after deletion + */ + public static int delete(char s[], int pos, int len) { + if (pos < len) + System.arraycopy(s, pos + 1, s, pos, len - pos - 1); + + return len - 1; + } + + /** + * Delete n characters in-place + * + * @param s Input Buffer + * @param pos Position of character to delete + * @param len Length of input buffer + * @param nChars number of characters to delete + * @return length of input buffer after deletion + */ + public static int deleteN(char s[], int pos, int len, int nChars) { + // TODO: speed up, this is silly + for (int i = 0; i < nChars; i++) + len = delete(s, pos, len); + return len; + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java new file mode 100644 index 00000000000..63dfdb6c4cc --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link GermanLightStemFilter} + */ +public class TestGermanLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new GermanLightStemFilter(source)); + } + }; + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java new file mode 100644 index 00000000000..c14c7ea4076 --- /dev/null +++ 
b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.de; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link GermanMinimalStemFilter} + */ +public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new GermanMinimalStemFilter(source)); + } + }; + + /** Test some examples from the paper */ + public void testExamples() throws IOException { + checkOneTerm(analyzer, "sängerinnen", "sangerin"); + 
checkOneTerm(analyzer, "frauen", "frau"); + checkOneTerm(analyzer, "kenntnisse", "kenntnis"); + checkOneTerm(analyzer, "staates", "staat"); + checkOneTerm(analyzer, "bilder", "bild"); + checkOneTerm(analyzer, "boote", "boot"); + checkOneTerm(analyzer, "götter", "gott"); + checkOneTerm(analyzer, "äpfel", "apfel"); + } + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java index 0376ff5bebe..d7602aa47c9 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java @@ -17,17 +17,17 @@ package org.apache.lucene.analysis.de; * limitations under the License. */ -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.io.StringReader; +import java.io.InputStream; +import java.io.Reader; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; /** * Test the German stemmer. 
The stemming algorithm is known to work less @@ -38,25 +38,18 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; public class TestGermanStemFilter extends BaseTokenStreamTestCase { public void testStemming() throws Exception { - Tokenizer tokenizer = new KeywordTokenizer(new StringReader("")); - TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer)); - // read test cases from external file: - InputStreamReader isr = new InputStreamReader(getClass().getResourceAsStream("data.txt"), "iso-8859-1"); - BufferedReader breader = new BufferedReader(isr); - while(true) { - String line = breader.readLine(); - if (line == null) - break; - line = line.trim(); - if (line.startsWith("#") || line.equals("")) - continue; // ignore comments and empty lines - String[] parts = line.split(";"); - //System.out.println(parts[0] + " -- " + parts[1]); - tokenizer.reset(new StringReader(parts[0])); - filter.reset(); - assertTokenStreamContents(filter, new String[] { parts[1] }); - } - breader.close(); - isr.close(); + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer t = new KeywordTokenizer(reader); + return new TokenStreamComponents(t, + new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t))); + } + }; + + InputStream vocOut = getClass().getResourceAsStream("data.txt"); + assertVocabulary(analyzer, vocOut); + vocOut.close(); } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt index 520c18a1df6..5b8ce5ffe31 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/data.txt @@ -1,48 +1,48 @@ # German special characters are replaced: -hufig;haufig +häufig haufig # here the stemmer works okay, it maps related words to 
the same stem: -abschlieen;abschliess -abschlieender;abschliess -abschlieendes;abschliess -abschlieenden;abschliess +abschließen abschliess +abschließender abschliess +abschließendes abschliess +abschließenden abschliess -Tisch;tisch -Tische;tisch -Tischen;tisch +Tisch tisch +Tische tisch +Tischen tisch -Haus;hau -Hauses;hau -Huser;hau -Husern;hau +Haus hau +Hauses hau +Häuser hau +Häusern hau # here's a case where overstemming occurs, i.e. a word is # mapped to the same stem as unrelated words: -hauen;hau +hauen hau # here's a case where understemming occurs, i.e. two related words # are not mapped to the same stem. This is the case with basically # all irregular forms: -Drama;drama -Dramen;dram +Drama drama +Dramen dram -# replace "" with 'ss': -Ausma;ausmass +# replace "ß" with 'ss': +Ausmaß ausmass # fake words to test if suffixes are cut off: -xxxxxe;xxxxx -xxxxxs;xxxxx -xxxxxn;xxxxx -xxxxxt;xxxxx -xxxxxem;xxxxx -xxxxxer;xxxxx -xxxxxnd;xxxxx +xxxxxe xxxxx +xxxxxs xxxxx +xxxxxn xxxxx +xxxxxt xxxxx +xxxxxem xxxxx +xxxxxer xxxxx +xxxxxnd xxxxx # the suffixes are also removed when combined: -xxxxxetende;xxxxx +xxxxxetende xxxxx # words that are shorter than four charcters are not changed: -xxe;xxe +xxe xxe # -em and -er are not removed from words shorter than five characters: -xxem;xxem -xxer;xxer +xxem xxem +xxer xxer # -nd is not removed from words shorter than six characters: -xxxnd;xxxnd +xxxnd xxxnd diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip new file mode 100644 index 00000000000..400db0bd66e Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/delighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip new file mode 100644 index 
00000000000..d930327386c Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/de/deminimaltestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java new file mode 100644 index 00000000000..8ff0303b47d --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilter.java @@ -0,0 +1,54 @@ +package org.apache.lucene.analysis.en; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +/** + * Simple tests for {@link EnglishMinimalStemFilter} + */ +public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new EnglishMinimalStemFilter(source)); + } + }; + + /** Test some examples from various papers about this technique */ + public void testExamples() throws IOException { + checkOneTerm(analyzer, "queries", "query"); + checkOneTerm(analyzer, "phrases", "phrase"); + checkOneTerm(analyzer, "corpus", "corpus"); + checkOneTerm(analyzer, "stress", "stress"); + checkOneTerm(analyzer, "kings", "king"); + checkOneTerm(analyzer, "panels", "panel"); + checkOneTerm(analyzer, "aerodynamics", "aerodynamic"); + checkOneTerm(analyzer, "congress", "congress"); + checkOneTerm(analyzer, "serious", "serious"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java index b3653ed4d7c..e34829a27bd 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilter.java @@ -17,21 +17,22 @@ package org.apache.lucene.analysis.en; * limitations under the License. 
*/ -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.Reader; import java.io.StringReader; -import java.util.zip.ZipFile; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + /** * Test the PorterStemFilter with Martin Porter's test data. */ @@ -41,26 +42,16 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase { * The output should be the same as the string in output.txt */ public void testPorterStemFilter() throws Exception { - Tokenizer tokenizer = new KeywordTokenizer(new StringReader("")); - TokenStream filter = new PorterStemFilter(tokenizer); - ZipFile zipFile = new ZipFile(getDataFile("porterTestData.zip")); - InputStream voc = zipFile.getInputStream(zipFile.getEntry("voc.txt")); - InputStream out = zipFile.getInputStream(zipFile.getEntry("output.txt")); - BufferedReader vocReader = new BufferedReader(new InputStreamReader( - voc, "UTF-8")); - BufferedReader outputReader = new BufferedReader(new InputStreamReader( - out, "UTF-8")); - String inputWord = null; - while ((inputWord = vocReader.readLine()) != null) { - String expectedWord = outputReader.readLine(); - assertNotNull(expectedWord); - tokenizer.reset(new StringReader(inputWord)); - filter.reset(); - assertTokenStreamContents(filter, new String[] { expectedWord }); - } - vocReader.close(); - outputReader.close(); - zipFile.close(); + Analyzer a = new ReusableAnalyzerBase() { + @Override + 
protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer t = new KeywordTokenizer(reader); + return new TokenStreamComponents(t, new PorterStemFilter(t)); + } + }; + + assertVocabulary(a, getDataFile("porterTestData.zip"), "voc.txt", "output.txt"); } public void testWithKeywordAttribute() throws IOException { diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java new file mode 100644 index 00000000000..f494bd65725 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilter.java @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.es; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link SpanishLightStemFilter} + */ +public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new SpanishLightStemFilter(source)); + } + }; + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("eslighttestdata.zip"), "eslight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip new file mode 100644 index 00000000000..0e88cf28d12 Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/es/eslighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java new file mode 100644 index 00000000000..d946a20ca53 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.fi; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link FinnishLightStemFilter} + */ +public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new FinnishLightStemFilter(source)); + } + }; + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip new file mode 100644 index 00000000000..5a85453a614 
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/fi/filighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java new file mode 100644 index 00000000000..ffe8d6c22cc --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java @@ -0,0 +1,162 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link FrenchLightStemFilter} + */ +public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new FrenchLightStemFilter(source)); + } + }; + + /** Test some examples from the paper */ + public void testExamples() throws IOException { + checkOneTerm(analyzer, "chevaux", "cheval"); + checkOneTerm(analyzer, "cheval", "cheval"); + + checkOneTerm(analyzer, "hiboux", "hibou"); + checkOneTerm(analyzer, "hibou", "hibou"); + + checkOneTerm(analyzer, "chantés", "chant"); + checkOneTerm(analyzer, "chanter", "chant"); + checkOneTerm(analyzer, "chante", "chant"); + checkOneTerm(analyzer, "chant", "chant"); + + checkOneTerm(analyzer, "baronnes", "baron"); + checkOneTerm(analyzer, "barons", "baron"); + checkOneTerm(analyzer, "baron", "baron"); + + checkOneTerm(analyzer, "peaux", "peau"); + checkOneTerm(analyzer, "peau", "peau"); + + checkOneTerm(analyzer, "anneaux", "aneau"); + checkOneTerm(analyzer, "anneau", "aneau"); + + checkOneTerm(analyzer, "neveux", "neveu"); + checkOneTerm(analyzer, "neveu", "neveu"); + + checkOneTerm(analyzer, "affreux", "afreu"); + checkOneTerm(analyzer, "affreuse", "afreu"); + + checkOneTerm(analyzer, "investissement", "investi"); + checkOneTerm(analyzer, "investir", "investi"); + + checkOneTerm(analyzer, "assourdissant", "asourdi"); + 
checkOneTerm(analyzer, "assourdir", "asourdi"); + + checkOneTerm(analyzer, "pratiquement", "pratiqu"); + checkOneTerm(analyzer, "pratique", "pratiqu"); + + checkOneTerm(analyzer, "administrativement", "administratif"); + checkOneTerm(analyzer, "administratif", "administratif"); + + checkOneTerm(analyzer, "justificatrice", "justifi"); + checkOneTerm(analyzer, "justificateur", "justifi"); + checkOneTerm(analyzer, "justifier", "justifi"); + + checkOneTerm(analyzer, "educatrice", "eduqu"); + checkOneTerm(analyzer, "eduquer", "eduqu"); + + checkOneTerm(analyzer, "communicateur", "comuniqu"); + checkOneTerm(analyzer, "communiquer", "comuniqu"); + + checkOneTerm(analyzer, "accompagnatrice", "acompagn"); + checkOneTerm(analyzer, "accompagnateur", "acompagn"); + + checkOneTerm(analyzer, "administrateur", "administr"); + checkOneTerm(analyzer, "administrer", "administr"); + + checkOneTerm(analyzer, "productrice", "product"); + checkOneTerm(analyzer, "producteur", "product"); + + checkOneTerm(analyzer, "acheteuse", "achet"); + checkOneTerm(analyzer, "acheteur", "achet"); + + checkOneTerm(analyzer, "planteur", "plant"); + checkOneTerm(analyzer, "plante", "plant"); + + checkOneTerm(analyzer, "poreuse", "poreu"); + checkOneTerm(analyzer, "poreux", "poreu"); + + checkOneTerm(analyzer, "plieuse", "plieu"); + + checkOneTerm(analyzer, "bijoutière", "bijouti"); + checkOneTerm(analyzer, "bijoutier", "bijouti"); + + checkOneTerm(analyzer, "caissière", "caisi"); + checkOneTerm(analyzer, "caissier", "caisi"); + + checkOneTerm(analyzer, "abrasive", "abrasif"); + checkOneTerm(analyzer, "abrasif", "abrasif"); + + checkOneTerm(analyzer, "folle", "fou"); + checkOneTerm(analyzer, "fou", "fou"); + + checkOneTerm(analyzer, "personnelle", "person"); + checkOneTerm(analyzer, "personne", "person"); + + // algo bug: too short length + //checkOneTerm(analyzer, "personnel", "person"); + + checkOneTerm(analyzer, "complète", "complet"); + checkOneTerm(analyzer, "complet", "complet"); + + 
checkOneTerm(analyzer, "aromatique", "aromat"); + + checkOneTerm(analyzer, "faiblesse", "faibl"); + checkOneTerm(analyzer, "faible", "faibl"); + + checkOneTerm(analyzer, "patinage", "patin"); + checkOneTerm(analyzer, "patin", "patin"); + + checkOneTerm(analyzer, "sonorisation", "sono"); + + checkOneTerm(analyzer, "ritualisation", "rituel"); + checkOneTerm(analyzer, "rituel", "rituel"); + + // algo bug: masked by rules above + //checkOneTerm(analyzer, "colonisateur", "colon"); + + checkOneTerm(analyzer, "nomination", "nomin"); + + checkOneTerm(analyzer, "disposition", "dispos"); + checkOneTerm(analyzer, "dispose", "dispos"); + } + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java new file mode 100644 index 00000000000..b45c5323c82 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java @@ -0,0 +1,62 @@ +package org.apache.lucene.analysis.fr; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link FrenchMinimalStemFilter} + */ +public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new FrenchMinimalStemFilter(source)); + } + }; + + /** Test some examples from the paper */ + public void testExamples() throws IOException { + checkOneTerm(analyzer, "chevaux", "cheval"); + checkOneTerm(analyzer, "hiboux", "hibou"); + + checkOneTerm(analyzer, "chantés", "chant"); + checkOneTerm(analyzer, "chanter", "chant"); + checkOneTerm(analyzer, "chante", "chant"); + + checkOneTerm(analyzer, "baronnes", "baron"); + checkOneTerm(analyzer, "barons", "baron"); + checkOneTerm(analyzer, "baron", "baron"); + } + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frlighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frlighttestdata.zip new file mode 100644 index 00000000000..a036b8a991d Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frlighttestdata.zip 
differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frminimaltestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frminimaltestdata.zip new file mode 100644 index 00000000000..07dc4e1e550 Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/fr/frminimaltestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java new file mode 100644 index 00000000000..e0eaf2a0921 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.hu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link HungarianLightStemFilter} + */ +public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new HungarianLightStemFilter(source)); + } + }; + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/hulighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/hulighttestdata.zip new file mode 100644 index 00000000000..e334c6940e2 Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/hu/hulighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java new file mode 100644 index 00000000000..b850630086c --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilter.java @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.it; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link ItalianLightStemFilter} + */ +public class TestItalianLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new ItalianLightStemFilter(source)); + } + }; + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("itlighttestdata.zip"), "itlight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/itlighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/itlighttestdata.zip new file mode 100644 index 00000000000..f390507ff15 
Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/itlighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java new file mode 100644 index 00000000000..6f5fdcf148d --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java @@ -0,0 +1,95 @@ +package org.apache.lucene.analysis.pt; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link PortugueseLightStemFilter} + */ +public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader); + TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source); + return new TokenStreamComponents(source, new PortugueseLightStemFilter(result)); + } + }; + + /** + * Test the example from the paper "Assessing the impact of stemming accuracy + * on information retrieval" + */ + public void testExamples() throws IOException { + assertAnalyzesTo( + analyzer, + "O debate político, pelo menos o que vem a público, parece, de modo nada " + + "surpreendente, restrito a temas menores. 
Mas há, evidentemente, " + + "grandes questões em jogo nas eleições que se aproximam.", + new String[] { + "o", "debat", "politic", "pelo", "meno", "o", "que", "vem", "a", + "public", "parec", "de", "modo", "nada", "surpreendent", "restrit", + "a", "tema", "menor", "mas", "há", "evident", "grand", "questa", + "em", "jogo", "nas", "eleica", "que", "se", "aproximam" + }); + } + + /** + * Test examples from the c implementation + */ + public void testMoreExamples() throws IOException { + checkOneTerm(analyzer, "doutores", "doutor"); + checkOneTerm(analyzer, "doutor", "doutor"); + + checkOneTerm(analyzer, "homens", "homem"); + checkOneTerm(analyzer, "homem", "homem"); + + checkOneTerm(analyzer, "papéis", "papel"); + checkOneTerm(analyzer, "papel", "papel"); + + checkOneTerm(analyzer, "normais", "normal"); + checkOneTerm(analyzer, "normal", "normal"); + + checkOneTerm(analyzer, "lencóis", "lencol"); + checkOneTerm(analyzer, "lencol", "lencol"); + + checkOneTerm(analyzer, "barris", "barril"); + checkOneTerm(analyzer, "barril", "barril"); + + checkOneTerm(analyzer, "botões", "bota"); + checkOneTerm(analyzer, "botão", "bota"); + } + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java new file mode 100644 index 00000000000..64a2dd7ac51 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java @@ -0,0 +1,69 @@ +package org.apache.lucene.analysis.pt; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link PortugueseMinimalStemFilter} + */ +public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader); + TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source); + return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(result)); + } + }; + + /** + * Test the example from the paper "Assessing the impact of stemming accuracy + * on information retrieval" + */ + public void testExamples() throws IOException { + assertAnalyzesTo( + analyzer, + "O debate político, pelo menos o 
que vem a público, parece, de modo nada " + + "surpreendente, restrito a temas menores. Mas há, evidentemente, " + + "grandes questões em jogo nas eleições que se aproximam.", + new String[] { + "o", "debate", "político", "pelo", "menos", "o", "que", "vem", "a", + "público", "parece", "de", "modo", "nada", "surpreendente", "restrito", + "a", "tema", "menor", "mas", "há", "evidentemente", "grande", "questão", + "em", "jogo", "na", "eleição", "que", "se", "aproximam" + }); + } + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptlighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptlighttestdata.zip new file mode 100644 index 00000000000..eca9a46ff9d Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptlighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptminimaltestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptminimaltestdata.zip new file mode 100644 index 00000000000..4169fa26b5e Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptminimaltestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java new file mode 100644 index 00000000000..b524d2a62a4 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.ru; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link RussianLightStemFilter} + */ +public class TestRussianLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new RussianLightStemFilter(source)); + } + }; + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java index 632391e88ea..0688f6d0d8f 100644 --- 
a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java @@ -17,71 +17,35 @@ package org.apache.lucene.analysis.ru; * limitations under the License. */ +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.util.LuceneTestCase; -import java.io.BufferedReader; -import java.io.File; -import java.io.InputStreamReader; -import java.io.FileInputStream; -import java.util.ArrayList; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; /** * @deprecated Remove this test class (and its datafiles!) in Lucene 4.0 */ @Deprecated -public class TestRussianStem extends LuceneTestCase -{ - private ArrayList words = new ArrayList(); - private ArrayList stems = new ArrayList(); - - public TestRussianStem(String name) - { - super(name); - } - - /** - * @see TestCase#setUp() - */ - @Override - protected void setUp() throws Exception { - super.setUp(); - //System.out.println(new java.util.Date()); - String str; - - // open and read words into an array list - BufferedReader inWords = - new BufferedReader( - new InputStreamReader( - getClass().getResourceAsStream("wordsUTF8.txt"), - "UTF-8")); - while ((str = inWords.readLine()) != null) - { - words.add(str); - } - inWords.close(); - - // open and read stems into an array list - BufferedReader inStems = - new BufferedReader( - new InputStreamReader( - getClass().getResourceAsStream("stemsUTF8.txt"), - "UTF-8")); - while ((str = inStems.readLine()) != null) - { - stems.add(str); - } - inStems.close(); - } - - public void testStem() - { - for (int i = 0; i < words.size(); i++) - { - //if ( (i % 100) == 0 ) System.err.println(i); - String realStem = 
- RussianStemmer.stemWord( - words.get(i)); - assertEquals("unicode", stems.get(i), realStem); - } - } - +public class TestRussianStem extends LuceneTestCase { + public void testStem() throws IOException { + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer t = new KeywordTokenizer(reader); + return new TokenStreamComponents(t, new RussianStemFilter(t)); + } + }; + InputStream voc = getClass().getResourceAsStream("wordsUTF8.txt"); + InputStream out = getClass().getResourceAsStream("stemsUTF8.txt"); + assertVocabulary(a, voc, out); + voc.close(); + out.close(); + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/rulighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/rulighttestdata.zip new file mode 100644 index 00000000000..7d4f7bb31e1 Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/ru/rulighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java index 22790cc7dca..1fb3a88b695 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballVocab.java @@ -17,38 +17,21 @@ package org.apache.lucene.analysis.snowball; * limitations under the License. 
*/ -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.StringReader; -import java.util.zip.ZipFile; +import java.io.Reader; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; +import org.apache.lucene.util.LuceneTestCase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; /** * Test the snowball filters against the snowball data tests */ -public class TestSnowballVocab extends BaseTokenStreamTestCase { - private Tokenizer tokenizer = new KeywordTokenizer(new StringReader("")); - ZipFile zipFile = null; - - @Override - protected void setUp() throws Exception { - super.setUp(); - this.zipFile = new ZipFile(getDataFile("TestSnowballVocabData.zip")); - } - - @Override - protected void tearDown() throws Exception { - this.zipFile.close(); - this.zipFile = null; - super.tearDown(); - } - +public class TestSnowballVocab extends LuceneTestCase { /** * Run all languages against their snowball vocabulary tests. 
*/ @@ -82,25 +65,20 @@ public class TestSnowballVocab extends BaseTokenStreamTestCase { * For the supplied language, run the stemmer against all strings in voc.txt * The output should be the same as the string in output.txt */ - private void assertCorrectOutput(String snowballLanguage, String dataDirectory) + private void assertCorrectOutput(final String snowballLanguage, String dataDirectory) throws IOException { if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage); - TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage); - InputStream voc = zipFile.getInputStream(zipFile.getEntry(dataDirectory + "/voc.txt")); - InputStream out = zipFile.getInputStream(zipFile.getEntry(dataDirectory + "/output.txt")); - BufferedReader vocReader = new BufferedReader(new InputStreamReader( - voc, "UTF-8")); - BufferedReader outputReader = new BufferedReader(new InputStreamReader( - out, "UTF-8")); - String inputWord = null; - while ((inputWord = vocReader.readLine()) != null) { - String expectedWord = outputReader.readLine(); - assertNotNull(expectedWord); - tokenizer.reset(new StringReader(inputWord)); - filter.reset(); - assertTokenStreamContents(filter, new String[] {expectedWord}); - } - vocReader.close(); - outputReader.close(); + + Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer t = new KeywordTokenizer(reader); + return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage)); + } + }; + + assertVocabulary(a, getDataFile("TestSnowballVocabData.zip"), + dataDirectory + "/voc.txt", dataDirectory + "/output.txt"); } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java new file mode 100644 index 00000000000..b6b825e0c8a --- /dev/null +++ 
b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.sv; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; + +import static org.apache.lucene.analysis.util.VocabularyAssert.*; + +/** + * Simple tests for {@link SwedishLightStemFilter} + */ +public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(source, new SwedishLightStemFilter(source)); + } + }; + + /** Test against a vocabulary from the reference impl */ + public void testVocabulary() throws IOException { + assertVocabulary(analyzer, 
getDataFile("svlighttestdata.zip"), "svlight.txt"); + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/svlighttestdata.zip b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/svlighttestdata.zip new file mode 100644 index 00000000000..11f525b2a0a Binary files /dev/null and b/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/svlighttestdata.zip differ diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/VocabularyAssert.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/VocabularyAssert.java new file mode 100644 index 00000000000..4beba4bba0b --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/VocabularyAssert.java @@ -0,0 +1,83 @@ +package org.apache.lucene.analysis.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.zip.ZipFile; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.junit.Assert; + +/** Utility class for doing vocabulary-based stemming tests */ +public class VocabularyAssert { + /** Run a vocabulary test against two data files. */ + public static void assertVocabulary(Analyzer a, InputStream voc, InputStream out) + throws IOException { + BufferedReader vocReader = new BufferedReader( + new InputStreamReader(voc, "UTF-8")); + BufferedReader outputReader = new BufferedReader( + new InputStreamReader(out, "UTF-8")); + String inputWord = null; + while ((inputWord = vocReader.readLine()) != null) { + String expectedWord = outputReader.readLine(); + Assert.assertNotNull(expectedWord); + BaseTokenStreamTestCase.checkOneTermReuse(a, inputWord, expectedWord); + } + } + + /** Run a vocabulary test against one file: tab separated. 
*/ + public static void assertVocabulary(Analyzer a, InputStream vocOut) + throws IOException { + BufferedReader vocReader = new BufferedReader( + new InputStreamReader(vocOut, "UTF-8")); + String inputLine = null; + while ((inputLine = vocReader.readLine()) != null) { + if (inputLine.startsWith("#") || inputLine.trim().length() == 0) + continue; /* comment */ + String words[] = inputLine.split("\t"); + BaseTokenStreamTestCase.checkOneTermReuse(a, words[0], words[1]); + } + } + + /** Run a vocabulary test against two data files inside a zip file */ + public static void assertVocabulary(Analyzer a, File zipFile, String voc, String out) + throws IOException { + ZipFile zip = new ZipFile(zipFile); + InputStream v = zip.getInputStream(zip.getEntry(voc)); + InputStream o = zip.getInputStream(zip.getEntry(out)); + assertVocabulary(a, v, o); + v.close(); + o.close(); + zip.close(); + } + + /** Run a vocabulary test against a tab-separated data file inside a zip file */ + public static void assertVocabulary(Analyzer a, File zipFile, String vocOut) + throws IOException { + ZipFile zip = new ZipFile(zipFile); + InputStream vo = zip.getInputStream(zip.getEntry(vocOut)); + assertVocabulary(a, vo); + vo.close(); + zip.close(); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java new file mode 100644 index 00000000000..a7f1f6dab5c --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; + +/** Factory for {@link EnglishMinimalStemFilter} */ +public class EnglishMinimalStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new EnglishMinimalStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java new file mode 100644 index 00000000000..7a284ed6f8a --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fi.FinnishLightStemFilter; + +/** Factory for {@link FinnishLightStemFilter} */ +public class FinnishLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new FinnishLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java new file mode 100644 index 00000000000..a2f15896174 --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fr.FrenchLightStemFilter; + +/** Factory for {@link FrenchLightStemFilter} */ +public class FrenchLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new FrenchLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java new file mode 100644 index 00000000000..3af344648fb --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter; + +/** Factory for {@link FrenchMinimalStemFilter} */ +public class FrenchMinimalStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new FrenchMinimalStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java new file mode 100644 index 00000000000..b790d5af297 --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.de.GermanLightStemFilter; + +/** Factory for {@link GermanLightStemFilter} */ +public class GermanLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new GermanLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java new file mode 100644 index 00000000000..e41329093eb --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.de.GermanMinimalStemFilter; + +/** Factory for {@link GermanMinimalStemFilter} */ +public class GermanMinimalStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new GermanMinimalStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java new file mode 100644 index 00000000000..b4f6dce5acc --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hu.HungarianLightStemFilter; + +/** Factory for {@link HungarianLightStemFilter} */ +public class HungarianLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new HungarianLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java new file mode 100644 index 00000000000..3281736b876 --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.it.ItalianLightStemFilter; + +/** Factory for {@link ItalianLightStemFilter} */ +public class ItalianLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new ItalianLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/PortugueseLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/PortugueseLightStemFilterFactory.java new file mode 100644 index 00000000000..50ec45a58c2 --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/PortugueseLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pt.PortugueseLightStemFilter; + +/** Factory for {@link PortugueseLightStemFilter} */ +public class PortugueseLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new PortugueseLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/PortugueseMinimalStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/PortugueseMinimalStemFilterFactory.java new file mode 100644 index 00000000000..60039a7af40 --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/PortugueseMinimalStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pt.PortugueseMinimalStemFilter; + +/** Factory for {@link PortugueseMinimalStemFilter} */ +public class PortugueseMinimalStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new PortugueseMinimalStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/RussianLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/RussianLightStemFilterFactory.java new file mode 100644 index 00000000000..fc9e301b64d --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/RussianLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ru.RussianLightStemFilter; + +/** Factory for {@link RussianLightStemFilter} */ +public class RussianLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new RussianLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java new file mode 100644 index 00000000000..148810d04d9 --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.es.SpanishLightStemFilter; + +/** Factory for {@link SpanishLightStemFilter} */ +public class SpanishLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new SpanishLightStemFilter(input); + } +} diff --git a/solr/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java new file mode 100644 index 00000000000..2c7aff2e93e --- /dev/null +++ b/solr/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java @@ -0,0 +1,28 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.sv.SwedishLightStemFilter; + +/** Factory for {@link SwedishLightStemFilter} */ +public class SwedishLightStemFilterFactory extends BaseTokenFilterFactory { + public TokenStream create(TokenStream input) { + return new SwedishLightStemFilter(input); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestEnglishMinimalStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestEnglishMinimalStemFilterFactory.java new file mode 100644 index 00000000000..b650e6b43ac --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestEnglishMinimalStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the English minimal stem factory is working. 
+ */ +public class TestEnglishMinimalStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code bricks} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code brick}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("bricks"); + EnglishMinimalStemFilterFactory factory = new EnglishMinimalStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "brick" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestFinnishLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestFinnishLightStemFilterFactory.java new file mode 100644 index 00000000000..4e547681c70 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestFinnishLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Finnish light stem factory is working. 
+ */ +public class TestFinnishLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code aseistettujen} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code aseistet}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("aseistettujen"); + FinnishLightStemFilterFactory factory = new FinnishLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "aseistet" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestFrenchLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestFrenchLightStemFilterFactory.java new file mode 100644 index 00000000000..e650d406546 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestFrenchLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the French light stem factory is working. 
+ */ +public class TestFrenchLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code administrativement} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code administratif}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("administrativement"); + FrenchLightStemFilterFactory factory = new FrenchLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "administratif" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestFrenchMinimalStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestFrenchMinimalStemFilterFactory.java new file mode 100644 index 00000000000..ae2d6f91618 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestFrenchMinimalStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the French minimal stem factory is working. 
+ */ +public class TestFrenchMinimalStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code chevaux} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code cheval}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("chevaux"); + FrenchMinimalStemFilterFactory factory = new FrenchMinimalStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "cheval" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestGermanLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestGermanLightStemFilterFactory.java new file mode 100644 index 00000000000..2c3950b4b2e --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestGermanLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the German light stem factory is working. 
+ */ +public class TestGermanLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code häuser} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code haus}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("häuser"); + GermanLightStemFilterFactory factory = new GermanLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "haus" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestGermanMinimalStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestGermanMinimalStemFilterFactory.java new file mode 100644 index 00000000000..cb1ab0242b8 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestGermanMinimalStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the German minimal stem factory is working. 
+ */ +public class TestGermanMinimalStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code bilder} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code bild}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("bilder"); + GermanMinimalStemFilterFactory factory = new GermanMinimalStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "bild" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestHungarianLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestHungarianLightStemFilterFactory.java new file mode 100644 index 00000000000..e78ef7554d1 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestHungarianLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Hungarian light stem factory is working. 
+ */ +public class TestHungarianLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code házakat} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code haz}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("házakat"); + HungarianLightStemFilterFactory factory = new HungarianLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "haz" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestItalianLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestItalianLightStemFilterFactory.java new file mode 100644 index 00000000000..c539841b1b7 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestItalianLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Italian light stem factory is working. 
+ */ +public class TestItalianLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the two-word input {@code ragazzo ragazzi} on whitespace, passes it through the filter built by the factory, and asserts both output tokens are {@code ragazz}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("ragazzo ragazzi"); + ItalianLightStemFilterFactory factory = new ItalianLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "ragazz", "ragazz" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestPortugueseLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestPortugueseLightStemFilterFactory.java new file mode 100644 index 00000000000..1dea46ba48f --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestPortugueseLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Portuguese Light stem factory is working. 
+ */ +public class TestPortugueseLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code evidentemente} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code evident}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("evidentemente"); + PortugueseLightStemFilterFactory factory = new PortugueseLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "evident" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestPortugueseMinimalStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestPortugueseMinimalStemFilterFactory.java new file mode 100644 index 00000000000..d6e144493f1 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestPortugueseMinimalStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Portuguese Minimal stem factory is working. 
+ */ +public class TestPortugueseMinimalStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code questões} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code questão}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("questões"); + PortugueseMinimalStemFilterFactory factory = new PortugueseMinimalStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "questão" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestRussianLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestRussianLightStemFilterFactory.java new file mode 100644 index 00000000000..aff5663c52d --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestRussianLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Russian light stem factory is working. 
+ */ +public class TestRussianLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code журналы} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code журнал}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("журналы"); + RussianLightStemFilterFactory factory = new RussianLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "журнал" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestSpanishLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestSpanishLightStemFilterFactory.java new file mode 100644 index 00000000000..2bdf8f356be --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestSpanishLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Spanish Light stem factory is working. 
+ */ +public class TestSpanishLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the input {@code sociedades} on whitespace, passes it through the filter built by the factory, and asserts the single output token is {@code sociedad}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("sociedades"); + SpanishLightStemFilterFactory factory = new SpanishLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "sociedad" }); + } +} diff --git a/solr/src/test/org/apache/solr/analysis/TestSwedishLightStemFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestSwedishLightStemFilterFactory.java new file mode 100644 index 00000000000..1590cf5ded0 --- /dev/null +++ b/solr/src/test/org/apache/solr/analysis/TestSwedishLightStemFilterFactory.java @@ -0,0 +1,36 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Simple tests to ensure the Swedish Light stem factory is working. 
+ */ +public class TestSwedishLightStemFilterFactory extends BaseTokenTestCase { + /** Tokenizes the two-word input {@code äpplen äpple} on whitespace, passes it through the filter built by the factory, and asserts both output tokens are {@code äppl}. */ public void testStemming() throws Exception { + Reader reader = new StringReader("äpplen äpple"); + SwedishLightStemFilterFactory factory = new SwedishLightStemFilterFactory(); + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); + assertTokenStreamContents(stream, new String[] { "äppl", "äppl" }); + } +}