LUCENE-2503: add light stemmers for european languages

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@964019 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-07-14 12:10:34 +00:00
parent d49603b939
commit 3241eb9291
95 changed files with 4686 additions and 367 deletions

View File

@ -187,6 +187,9 @@ New features
* LUCENE-2464: FastVectorHighlighter: add SingleFragListBuilder to return * LUCENE-2464: FastVectorHighlighter: add SingleFragListBuilder to return
entire field contents. (Koji Sekiguchi) entire field contents. (Koji Sekiguchi)
* LUCENE-2503: Added lighter stemming alternatives for European languages.
(Robert Muir)
Build Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation * LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -17,30 +17,29 @@ were developed by Martin Porter and Richard Boulton.
The full snowball package is available from The full snowball package is available from
http://snowball.tartarus.org/ http://snowball.tartarus.org/
The Arabic stemmer (common) comes with a default The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt. common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The Persian analyzer (common) comes with a default The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in (common) are based on BSD-licensed reference implementations created by Jacques Savoy and
common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt. Ljiljana Dolamic. These files reside in:
See http://members.unine.ch/jacques.savoy/clef/index.html. common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
The Romanian analyzer (common) comes with a default common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt. common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
See http://members.unine.ch/jacques.savoy/clef/index.html. common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
The Bulgarian analyzer (common) comes with a default common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt. common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
See http://members.unine.ch/jacques.savoy/clef/index.html. common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
The Hindi analyzer (common) comes with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.
The Stempel analyzer (stempel) includes BSD-licensed software developed The Stempel analyzer (stempel) includes BSD-licensed software developed
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ar;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Normalizer for Arabic. * Normalizer for Arabic.
* <p> * <p>
@ -96,20 +98,4 @@ public class ArabicNormalizer {
return len; return len;
} }
/**
* Delete a character in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len length of input buffer
* @return length of input buffer after deletion
*/
protected int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
} }

View File

@ -1,4 +1,6 @@
package org.apache.lucene.analysis.ar; package org.apache.lucene.analysis.ar;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -16,6 +18,8 @@ package org.apache.lucene.analysis.ar;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Stemmer for Arabic. * Stemmer for Arabic.
* <p> * <p>
@ -86,7 +90,7 @@ public class ArabicStemmer {
*/ */
public int stemPrefix(char s[], int len) { public int stemPrefix(char s[], int len) {
for (int i = 0; i < prefixes.length; i++) for (int i = 0; i < prefixes.length; i++)
if (startsWith(s, len, prefixes[i])) if (startsWithCheckLength(s, len, prefixes[i]))
return deleteN(s, 0, len, prefixes[i].length); return deleteN(s, 0, len, prefixes[i].length);
return len; return len;
} }
@ -99,7 +103,7 @@ public class ArabicStemmer {
*/ */
public int stemSuffix(char s[], int len) { public int stemSuffix(char s[], int len) {
for (int i = 0; i < suffixes.length; i++) for (int i = 0; i < suffixes.length; i++)
if (endsWith(s, len, suffixes[i])) if (endsWithCheckLength(s, len, suffixes[i]))
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length); len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
return len; return len;
} }
@ -111,7 +115,7 @@ public class ArabicStemmer {
* @param prefix prefix to check * @param prefix prefix to check
* @return true if the prefix matches and can be stemmed * @return true if the prefix matches and can be stemmed
*/ */
boolean startsWith(char s[], int len, char prefix[]) { boolean startsWithCheckLength(char s[], int len, char prefix[]) {
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
return false; return false;
} else if (len < prefix.length + 2) { // other prefixes require only 2. } else if (len < prefix.length + 2) { // other prefixes require only 2.
@ -132,7 +136,7 @@ public class ArabicStemmer {
* @param suffix suffix to check * @param suffix suffix to check
* @return true if the suffix matches and can be stemmed * @return true if the suffix matches and can be stemmed
*/ */
boolean endsWith(char s[], int len, char suffix[]) { boolean endsWithCheckLength(char s[], int len, char suffix[]) {
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
return false; return false;
} else { } else {
@ -143,36 +147,4 @@ public class ArabicStemmer {
return true; return true;
} }
} }
/**
* Delete n characters in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len Length of input buffer
* @param nChars number of characters to delete
* @return length of input buffer after deletion
*/
protected int deleteN(char s[], int pos, int len, int nChars) {
for (int i = 0; i < nChars; i++)
len = delete(s, pos, len);
return len;
}
/**
* Delete a character in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len length of input buffer
* @return length of input buffer after deletion
*/
protected int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
} }

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.bg;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Light Stemmer for Bulgarian. * Light Stemmer for Bulgarian.
* <p> * <p>
@ -138,15 +140,4 @@ public class BulgarianStemmer {
return len; return len;
} }
private boolean endsWith(final char s[], final int len, final String suffix) {
final int suffixLen = suffix.length();
if (suffixLen > len)
return false;
for (int i = suffixLen - 1; i >= 0; i--)
if (s[len -(suffixLen - i)] != suffix.charAt(i))
return false;
return true;
}
} }

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Light Stemmer for Czech. * Light Stemmer for Czech.
* <p> * <p>
@ -166,16 +168,4 @@ public class CzechStemmer {
return len; return len;
} }
private boolean endsWith(char s[], int len, String suffix) {
int suffixLen = suffix.length();
if (suffixLen > len)
return false;
for (int i = suffixLen - 1; i >= 0; i--)
if (s[len - (suffixLen - i)] != suffix.charAt(i))
return false;
return true;
}
} }

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.de;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link GermanLightStemmer} to stem German
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class GermanLightStemFilter extends TokenFilter {
  private final GermanLightStemmer stemmer = new GermanLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a filter that stems the tokens produced by the given stream.
   *
   * @param input the upstream token stream
   */
  public GermanLightStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token. Unless the token is flagged as a
   * keyword, its term buffer and length are handed to the
   * {@link GermanLightStemmer} and the term length is set to the value it returns.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword tokens pass through untouched
      return true;
    }
    final char[] termBuffer = termAtt.buffer();
    final int termLength = termAtt.length();
    termAtt.setLength(stemmer.stem(termBuffer, termLength));
    return true;
  }
}

View File

@ -0,0 +1,138 @@
package org.apache.lucene.analysis.de;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Light Stemmer for German.
* <p>
* This stemmer implements the "UniNE" algorithm in:
* <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
* Jacques Savoy
*/
public class GermanLightStemmer {

  /** Consonants after which a trailing "s" / "st" is treated as a removable ending. */
  private static final String ST_ENDINGS = "bdfghklmnt";

  /**
   * Stems a term in place: folds accented a/o/i/u variants to their base
   * letter, then applies the two suffix-removal steps.
   *
   * @param s   buffer holding the term; modified in place
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed term
   */
  public int stem(char s[], int len) {
    for (int pos = 0; pos < len; pos++) {
      s[pos] = foldAccent(s[pos]);
    }
    return step2(s, step1(s, len));
  }

  /** Maps accented a/o/i/u variants to the plain vowel; other characters are returned as-is. */
  private static char foldAccent(char ch) {
    switch (ch) {
      case 'ä': case 'à': case 'á': case 'â':
        return 'a';
      case 'ö': case 'ò': case 'ó': case 'ô':
        return 'o';
      case 'ï': case 'ì': case 'í': case 'î':
        return 'i';
      case 'ü': case 'ù': case 'ú': case 'û':
        return 'u';
      default:
        return ch;
    }
  }

  /** Returns true if {@code ch} is one of b, d, f, g, h, k, l, m, n, t. */
  private boolean stEnding(char ch) {
    return ST_ENDINGS.indexOf(ch) >= 0;
  }

  /** Returns true if the first {@code len} chars of {@code s} end with {@code suffix}; caller ensures len is large enough. */
  private static boolean hasSuffix(char[] s, int len, String suffix) {
    final int start = len - suffix.length();
    for (int k = 0; k < suffix.length(); k++) {
      if (s[start + k] != suffix.charAt(k)) {
        return false;
      }
    }
    return true;
  }

  /** First pass: removes "ern", then "em"/"en"/"er"/"es", then "e", then "s" after an st-ending consonant. */
  private int step1(char s[], int len) {
    if (len > 5 && hasSuffix(s, len, "ern")) {
      return len - 3;
    }

    if (len > 4 && s[len - 2] == 'e') {
      final char last = s[len - 1];
      if (last == 'm' || last == 'n' || last == 'r' || last == 's') {
        return len - 2;
      }
    }

    if (len > 3) {
      final char last = s[len - 1];
      if (last == 'e') {
        return len - 1;
      }
      if (last == 's' && stEnding(s[len - 2])) {
        return len - 1;
      }
    }

    return len;
  }

  /** Second pass: removes "est", then "er"/"en", then "st" after an st-ending consonant. */
  private int step2(char s[], int len) {
    if (len > 5 && hasSuffix(s, len, "est")) {
      return len - 3;
    }

    if (len > 4) {
      final char penult = s[len - 2];
      final char last = s[len - 1];
      if (penult == 'e' && (last == 'r' || last == 'n')) {
        return len - 2;
      }
      if (penult == 's' && last == 't' && stEnding(s[len - 3])) {
        return len - 2;
      }
    }

    return len;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.de;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link GermanMinimalStemmer} to stem German
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class GermanMinimalStemFilter extends TokenFilter {
  private final GermanMinimalStemmer stemmer = new GermanMinimalStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a filter that stems the tokens produced by the given stream.
   *
   * @param input the upstream token stream
   */
  public GermanMinimalStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token. Unless the token is flagged as a
   * keyword, its term buffer and length are handed to the
   * {@link GermanMinimalStemmer} and the term length is set to the value it returns.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword tokens pass through untouched
      return true;
    }
    final char[] termBuffer = termAtt.buffer();
    final int termLength = termAtt.length();
    termAtt.setLength(stemmer.stem(termBuffer, termLength));
    return true;
  }
}

View File

@ -0,0 +1,95 @@
package org.apache.lucene.analysis.de;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Minimal Stemmer for German.
* <p>
* This stemmer implements the following algorithm:
* <i>Morphologie et recherche d'information</i>
* Jacques Savoy.
*/
public class GermanMinimalStemmer {

  /**
   * Stems a term in place. Terms shorter than five characters are returned
   * untouched (no umlaut folding either). Otherwise umlauts are folded to
   * their base vowel and at most one ending is removed: "nen", then
   * "en"/"se"/"es"/"er", then a single trailing n/e/s/r.
   *
   * @param s   buffer holding the term; modified in place
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed term
   */
  public int stem(char s[], int len) {
    if (len < 5) {
      return len;
    }

    for (int pos = 0; pos < len; pos++) {
      final char ch = s[pos];
      if (ch == 'ä') {
        s[pos] = 'a';
      } else if (ch == 'ö') {
        s[pos] = 'o';
      } else if (ch == 'ü') {
        s[pos] = 'u';
      }
    }

    final char last = s[len - 1];
    final char penult = s[len - 2];

    if (len > 6 && last == 'n' && penult == 'e' && s[len - 3] == 'n') {
      return len - 3;
    }

    final boolean removableLast = last == 'n' || last == 'e' || last == 's' || last == 'r';
    if (!removableLast) {
      return len;
    }

    if (len > 5) {
      // two-letter endings: "en", "es", "er" and, for a trailing 'e', "se"
      final char requiredPenult = (last == 'e') ? 's' : 'e';
      if (penult == requiredPenult) {
        return len - 2;
      }
    }

    return len - 1;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.en;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link EnglishMinimalStemmer} to stem
* English words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class EnglishMinimalStemFilter extends TokenFilter {
  private final EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a filter that stems the tokens produced by the given stream.
   *
   * @param input the upstream token stream
   */
  public EnglishMinimalStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token. Unless the token is flagged as a
   * keyword, its term buffer and length are handed to the
   * {@link EnglishMinimalStemmer} and the term length is set to the value it returns.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword tokens pass through untouched
      return true;
    }
    final char[] termBuffer = termAtt.buffer();
    final int termLength = termAtt.length();
    termAtt.setLength(stemmer.stem(termBuffer, termLength));
    return true;
  }
}

View File

@ -0,0 +1,45 @@
package org.apache.lucene.analysis.en;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Minimal plural stemmer for English.
* <p>
* This stemmer implements the "S-Stemmer" from
* <i>How Effective Is Suffixing?</i>
* Donna Harman.
*/
public class EnglishMinimalStemmer {

  /**
   * Removes a plural ending from a term in place.
   * <ul>
   * <li>Terms shorter than three characters, or not ending in 's', are unchanged.</li>
   * <li>Terms ending in "us" or "ss" are unchanged.</li>
   * <li>"ies" not preceded by 'a' or 'e' becomes "y".</li>
   * <li>Other terms ending in "ies", "aes", "oes" or "ees" are unchanged.</li>
   * <li>Everything else loses its final 's'.</li>
   * </ul>
   *
   * @param s   buffer holding the term; may be modified in place
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed term
   */
  public int stem(char s[], int len) {
    if (len < 3 || s[len - 1] != 's') {
      return len;
    }

    final char penult = s[len - 2];
    if (penult == 'u' || penult == 's') {
      return len;
    }

    if (penult == 'e') {
      final char third = s[len - 3];
      if (len > 3 && third == 'i' && s[len - 4] != 'a' && s[len - 4] != 'e') {
        s[len - 3] = 'y';
        return len - 2;
      }
      if (third == 'i' || third == 'a' || third == 'o' || third == 'e') {
        return len;
      }
      // any other "-es" word just drops the trailing 's' below
    }

    return len - 1;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.es;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link SpanishLightStemmer} to stem Spanish
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class SpanishLightStemFilter extends TokenFilter {
  private final SpanishLightStemmer stemmer = new SpanishLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a filter that stems the tokens produced by the given stream.
   *
   * @param input the upstream token stream
   */
  public SpanishLightStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token. Unless the token is flagged as a
   * keyword, its term buffer and length are handed to the
   * {@link SpanishLightStemmer} and the term length is set to the value it returns.
   *
   * @return {@code true} if a token was produced, {@code false} once the
   *         wrapped stream is exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // keyword tokens pass through untouched
      return true;
    }
    final char[] termBuffer = termAtt.buffer();
    final int termLength = termAtt.length();
    termAtt.setLength(stemmer.stem(termBuffer, termLength));
    return true;
  }
}

View File

@ -0,0 +1,109 @@
package org.apache.lucene.analysis.es;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Light Stemmer for Spanish
* <p>
* This stemmer implements the algorithm described in:
* <i>Report on CLEF-2001 Experiments</i>
* Jacques Savoy
*/
public class SpanishLightStemmer {

  /**
   * Stems a term in place. Terms shorter than five characters are returned
   * untouched (no accent folding either). Otherwise accented vowels are folded
   * to their base letter and one ending is removed: a final o/a/e, or a
   * plural "os"/"as"/"es" (with "ces" rewritten to "z").
   *
   * @param s   buffer holding the term; modified in place
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed term
   */
  public int stem(char s[], int len) {
    if (len < 5) {
      return len;
    }

    for (int pos = 0; pos < len; pos++) {
      s[pos] = foldAccent(s[pos]);
    }

    final char last = s[len - 1];
    if (last == 'o' || last == 'a' || last == 'e') {
      return len - 1;
    }
    if (last != 's') {
      return len;
    }

    final char penult = s[len - 2];
    if (penult == 'e') {
      // Covers "-eses", "-ces" and plain "-es": all drop two characters,
      // and only "-ces" additionally turns the 'c' into 'z'.
      if (s[len - 3] == 'c') {
        s[len - 3] = 'z';
      }
      return len - 2;
    }
    if (penult == 'o' || penult == 'a') {
      return len - 2;
    }

    return len;
  }

  /** Maps accented a/o/e/u/i variants to the plain vowel; other characters are returned as-is. */
  private static char foldAccent(char ch) {
    switch (ch) {
      case 'à': case 'á': case 'â': case 'ä':
        return 'a';
      case 'ò': case 'ó': case 'ô': case 'ö':
        return 'o';
      case 'è': case 'é': case 'ê': case 'ë':
        return 'e';
      case 'ù': case 'ú': case 'û': case 'ü':
        return 'u';
      case 'ì': case 'í': case 'î': case 'ï':
        return 'i';
      default:
        return ch;
    }
  }
}

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.fa;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Normalizer for Persian. * Normalizer for Persian.
* <p> * <p>
@ -82,20 +84,4 @@ public class PersianNormalizer {
return len; return len;
} }
/**
* Delete a character in-place
*
* @param s Input Buffer
* @param pos Position of character to delete
* @param len length of input buffer
* @return length of input buffer after deletion
*/
protected int delete(char s[], int pos, int len) {
if (pos < len)
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
return len - 1;
}
} }

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.fi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * {@link TokenFilter} that stems Finnish tokens in place with
 * {@link FinnishLightStemmer}.
 * <p>
 * Tokens whose {@link KeywordAttribute} is set are passed through untouched.
 * Place a {@link KeywordMarkerFilter}, or any custom {@link TokenFilter} that
 * sets that attribute, ahead of this filter in the {@link TokenStream} chain
 * to protect individual terms from stemming.
 * </p>
 */
public final class FinnishLightStemFilter extends TokenFilter {
  private final FinnishLightStemmer stemmer = new FinnishLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public FinnishLightStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // Upstream is exhausted: nothing to stem.
    if (!input.incrementToken()) {
      return false;
    }
    // Keyword tokens are emitted exactly as received.
    if (keywordAttr.isKeyword()) {
      return true;
    }
    // The stemmer edits the term buffer directly and reports the new length.
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    return true;
  }
}

View File

@ -0,0 +1,259 @@
package org.apache.lucene.analysis.fi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
 * Light Stemmer for Finnish.
 * <p>
 * This stemmer implements the algorithm described in:
 * <i>Report on CLEF-2003 Monolingual Tracks</i>
 * Jacques Savoy
 * <p>
 * Stemming is performed in place on a caller-supplied {@code char[]}; each
 * step returns the new logical length of the term rather than a new array.
 * {@code endsWith} and {@code delete} are the statically imported
 * {@code StemmerUtil} helpers (presumably a suffix test over the first
 * {@code len} chars and an in-place single-character removal -- confirm
 * against {@code StemmerUtil}).
 */
public class FinnishLightStemmer {

  /**
   * Stems the term held in the first {@code len} characters of {@code s}.
   * <p>
   * Terms shorter than 4 characters are returned unchanged. Otherwise
   * 'ä'/'å' are rewritten to 'a' and 'ö' to 'o', then {@code step1},
   * {@code step2}, {@code step3}, {@code norm1} and {@code norm2} are applied
   * in that order, each one working on the length produced by the previous.
   *
   * @param s   term buffer; modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the stemmed term
   */
  public int stem(char s[], int len) {
    if (len < 4)
      return len;

    // fold the umlauted/ringed vowels onto their base letters
    for (int i = 0; i < len; i++)
      switch(s[i]) {
        case 'ä':
        case 'å': s[i] = 'a'; break;
        case 'ö': s[i] = 'o'; break;
      }

    len = step1(s, len);
    len = step2(s, len);
    len = step3(s, len);
    len = norm1(s, len);
    len = norm2(s, len);
    return len;
  }

  /**
   * Strips a trailing "kin" or "ko" (terms longer than 8 chars), re-running
   * itself on the shortened term so stacked endings are peeled off one by
   * one; for terms longer than 11 chars also strips "dellinen"/"dellisuus".
   */
  private int step1(char s[], int len) {
    if (len > 8) {
      if (endsWith(s, len, "kin"))
        return step1(s, len-3);
      if (endsWith(s, len, "ko"))
        return step1(s, len-2);
    }

    if (len > 11) {
      if (endsWith(s, len, "dellinen"))
        return len-8;
      if (endsWith(s, len, "dellisuus"))
        return len-9;
    }
    return len;
  }

  /**
   * For terms longer than 5 chars: drops "lla", "tse" or "sti" (3 chars),
   * else "ni" (2 chars), else shortens a final "aa" to "a". Only the first
   * matching rule fires.
   */
  private int step2(char s[], int len) {
    if (len > 5) {
      if (endsWith(s, len, "lla")
          || endsWith(s, len, "tse")
          || endsWith(s, len, "sti"))
        return len-3;

      if (endsWith(s, len, "ni"))
        return len-2;

      if (endsWith(s, len, "aa"))
        return len-1; // aa -> a
    }
    return len;
  }

  /**
   * Main suffix-removal pass. Rules are grouped by minimum term length
   * (longest first: &gt; 8, &gt; 6, &gt; 5, &gt; 4) and the first rule that
   * matches returns immediately, so the order of the checks is significant.
   * Some rules also overwrite a character to leave an 's' at the end of the
   * stem.
   */
  private int step3(char s[], int len) {
    if (len > 8) {
      if (endsWith(s, len, "nnen")) {
        // "nnen" -> "s"
        s[len-4] = 's';
        return len-3;
      }

      if (endsWith(s, len, "ntena")) {
        // "ntena" -> "s"
        s[len-5] = 's';
        return len-4;
      }

      if (endsWith(s, len, "tten"))
        return len-4;

      if (endsWith(s, len, "eiden"))
        return len-5;
    }

    if (len > 6) {
      if (endsWith(s, len, "neen")
          || endsWith(s, len, "niin")
          || endsWith(s, len, "seen")
          || endsWith(s, len, "teen")
          || endsWith(s, len, "inen"))
        return len-4;

      // 'h' + vowel + 'n'
      if (s[len-3] == 'h' && isVowel(s[len-2]) && s[len-1] == 'n')
        return len-3;

      if (endsWith(s, len, "den")) {
        // "den" -> "s"
        s[len-3] = 's';
        return len-2;
      }

      if (endsWith(s, len, "ksen")) {
        // "ksen" -> "s"
        s[len-4] = 's';
        return len-3;
      }

      if (endsWith(s, len, "ssa")
          || endsWith(s, len, "sta")
          || endsWith(s, len, "lla")
          || endsWith(s, len, "lta")
          || endsWith(s, len, "tta")
          || endsWith(s, len, "ksi")
          || endsWith(s, len, "lle"))
        return len-3;
    }

    if (len > 5) {
      if (endsWith(s, len, "na")
          || endsWith(s, len, "ne"))
        return len-2;

      if (endsWith(s, len, "nei"))
        return len-3;
    }

    if (len > 4) {
      if (endsWith(s, len, "ja")
          || endsWith(s, len, "ta"))
        return len-2;

      if (s[len-1] == 'a')
        return len-1;

      // vowel + 'n' loses both characters; a bare trailing 'n' loses one
      if (s[len-1] == 'n' && isVowel(s[len-2]))
        return len-2;

      if (s[len-1] == 'n')
        return len-1;
    }

    return len;
  }

  /**
   * First normalization pass: rewrites a final "hde" to "ksi" (terms longer
   * than 5 chars, length unchanged), then strips "ei"/"at" (longer than 4),
   * otherwise strips a single trailing t, s, j, e, a or i (longer than 3).
   */
  private int norm1(char s[], int len) {
    if (len > 5 && endsWith(s, len, "hde")) {
        s[len-3] = 'k';
        s[len-2] = 's';
        s[len-1] = 'i';
    }

    if (len > 4) {
      if (endsWith(s, len, "ei") || endsWith(s, len, "at"))
        return len-2;
    }

    if (len > 3)
      switch(s[len-1]) {
        case 't':
        case 's':
        case 'j':
        case 'e':
        case 'a':
        case 'i': return len-1;
      }

    return len;
  }

  /**
   * Second normalization pass: drops a trailing e/o/u from terms longer than
   * 8 chars, drops a trailing 'i' from terms longer than 4 chars, and, if the
   * term is still longer than 4 chars, collapses runs of the same consonant
   * among k, p and t down to a single occurrence.
   */
  private int norm2(char s[], int len) {
    if (len > 8) {
      if (s[len-1] == 'e'
          || s[len-1] == 'o'
          || s[len-1] == 'u')
        len--;
    }

    if (len > 4) {
      if (s[len-1] == 'i')
        len--;

      if (len > 4) {
        // ch tracks the previous surviving character
        char ch = s[0];
        for (int i = 1; i < len; i++) {
          if (s[i] == ch &&
              (ch == 'k' || ch == 'p' || ch == 't'))
            // remove the duplicate and revisit index i, which now holds the next char
            len = delete(s, i--, len);
          else
            ch = s[i];
        }
      }
    }

    return len;
  }

  /** Returns true for the letters a, e, i, o, u and y. */
  private boolean isVowel(char ch) {
    switch(ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
      case 'y': return true;
      default: return false;
    }
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * {@link TokenFilter} that stems French tokens in place with
 * {@link FrenchLightStemmer}.
 * <p>
 * Tokens whose {@link KeywordAttribute} is set are passed through untouched.
 * Place a {@link KeywordMarkerFilter}, or any custom {@link TokenFilter} that
 * sets that attribute, ahead of this filter in the {@link TokenStream} chain
 * to protect individual terms from stemming.
 * </p>
 */
public final class FrenchLightStemFilter extends TokenFilter {
  private final FrenchLightStemmer stemmer = new FrenchLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public FrenchLightStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // Upstream is exhausted: nothing to stem.
    if (!input.incrementToken()) {
      return false;
    }
    // Keyword tokens are emitted exactly as received.
    if (keywordAttr.isKeyword()) {
      return true;
    }
    // The stemmer edits the term buffer directly and reports the new length.
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    return true;
  }
}

View File

@ -0,0 +1,267 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
 * Light Stemmer for French.
 * <p>
 * This stemmer implements the "UniNE" algorithm in:
 * <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
 * Jacques Savoy
 * <p>
 * Stemming is performed in place on a caller-supplied {@code char[]}; the
 * methods return the new logical length of the term rather than a new array.
 * {@code endsWith} and {@code delete} are the statically imported
 * {@code StemmerUtil} helpers.
 */
public class FrenchLightStemmer {

  /**
   * Stems the term held in the first {@code len} characters of {@code s}.
   * <p>
   * Plural markers ('x', 's') are removed first, then an ordered list of
   * derivational suffix rules is tried. Most rules finish by handing the
   * shortened term to {@link #norm(char[], int)}; a few ("trice", "ète",
   * "ique") only rewrite the ending and deliberately fall through so later
   * rules can still fire.
   *
   * @param s   term buffer; modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the stemmed term
   */
  public int stem(char s[], int len) {
    // plural in -x; "-aux" becomes "-al" unless preceded by 'e' ("-eaux")
    if (len > 5 && s[len-1] == 'x') {
      if (s[len-3] == 'a' && s[len-2] == 'u' && s[len-4] != 'e')
        s[len-2] = 'l';
      len--;
    }

    if (len > 3 && s[len-1] == 'x')
      len--;

    if (len > 3 && s[len-1] == 's')
      len--;

    // -issement -> -ir
    if (len > 9 && endsWith(s, len, "issement")) {
      len -= 6;
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -issant -> -ir
    if (len > 8 && endsWith(s, len, "issant")) {
      len -= 4;
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -ement -> -e, and a resulting -ive -> -if
    if (len > 6 && endsWith(s, len, "ement")) {
      len -= 4;
      if (len > 3 && endsWith(s, len, "ive")) {
        len--;
        s[len-1] = 'f';
      }
      return norm(s, len);
    }

    // -ficatrice -> -fier
    if (len > 11 && endsWith(s, len, "ficatrice")) {
      len -= 5;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -ficateur -> -fier
    if (len > 10 && endsWith(s, len, "ficateur")) {
      len -= 4;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -catrice -> -quer
    if (len > 9 && endsWith(s, len, "catrice")) {
      len -= 3;
      s[len-4] = 'q';
      s[len-3] = 'u';
      s[len-2] = 'e';
      //s[len-1] = 'r' <-- unnecessary, already 'r'.
      return norm(s, len);
    }

    // -cateur -> -quer
    if (len > 8 && endsWith(s, len, "cateur")) {
      len -= 2;
      s[len-4] = 'q';
      s[len-3] = 'u';
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -atrice -> -er
    if (len > 8 && endsWith(s, len, "atrice")) {
      len -= 4;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -ateur -> -er
    if (len > 7 && endsWith(s, len, "ateur")) {
      len -= 3;
      s[len-2] = 'e';
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -trice -> -teur; no return, so the "teur" rule below also applies
    if (len > 6 && endsWith(s, len, "trice")) {
      len--;
      s[len-3] = 'e';
      s[len-2] = 'u';
      s[len-1] = 'r';
    }

    if (len > 5 && endsWith(s, len, "ième"))
      return norm(s, len-4);

    // -teuse -> -ter
    if (len > 7 && endsWith(s, len, "teuse")) {
      len -= 2;
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -teur -> -ter
    if (len > 6 && endsWith(s, len, "teur")) {
      len--;
      s[len-1] = 'r';
      return norm(s, len);
    }

    // -euse -> -eu
    if (len > 5 && endsWith(s, len, "euse"))
      return norm(s, len-2);

    // -ère -> -er
    if (len > 8 && endsWith(s, len, "ère")) {
      len--;
      s[len-2] = 'e';
      return norm(s, len);
    }

    // -ive -> -if
    if (len > 7 && endsWith(s, len, "ive")) {
      len--;
      s[len-1] = 'f';
      return norm(s, len);
    }

    // folle -> fou, molle -> mou
    if (len > 4 &&
        (endsWith(s, len, "folle") ||
         endsWith(s, len, "molle"))) {
      len -= 2;
      s[len-1] = 'u';
      return norm(s, len);
    }

    // -nnelle / -nnel -> -n
    if (len > 9 && endsWith(s, len, "nnelle"))
      return norm(s, len-5);

    if (len > 9 && endsWith(s, len, "nnel"))
      return norm(s, len-3);

    // -ète -> -et; no return, later rules may still apply
    if (len > 4 && endsWith(s, len, "ète")) {
      len--;
      s[len-2] = 'e';
    }

    // -ique is dropped; no return, later rules may still apply
    if (len > 8 && endsWith(s, len, "ique"))
      len -= 4;

    // -esse -> -e
    if (len > 8 && endsWith(s, len, "esse"))
      return norm(s, len-3);

    // -inage -> -in
    if (len > 7 && endsWith(s, len, "inage"))
      return norm(s, len-3);

    // -isation is dropped, and a resulting -ual -> -uel
    if (len > 9 && endsWith(s, len, "isation")) {
      len -= 7;
      if (len > 5 && endsWith(s, len, "ual"))
        s[len-2] = 'e';
      return norm(s, len);
    }

    if (len > 9 && endsWith(s, len, "isateur"))
      return norm(s, len-7);

    if (len > 8 && endsWith(s, len, "ation"))
      return norm(s, len-5);

    if (len > 8 && endsWith(s, len, "ition"))
      return norm(s, len-5);

    return norm(s, len);
  }

  /**
   * Normalizes a (possibly already shortened) term in place.
   * <p>
   * For terms longer than 4 chars: folds accented vowels and 'ç' onto their
   * base letters and collapses runs of the same <em>letter</em>; then strips
   * a final "ie", and finally a trailing 'r', up to two trailing 'e's and one
   * half of a doubled final letter.
   * <p>
   * Duplicate removal is restricted to letters so that non-alphabetic tokens
   * keep their value: without the {@link Character#isLetter(char)} guard a
   * token such as "10000" would be rewritten to "10".
   *
   * @param s   term buffer; modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the normalized term
   */
  private int norm(char s[], int len) {
    if (len > 4) {
      for (int i = 0; i < len; i++)
        switch(s[i]) {
          case 'à':
          case 'á':
          case 'â': s[i] = 'a'; break;
          case 'ô': s[i] = 'o'; break;
          case 'è':
          case 'é':
          case 'ê': s[i] = 'e'; break;
          case 'ù':
          case 'û': s[i] = 'u'; break;
          case 'î': s[i] = 'i'; break;
          case 'ç': s[i] = 'c'; break;
        }

      // ch tracks the previous surviving character
      char ch = s[0];
      for (int i = 1; i < len; i++) {
        if (s[i] == ch && Character.isLetter(ch))
          // remove the duplicate and revisit index i, which now holds the next char
          len = delete(s, i--, len);
        else
          ch = s[i];
      }
    }

    if (len > 4 && endsWith(s, len, "ie"))
      len -= 2;

    if (len > 4) {
      if (s[len-1] == 'r') len--;
      if (s[len-1] == 'e') len--;
      if (s[len-1] == 'e') len--;
      // only letters are de-duplicated; see method comment
      if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
    }
    return len;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * {@link TokenFilter} that stems French tokens in place with
 * {@link FrenchMinimalStemmer}.
 * <p>
 * Tokens whose {@link KeywordAttribute} is set are passed through untouched.
 * Place a {@link KeywordMarkerFilter}, or any custom {@link TokenFilter} that
 * sets that attribute, ahead of this filter in the {@link TokenStream} chain
 * to protect individual terms from stemming.
 * </p>
 */
public final class FrenchMinimalStemFilter extends TokenFilter {
  private final FrenchMinimalStemmer stemmer = new FrenchMinimalStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public FrenchMinimalStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // Upstream is exhausted: nothing to stem.
    if (!input.incrementToken()) {
      return false;
    }
    // Keyword tokens are emitted exactly as received.
    if (keywordAttr.isKeyword()) {
      return true;
    }
    // The stemmer edits the term buffer directly and reports the new length.
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    return true;
  }
}

View File

@ -0,0 +1,80 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * Minimal Stemmer for French.
 * <p>
 * This stemmer implements the following algorithm:
 * <i>A Stemming procedure and stopword list for general French corpora.</i>
 * Jacques Savoy.
 */
public class FrenchMinimalStemmer {

  /**
   * Stems the term held in the first {@code len} characters of {@code s},
   * in place.
   * <p>
   * Terms shorter than 6 characters are returned unchanged. A term ending in
   * 'x' loses that 'x' (with "-aux" rewritten to "-al") and is returned
   * immediately. Otherwise a trailing 's', 'r', 'e' and 'é' are each removed
   * once, in that order, and finally a doubled last letter is reduced to a
   * single one.
   * <p>
   * The doubled-character rule applies to letters only, so purely numeric
   * tokens keep their value (e.g. "100000" is not turned into "10000").
   *
   * @param s   term buffer; modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the stemmed term
   */
  public int stem(char s[], int len) {
    if (len < 6)
      return len;

    if (s[len-1] == 'x') {
      // -aux -> -al (chevaux -> cheval); any other -x is simply dropped
      if (s[len-3] == 'a' && s[len-2] == 'u')
        s[len-2] = 'l';
      return len - 1;
    }

    // len >= 6 on entry and at most four characters are removed here,
    // so s[len-2] below is always a valid index
    if (s[len-1] == 's') len--;
    if (s[len-1] == 'r') len--;
    if (s[len-1] == 'e') len--;
    if (s[len-1] == 'é') len--;
    if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
    return len;
  }
}

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hi;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Normalizer for Hindi. * Normalizer for Hindi.
* <p> * <p>
@ -176,19 +178,4 @@ public class HindiNormalizer {
return len; return len;
} }
/**
 * Deletes one character from the buffer in place; everything after
 * {@code pos} moves one position towards the front.
 *
 * @param s   buffer holding the term
 * @param pos index of the character to delete
 * @param len number of valid characters in {@code s}
 * @return the length after deletion, which is always {@code len - 1}
 */
protected int delete(char s[], int pos, int len) {
  final int newLen = len - 1;
  if (pos < len) {
    // newLen - pos characters follow the deleted slot
    System.arraycopy(s, pos + 1, s, pos, newLen - pos);
  }
  return newLen;
}
} }

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hi;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Light Stemmer for Hindi. * Light Stemmer for Hindi.
* <p> * <p>
@ -116,15 +118,4 @@ public class HindiStemmer {
return len - 1; return len - 1;
return len; return len;
} }
/**
 * Tells whether the first {@code len} characters of {@code s} end with
 * {@code suffix}. A suffix longer than {@code len} never matches.
 */
private boolean endsWith(final char s[], final int len, final String suffix) {
  final int suffixLen = suffix.length();
  if (suffixLen > len) {
    return false;
  }
  // index in s where the suffix would have to begin
  final int offset = len - suffixLen;
  int k = suffixLen;
  // compare from the last character backwards
  while (--k >= 0) {
    if (s[offset + k] != suffix.charAt(k)) {
      return false;
    }
  }
  return true;
}
} }

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.hu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * {@link TokenFilter} that stems Hungarian tokens in place with
 * {@link HungarianLightStemmer}.
 * <p>
 * Tokens whose {@link KeywordAttribute} is set are passed through untouched.
 * Place a {@link KeywordMarkerFilter}, or any custom {@link TokenFilter} that
 * sets that attribute, ahead of this filter in the {@link TokenStream} chain
 * to protect individual terms from stemming.
 * </p>
 */
public final class HungarianLightStemFilter extends TokenFilter {
  private final HungarianLightStemmer stemmer = new HungarianLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public HungarianLightStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // Upstream is exhausted: nothing to stem.
    if (!input.incrementToken()) {
      return false;
    }
    // Keyword tokens are emitted exactly as received.
    if (keywordAttr.isKeyword()) {
      return true;
    }
    // The stemmer edits the term buffer directly and reports the new length.
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    return true;
  }
}

View File

@ -0,0 +1,238 @@
package org.apache.lucene.analysis.hu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
 * Light Stemmer for Hungarian.
 * <p>
 * This stemmer implements the "UniNE" algorithm in:
 * <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
 * Jacques Savoy
 * <p>
 * Stemming is performed in place on a caller-supplied {@code char[]}; each
 * step returns the new logical length of the term rather than a new array.
 * {@code endsWith} is the statically imported {@code StemmerUtil} helper
 * (presumably a suffix test over the first {@code len} chars -- confirm
 * against {@code StemmerUtil}).
 */
public class HungarianLightStemmer {

  /**
   * Stems the term held in the first {@code len} characters of {@code s}.
   * <p>
   * Accented vowels are first folded onto a, e, i, o, u; then
   * {@code removeCase}, {@code removePossessive}, {@code removePlural} and
   * {@code normalize} are applied in that order, each working on the length
   * produced by the previous step. There is no minimum-length check here;
   * every step carries its own length guards.
   *
   * @param s   term buffer; modified in place
   * @param len number of valid characters in {@code s}
   * @return length of the stemmed term
   */
  public int stem(char s[], int len) {
    for (int i = 0; i < len; i++)
      switch(s[i]) {
        case 'á': s[i] = 'a'; break;
        case 'ë':
        case 'é': s[i] = 'e'; break;
        case 'í': s[i] = 'i'; break;
        case 'ó':
        case 'ő':
        case 'õ':
        case 'ö': s[i] = 'o'; break;
        case 'ú':
        case 'ű':
        case 'ũ':
        case 'û':
        case 'ü': s[i] = 'u'; break;
      }

    len = removeCase(s, len);
    len = removePossessive(s, len);
    len = removePlural(s, len);
    return normalize(s, len);
  }

  /**
   * Removes one case-style ending. Rules are grouped by minimum term length
   * (&gt; 6, &gt; 5, &gt; 4) and the first rule that matches returns
   * immediately, so the order of the checks is significant.
   */
  private int removeCase(char s[], int len) {
    if (len > 6 && endsWith(s, len, "kent"))
      return len - 4;

    if (len > 5) {
      if (endsWith(s, len, "nak") ||
          endsWith(s, len, "nek") ||
          endsWith(s, len, "val") ||
          endsWith(s, len, "vel") ||
          endsWith(s, len, "ert") ||
          endsWith(s, len, "rol") ||
          endsWith(s, len, "ban") ||
          endsWith(s, len, "ben") ||
          endsWith(s, len, "bol") ||
          endsWith(s, len, "nal") ||
          endsWith(s, len, "nel") ||
          endsWith(s, len, "hoz") ||
          endsWith(s, len, "hez") ||
          endsWith(s, len, "tol"))
        return len - 3;

      // "al"/"el" after a doubled consonant: drop the ending and one consonant
      if (endsWith(s, len, "al") || endsWith(s, len, "el")) {
        if (!isVowel(s[len-3]) && s[len-3] == s[len-4])
          return len - 3;
      }
    }

    if (len > 4) {
      if (endsWith(s, len, "at") ||
          endsWith(s, len, "et") ||
          endsWith(s, len, "ot") ||
          endsWith(s, len, "va") ||
          endsWith(s, len, "ve") ||
          endsWith(s, len, "ra") ||
          endsWith(s, len, "re") ||
          endsWith(s, len, "ba") ||
          endsWith(s, len, "be") ||
          endsWith(s, len, "ul") ||
          endsWith(s, len, "ig"))
        return len - 2;

      // "on"/"en" is removed only when it follows a consonant
      if ((endsWith(s, len, "on") || endsWith(s, len, "en")) && !isVowel(s[len-3]))
          return len - 2;

      switch(s[len-1]) {
        case 't':
        case 'n': return len - 1;
        case 'a':
        // trailing a/e after a doubled consonant: drop the vowel and one consonant
        case 'e': if (s[len-2] == s[len-3] && !isVowel(s[len-2])) return len - 2;
      }
    }

    return len;
  }

  /**
   * Removes one possessive-style ending. Rules are grouped by minimum term
   * length (&gt; 6, &gt; 5, &gt; 4, &gt; 3); many of them additionally
   * require the character just before the ending to be a vowel or a
   * non-vowel. The first rule that matches returns immediately.
   */
  private int removePossessive(char s[], int len) {
    if (len > 6) {
      if (!isVowel(s[len-5]) &&
         (endsWith(s, len, "atok") ||
          endsWith(s, len, "otok") ||
          endsWith(s, len, "etek")))
        return len - 4;

      if (endsWith(s, len, "itek") || endsWith(s, len, "itok"))
        return len - 4;
    }

    if (len > 5) {
      if (!isVowel(s[len-4]) &&
        (endsWith(s, len, "unk") ||
         endsWith(s, len, "tok") ||
         endsWith(s, len, "tek")))
        return len - 3;

      if (isVowel(s[len-4]) && endsWith(s, len, "juk"))
        return len - 3;

      if (endsWith(s, len, "ink"))
        return len - 3;
    }

    if (len > 4) {
      if (!isVowel(s[len-3]) &&
         (endsWith(s, len, "am") ||
          endsWith(s, len, "em") ||
          endsWith(s, len, "om") ||
          endsWith(s, len, "ad") ||
          endsWith(s, len, "ed") ||
          endsWith(s, len, "od") ||
          endsWith(s, len, "uk")))
        return len - 2;

      if (isVowel(s[len-3]) &&
         (endsWith(s, len, "nk") ||
          endsWith(s, len, "ja") ||
          endsWith(s, len, "je")))
        return len - 2;

      if (endsWith(s, len, "im") ||
          endsWith(s, len, "id") ||
          endsWith(s, len, "ik"))
        return len - 2;
    }

    if (len > 3)
      switch(s[len-1]) {
        case 'a':
        // a/e is removed only after a non-vowel
        case 'e': if (!isVowel(s[len-2])) return len - 1; break;
        case 'm':
        // m/d is removed only after a vowel
        case 'd': if (isVowel(s[len-2])) return len - 1; break;
        case 'i': return len - 1;
      }

    return len;
  }

  /**
   * Removes a trailing 'k' from terms longer than 3 chars. When the 'k' is
   * preceded by a, o or e and the term is longer than 4 chars, that vowel is
   * removed as well; otherwise only the 'k' is dropped.
   */
  private int removePlural(char s[], int len) {
    if (len > 3 && s[len-1] == 'k')
      switch(s[len-2]) {
        case 'a':
        case 'o':
        case 'e': if (len > 4) return len - 2; /* intentional fallthru */
        default: return len - 1;
      }
    return len;
  }

  /**
   * Final step: drops a single trailing a, e, i or o from terms longer than
   * 3 chars.
   */
  private int normalize(char s[], int len) {
    if (len > 3)
      switch(s[len-1]) {
        case 'a':
        case 'e':
        case 'i':
        case 'o': return len - 1;
      }
    return len;
  }

  /** Returns true for the letters a, e, i, o, u and y. */
  private boolean isVowel(char ch) {
    switch(ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
      case 'y': return true;
      default: return false;
    }
  }
}

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.id;
* limitations under the License. * limitations under the License.
*/ */
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Stemmer for Indonesian. * Stemmer for Indonesian.
* <p> * <p>
@ -267,38 +269,4 @@ public class IndonesianStemmer {
} }
return length; return length;
} }
/**
 * Tells whether the first {@code len} characters of {@code s} begin with
 * {@code prefix}. A prefix longer than {@code len} never matches.
 */
private boolean startsWith(char s[], int len, String prefix) {
  final int prefixLen = prefix.length();
  if (prefixLen > len) {
    return false;
  }
  int idx = 0;
  while (idx < prefixLen) {
    if (s[idx] != prefix.charAt(idx)) {
      return false;
    }
    idx++;
  }
  return true;
}
/**
 * Tells whether the first {@code len} characters of {@code s} end with
 * {@code suffix}. A suffix longer than {@code len} never matches.
 */
private boolean endsWith(char s[], int len, String suffix) {
  final int suffixLen = suffix.length();
  if (suffixLen > len) {
    return false;
  }
  // index in s where the suffix would have to begin
  final int offset = len - suffixLen;
  int k = suffixLen;
  // compare from the last character backwards
  while (--k >= 0) {
    if (s[offset + k] != suffix.charAt(k)) {
      return false;
    }
  }
  return true;
}
/**
 * Deletes {@code nChars} characters starting at {@code pos} by calling
 * {@code delete} once per character, and returns the resulting length.
 */
private int deleteN(char s[], int pos, int len, int nChars) {
  int remaining = nChars;
  while (remaining-- > 0) {
    len = delete(s, pos, len);
  }
  return len;
}
/**
 * Deletes the character at {@code pos} in place by shifting the tail of the
 * buffer left; the returned length is always {@code len - 1}.
 */
private int delete(char s[], int pos, int len) {
  final int newLen = len - 1;
  if (pos < len) {
    // newLen - pos characters follow the deleted slot
    System.arraycopy(s, pos + 1, s, pos, newLen - pos);
  }
  return newLen;
}
} }

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.in;
import java.util.BitSet; import java.util.BitSet;
import java.util.IdentityHashMap; import java.util.IdentityHashMap;
import static java.lang.Character.UnicodeBlock.*; import static java.lang.Character.UnicodeBlock.*;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/** /**
* Normalizes the Unicode representation of text in Indian languages. * Normalizes the Unicode representation of text in Indian languages.
@ -290,14 +291,4 @@ public class IndicNormalizer {
return len; return len;
} }
/**
 * Removes the character at {@code pos} in place by shifting the tail of the
 * buffer left; the returned length is always {@code len - 1}.
 */
private int delete(char s[], int pos, int len) {
  final int shortened = len - 1;
  if (pos < len) {
    // (len - 1) - pos characters sit behind the removed slot
    System.arraycopy(s, pos + 1, s, pos, shortened - pos);
  }
  return shortened;
}
} }

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.it;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that runs each term through {@link ItalianLightStemmer}
 * to stem Italian words.
 * <p>
 * Terms flagged through the {@link KeywordAttribute} are left untouched. To
 * protect a term from stemming, place a {@link KeywordMarkerFilter} (or a
 * custom {@link TokenFilter} that sets the attribute) before this
 * {@link TokenStream}.
 * </p>
 */
public final class ItalianLightStemFilter extends TokenFilter {
  private final ItalianLightStemmer stemmer = new ItalianLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public ItalianLightStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAttr.isKeyword()) {
      // the stemmer edits the term buffer in place and reports the new length
      termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    }
    return true;
  }
}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.analysis.it;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * Light Stemmer for Italian.
 * <p>
 * This stemmer implements the algorithm described in:
 * <i>Report on CLEF-2001 Experiments</i>
 * Jacques Savoy
 * <p>
 * Words shorter than six characters are returned untouched (accents
 * included). Longer words first have their accented vowels folded to the
 * plain vowel, then lose their final vowel, together with a preceding
 * 'i' (or 'h' before a final 'e'/'i').
 */
public class ItalianLightStemmer {

  /**
   * Stems the word held in <code>s[0..len)</code> in place.
   *
   * @param s buffer holding the word; accented vowels may be rewritten
   * @param len number of valid characters in <code>s</code>
   * @return the length of the stem
   */
  public int stem(char s[], int len) {
    if (len < 6) {
      return len;
    }

    for (int i = 0; i < len; i++) {
      s[i] = foldAccent(s[i]);
    }

    final char last = s[len-1];
    final char prev = s[len-2];

    if (last == 'e' || last == 'i') {
      // -ie, -he, -hi, -ii lose two characters, any other -e / -i loses one
      return (prev == 'i' || prev == 'h') ? len - 2 : len - 1;
    }
    if (last == 'a' || last == 'o') {
      // -ia, -io lose two characters, any other -a / -o loses one
      return (prev == 'i') ? len - 2 : len - 1;
    }
    return len;
  }

  /** Maps an accented vowel to its unaccented form; other characters pass through. */
  private static char foldAccent(char c) {
    switch (c) {
      case 'à':
      case 'á':
      case 'â':
      case 'ä':
        return 'a';
      case 'ò':
      case 'ó':
      case 'ô':
      case 'ö':
        return 'o';
      case 'è':
      case 'é':
      case 'ê':
      case 'ë':
        return 'e';
      case 'ù':
      case 'ú':
      case 'û':
      case 'ü':
        return 'u';
      case 'ì':
      case 'í':
      case 'î':
      case 'ï':
        return 'i';
      default:
        return c;
    }
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that runs each term through
 * {@link PortugueseLightStemmer} to stem Portuguese words.
 * <p>
 * Terms flagged through the {@link KeywordAttribute} are left untouched. To
 * protect a term from stemming, place a {@link KeywordMarkerFilter} (or a
 * custom {@link TokenFilter} that sets the attribute) before this
 * {@link TokenStream}.
 * </p>
 */
public final class PortugueseLightStemFilter extends TokenFilter {
  private final PortugueseLightStemmer stemmer = new PortugueseLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public PortugueseLightStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAttr.isKeyword()) {
      // the stemmer edits the term buffer in place and reports the new length
      termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    }
    return true;
  }
}

View File

@ -0,0 +1,202 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
 * Light Stemmer for Portuguese.
 * <p>
 * Works in place on a character buffer in four passes: plural/adverb suffix
 * removal, feminine-to-masculine normalization, removal of a final vowel,
 * and finally folding of accented characters to their plain form.
 */
public class PortugueseLightStemmer {

  /**
   * Stems the word held in <code>s[0..len)</code> in place.
   *
   * @param s buffer holding the word; its contents may be rewritten
   * @param len number of valid characters in <code>s</code>
   * @return the length of the stem
   */
  public int stem(char s[], int len) {
    if (len < 4) {
      return len;
    }

    len = removeSuffix(s, len);

    if (len > 3 && s[len-1] == 'a') {
      len = normFeminine(s, len);
    }

    if (len > 4) {
      final char last = s[len-1];
      if (last == 'e' || last == 'a' || last == 'o') {
        len--;
      }
    }

    // accents are folded last: the rules above match on (and may write) accented forms
    for (int i = 0; i < len; i++) {
      s[i] = foldDiacritic(s[i]);
    }
    return len;
  }

  /** Maps an accented vowel or cedilla to its plain letter; other characters pass through. */
  private static char foldDiacritic(char c) {
    switch (c) {
      case 'à':
      case 'á':
      case 'â':
      case 'ä':
      case 'ã':
        return 'a';
      case 'ò':
      case 'ó':
      case 'ô':
      case 'ö':
      case 'õ':
        return 'o';
      case 'è':
      case 'é':
      case 'ê':
      case 'ë':
        return 'e';
      case 'ù':
      case 'ú':
      case 'û':
      case 'ü':
        return 'u';
      case 'ì':
      case 'í':
      case 'î':
      case 'ï':
        return 'i';
      case 'ç':
        return 'c';
      default:
        return c;
    }
  }

  /** Returns true if <code>s[0..len)</code> ends with at least one of the given suffixes. */
  private static boolean endsWithAny(char s[], int len, String... suffixes) {
    for (String suffix : suffixes) {
      if (endsWith(s, len, suffix)) {
        return true;
      }
    }
    return false;
  }

  /** Applies the first matching plural / adverb rule and returns the new length. */
  private int removeSuffix(char s[], int len) {
    if (len > 4 && endsWith(s, len, "es")) {
      // -es is only dropped after r, s, l or z; otherwise later rules get a chance
      final char before = s[len-3];
      if (before == 'r' || before == 's' || before == 'l' || before == 'z') {
        return len - 2;
      }
    }

    if (len > 3 && endsWith(s, len, "ns")) {
      // -ns -> -m
      s[len - 2] = 'm';
      return len - 1;
    }

    if (len > 4) {
      if (endsWithAny(s, len, "eis", "éis")) {
        // -eis / -éis -> -el
        s[len - 3] = 'e';
        s[len - 2] = 'l';
        return len - 1;
      }
      if (endsWith(s, len, "ais")) {
        // -ais -> -al
        s[len - 2] = 'l';
        return len - 1;
      }
      if (endsWith(s, len, "óis")) {
        // -óis -> -ol
        s[len - 3] = 'o';
        s[len - 2] = 'l';
        return len - 1;
      }
      if (endsWith(s, len, "is")) {
        // -is -> -il
        s[len - 1] = 'l';
        return len;
      }
    }

    if (len > 3 && endsWithAny(s, len, "ões", "ães")) {
      // -ões / -ães -> -ão
      s[len - 3] = 'ã';
      s[len - 2] = 'o';
      return len - 1;
    }

    if (len > 6 && endsWith(s, len, "mente")) {
      return len - 5;
    }

    if (len > 3 && s[len-1] == 's') {
      return len - 1;
    }

    return len;
  }

  /** Rewrites a feminine ending to its masculine counterpart and returns the new length. */
  private int normFeminine(char s[], int len) {
    if (len <= 6) {
      return len;
    }

    if (len > 7 && endsWithAny(s, len, "inha", "iaca", "eira")) {
      s[len - 1] = 'o';
      return len;
    }

    if (endsWithAny(s, len, "osa", "ica", "ida", "ada", "iva", "ama")) {
      s[len - 1] = 'o';
      return len;
    }

    if (endsWith(s, len, "ona")) {
      // -ona -> -ão
      s[len - 3] = 'ã';
      s[len - 2] = 'o';
      return len - 1;
    }

    if (endsWith(s, len, "ora")) {
      return len - 1;
    }

    if (endsWith(s, len, "esa")) {
      // -esa -> -ês
      s[len - 3] = 'ê';
      return len - 1;
    }

    // must stay after the -ona test, which also ends in "na"
    if (endsWith(s, len, "na")) {
      s[len - 1] = 'o';
      return len;
    }

    return len;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that runs each term through
 * {@link PortugueseMinimalStemmer} to stem Portuguese words.
 * <p>
 * Terms flagged through the {@link KeywordAttribute} are left untouched. To
 * protect a term from stemming, place a {@link KeywordMarkerFilter} (or a
 * custom {@link TokenFilter} that sets the attribute) before this
 * {@link TokenStream}.
 * </p>
 */
public final class PortugueseMinimalStemFilter extends TokenFilter {
  private final PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public PortugueseMinimalStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAttr.isKeyword()) {
      // the stemmer edits the term buffer in place and reports the new length
      termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    }
    return true;
  }
}

View File

@ -0,0 +1,119 @@
package org.apache.lucene.analysis.pt;
import java.util.Arrays;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Minimal Stemmer for Portuguese
 * <p>
 * This follows the "RSLP-S" algorithm presented in:
 * <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese
 * Information Retrieval</i> (Orengo, et al)
 * which is just the plural reduction step of the RSLP
 * algorithm from <i>A Stemming Algorithm for the Portuguese Language</i>,
 * Orengo et al.
 * <p>
 * The word is rewritten in place; only words whose last character is 's'
 * are ever changed.
 */
public class PortugueseMinimalStemmer {
// Words ending in "is" that are exempt from the generic "is" -> "il" rewrite.
// NOTE(review): the trailing 'false' is presumably CharArraySet's ignoreCase flag - confirm.
private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31,
Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois",
"depois","dois","leis"),
false);
// Words ending in 's' that keep their final 's' in the last, generic rule.
private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31,
Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos",
"férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés",
"através", "convés", "ês", "país", "após", "ambas", "ambos",
"messias", "depois"),
false);
/**
 * Reduces a plural form to its singular, in place.
 * <p>
 * Rules are tried in order and the first one that fires returns. A rule
 * whose exception check rejects the word falls through to the later rules.
 *
 * @param s buffer holding the word; may be rewritten
 * @param len number of valid characters in <code>s</code>
 * @return the length of the stem
 */
public int stem(char s[], int len) {
// too short, or not ending in 's': nothing to do
if (len < 3 || s[len-1] != 's')
return len;
// "ns" -> "m"
if (s[len-2] == 'n') {
len--;
s[len-1] = 'm';
return len;
}
// "ões" -> "ão"
if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') {
len--;
s[len-2] = 'ã';
s[len-1] = 'o';
return len;
}
// "ães" -> "ão", except for the four-letter word starting with 'm' ("mães"),
// which falls through to the rules below
if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e')
if (!(len == 4 && s[0] == 'm')) {
len--;
s[len-1] = 'o';
return len;
}
// family of "...is" endings
if (len >= 4 && s[len-2] == 'i') {
// "ais" -> "al", except four-letter words starting with 'c' or 'm' ("cais", "mais")
if (s[len-3] == 'a')
if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) {
len--;
s[len-1] = 'l';
return len;
}
// "éis" -> "el"
if (len >= 5 && s[len-3] == 'é') {
len--;
s[len-2] = 'e';
s[len-1] = 'l';
return len;
}
// "eis" -> "el"
if (len >= 5 && s[len-3] == 'e') {
len--;
s[len-1] = 'l';
return len;
}
// "óis" -> "ol"
if (len >= 5 && s[len-3] == 'ó') {
len--;
s[len-2] = 'o';
s[len-1] = 'l';
return len;
}
// any other "is" -> "il", unless the whole word is a listed exception
if (!excIS.contains(s, 0, len)) {
s[len-1] = 'l';
return len;
}
}
// "les" -> "l"
if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e')
return len - 2;
// "res" -> "r", except the seven-letter word starting with "árvo" ("árvores")
if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e')
if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o'))
return len - 2;
// generic rule: drop the final 's' unless the whole word is a listed exception
if (excS.contains(s, 0, len))
return len;
else
return len-1;
}
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that runs each term through {@link RussianLightStemmer}
 * to stem Russian words.
 * <p>
 * Terms flagged through the {@link KeywordAttribute} are left untouched. To
 * protect a term from stemming, place a {@link KeywordMarkerFilter} (or a
 * custom {@link TokenFilter} that sets the attribute) before this
 * {@link TokenStream}.
 * </p>
 */
public final class RussianLightStemFilter extends TokenFilter {
  private final RussianLightStemmer stemmer = new RussianLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public RussianLightStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAttr.isKeyword()) {
      // the stemmer edits the term buffer in place and reports the new length
      termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    }
    return true;
  }
}

View File

@ -0,0 +1,153 @@
package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Light Stemmer for Russian.
* <p>
* This stemmer implements the following algorithm:
* <i>Indexing and Searching Strategies for the Russian Language.</i>
* Ljiljana Dolamic and Jacques Savoy.
*/
public class RussianLightStemmer {
public int stem(char s[], int len) {
len = removeCase(s, len);
return normalize(s, len);
}
private int normalize(char s[], int len) {
if (len > 3)
switch(s[len-1]) {
case 'ь':
case 'и': return len - 1;
case 'н': if (s[len-2] == 'н') return len - 1;
}
return len;
}
private int removeCase(char s[], int len) {
if (len > 6 &&
(endsWith(s, len, "иями") ||
endsWith(s, len, "оями")))
return len - 4;
if (len > 5 &&
(endsWith(s, len, "иям") ||
endsWith(s, len, "иях") ||
endsWith(s, len, "оях") ||
endsWith(s, len, "ями") ||
endsWith(s, len, "оям") ||
endsWith(s, len, "оьв") ||
endsWith(s, len, "ами") ||
endsWith(s, len, "его") ||
endsWith(s, len, "ему") ||
endsWith(s, len, "ери") ||
endsWith(s, len, "ими") ||
endsWith(s, len, "ого") ||
endsWith(s, len, "ому") ||
endsWith(s, len, "ыми") ||
endsWith(s, len, "оев")))
return len - 3;
if (len > 4 &&
(endsWith(s, len, "ая") ||
endsWith(s, len, "яя") ||
endsWith(s, len, "ях") ||
endsWith(s, len, "юю") ||
endsWith(s, len, "ах") ||
endsWith(s, len, "ею") ||
endsWith(s, len, "их") ||
endsWith(s, len, "ия") ||
endsWith(s, len, "ию") ||
endsWith(s, len, "ьв") ||
endsWith(s, len, "ою") ||
endsWith(s, len, "ую") ||
endsWith(s, len, "ям") ||
endsWith(s, len, "ых") ||
endsWith(s, len, "ея") ||
endsWith(s, len, "ам") ||
endsWith(s, len, "ем") ||
endsWith(s, len, "ей") ||
endsWith(s, len, "ём") ||
endsWith(s, len, "ев") ||
endsWith(s, len, "ий") ||
endsWith(s, len, "им") ||
endsWith(s, len, "ое") ||
endsWith(s, len, "ой") ||
endsWith(s, len, "ом") ||
endsWith(s, len, "ов") ||
endsWith(s, len, "ые") ||
endsWith(s, len, "ый") ||
endsWith(s, len, "ым") ||
endsWith(s, len, "ми")))
return len - 2;
if (len > 3)
switch(s[len-1]) {
case 'а':
case 'е':
case 'и':
case 'о':
case 'у':
case 'й':
case 'ы':
case 'я':
case 'ь': return len - 1;
}
return len;
}
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.sv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * A {@link TokenFilter} that runs each term through {@link SwedishLightStemmer}
 * to stem Swedish words.
 * <p>
 * Terms flagged through the {@link KeywordAttribute} are left untouched. To
 * protect a term from stemming, place a {@link KeywordMarkerFilter} (or a
 * custom {@link TokenFilter} that sets the attribute) before this
 * {@link TokenStream}.
 * </p>
 */
public final class SwedishLightStemFilter extends TokenFilter {
  private final SwedishLightStemmer stemmer = new SwedishLightStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  public SwedishLightStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAttr.isKeyword()) {
      // the stemmer edits the term buffer in place and reports the new length
      termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    }
    return true;
  }
}

View File

@ -0,0 +1,111 @@
package org.apache.lucene.analysis.sv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This algorithm is updated based on code located at:
* http://members.unine.ch/jacques.savoy/clef/
*
* Full copyright for that code follows:
*/
/*
* Copyright (c) 2005, Jacques Savoy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the author nor the names
* of its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Light Stemmer for Swedish.
* <p>
* This stemmer implements the algorithm described in:
* <i>Report on CLEF-2003 Monolingual Tracks</i>
* Jacques Savoy
*/
public class SwedishLightStemmer {
public int stem(char s[], int len) {
if (len > 4 && s[len-1] == 's')
len--;
if (len > 7 &&
(endsWith(s, len, "elser") ||
endsWith(s, len, "heten")))
return len - 5;
if (len > 6 &&
(endsWith(s, len, "arne") ||
endsWith(s, len, "erna") ||
endsWith(s, len, "ande") ||
endsWith(s, len, "else") ||
endsWith(s, len, "aste") ||
endsWith(s, len, "orna") ||
endsWith(s, len, "aren")))
return len - 4;
if (len > 5 &&
(endsWith(s, len, "are") ||
endsWith(s, len, "ast") ||
endsWith(s, len, "het")))
return len - 3;
if (len > 4 &&
(endsWith(s, len, "ar") ||
endsWith(s, len, "er") ||
endsWith(s, len, "or") ||
endsWith(s, len, "en") ||
endsWith(s, len, "at") ||
endsWith(s, len, "te") ||
endsWith(s, len, "et")))
return len - 2;
if (len > 3)
switch(s[len-1]) {
case 't':
case 'a':
case 'e':
case 'n': return len - 1;
}
return len;
}
}

View File

@ -0,0 +1,89 @@
package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Some commonly-used stemming functions */
public class StemmerUtil {
  /**
   * Returns true if the character array starts with the prefix.
   *
   * @param s Input Buffer
   * @param len length of input buffer
   * @param prefix Prefix string to test
   * @return true if <code>s</code> starts with <code>prefix</code>
   */
  public static boolean startsWith(char s[], int len, String prefix) {
    final int prefixLen = prefix.length();
    if (prefixLen > len)
      return false;
    for (int i = 0; i < prefixLen; i++)
      if (s[i] != prefix.charAt(i))
        return false;
    return true;
  }

  /**
   * Returns true if the character array ends with the suffix.
   *
   * @param s Input Buffer
   * @param len length of input buffer
   * @param suffix Suffix string to test
   * @return true if <code>s</code> ends with <code>suffix</code>
   */
  public static boolean endsWith(char s[], int len, String suffix) {
    final int suffixLen = suffix.length();
    if (suffixLen > len)
      return false;
    // len - (suffixLen - i) is the buffer index aligned with suffix position i
    for (int i = suffixLen - 1; i >= 0; i--)
      if (s[len -(suffixLen - i)] != suffix.charAt(i))
        return false;
    return true;
  }

  /**
   * Delete a character in-place
   *
   * @param s Input Buffer
   * @param pos Position of character to delete
   * @param len length of input buffer
   * @return length of input buffer after deletion
   */
  public static int delete(char s[], int pos, int len) {
    if (pos < len)
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
    return len - 1;
  }

  /**
   * Delete n characters in-place
   * <p>
   * The tail following the deleted range is shifted left with a single copy
   * instead of once per deleted character.
   *
   * @param s Input Buffer
   * @param pos Position of the first character to delete
   * @param len Length of input buffer
   * @param nChars number of characters to delete
   * @return length of input buffer after deletion
   */
  public static int deleteN(char s[], int pos, int len, int nChars) {
    if (nChars <= 0)
      return len; // nothing to delete
    // number of characters that survive behind the deleted range
    final int tail = len - pos - nChars;
    if (tail > 0)
      System.arraycopy(s, pos + nChars, s, pos, tail);
    return len - nChars;
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.de;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link GermanLightStemFilter}
 */
public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by a {@link GermanLightStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final GermanLightStemFilter stemmer = new GermanLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /** Checks the analyzer against the reference implementation's vocabulary (delight.txt in delighttestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
  }
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.analysis.de;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link GermanMinimalStemFilter}
 */
public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by a {@link GermanMinimalStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final GermanMinimalStemFilter stemmer = new GermanMinimalStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /** Test some examples from the paper; each row is { input term, expected stem }. */
  public void testExamples() throws IOException {
    final String[][] examples = {
      { "sängerinnen", "sangerin" },
      { "frauen", "frau" },
      { "kenntnisse", "kenntnis" },
      { "staates", "staat" },
      { "bilder", "bild" },
      { "boote", "boot" },
      { "götter", "gott" },
      { "äpfel", "apfel" },
    };
    for (final String[] example : examples) {
      checkOneTerm(analyzer, example[0], example[1]);
    }
  }

  /** Checks the analyzer against the reference implementation's vocabulary (deminimal.txt in deminimaltestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
  }
}

View File

@ -17,17 +17,17 @@ package org.apache.lucene.analysis.de;
* limitations under the License. * limitations under the License.
*/ */
import java.io.BufferedReader; import java.io.InputStream;
import java.io.File; import java.io.Reader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/** /**
* Test the German stemmer. The stemming algorithm is known to work less * Test the German stemmer. The stemming algorithm is known to work less
@ -38,25 +38,18 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
public class TestGermanStemFilter extends BaseTokenStreamTestCase { public class TestGermanStemFilter extends BaseTokenStreamTestCase {
public void testStemming() throws Exception { public void testStemming() throws Exception {
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("")); Analyzer analyzer = new ReusableAnalyzerBase() {
TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer)); @Override
// read test cases from external file: protected TokenStreamComponents createComponents(String fieldName,
InputStreamReader isr = new InputStreamReader(getClass().getResourceAsStream("data.txt"), "iso-8859-1"); Reader reader) {
BufferedReader breader = new BufferedReader(isr); Tokenizer t = new KeywordTokenizer(reader);
while(true) { return new TokenStreamComponents(t,
String line = breader.readLine(); new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
if (line == null)
break;
line = line.trim();
if (line.startsWith("#") || line.equals(""))
continue; // ignore comments and empty lines
String[] parts = line.split(";");
//System.out.println(parts[0] + " -- " + parts[1]);
tokenizer.reset(new StringReader(parts[0]));
filter.reset();
assertTokenStreamContents(filter, new String[] { parts[1] });
} }
breader.close(); };
isr.close();
InputStream vocOut = getClass().getResourceAsStream("data.txt");
assertVocabulary(analyzer, vocOut);
vocOut.close();
} }
} }

View File

@ -1,48 +1,48 @@
# German special characters are replaced: # German special characters are replaced:
häufig;haufig häufig haufig
# here the stemmer works okay, it maps related words to the same stem: # here the stemmer works okay, it maps related words to the same stem:
abschließen;abschliess abschließen abschliess
abschließender;abschliess abschließender abschliess
abschließendes;abschliess abschließendes abschliess
abschließenden;abschliess abschließenden abschliess
Tisch;tisch Tisch tisch
Tische;tisch Tische tisch
Tischen;tisch Tischen tisch
Haus;hau Haus hau
Hauses;hau Hauses hau
Häuser;hau Häuser hau
Häusern;hau Häusern hau
# here's a case where overstemming occurs, i.e. a word is # here's a case where overstemming occurs, i.e. a word is
# mapped to the same stem as unrelated words: # mapped to the same stem as unrelated words:
hauen;hau hauen hau
# here's a case where understemming occurs, i.e. two related words # here's a case where understemming occurs, i.e. two related words
# are not mapped to the same stem. This is the case with basically # are not mapped to the same stem. This is the case with basically
# all irregular forms: # all irregular forms:
Drama;drama Drama drama
Dramen;dram Dramen dram
# replace "ß" with 'ss': # replace "ß" with 'ss':
Ausmaß;ausmass Ausmaß ausmass
# fake words to test if suffixes are cut off: # fake words to test if suffixes are cut off:
xxxxxe;xxxxx xxxxxe xxxxx
xxxxxs;xxxxx xxxxxs xxxxx
xxxxxn;xxxxx xxxxxn xxxxx
xxxxxt;xxxxx xxxxxt xxxxx
xxxxxem;xxxxx xxxxxem xxxxx
xxxxxer;xxxxx xxxxxer xxxxx
xxxxxnd;xxxxx xxxxxnd xxxxx
# the suffixes are also removed when combined: # the suffixes are also removed when combined:
xxxxxetende;xxxxx xxxxxetende xxxxx
# words that are shorter than four characters are not changed: # words that are shorter than four characters are not changed:
xxe;xxe xxe xxe
# -em and -er are not removed from words shorter than five characters: # -em and -er are not removed from words shorter than five characters:
xxem;xxem xxem xxem
xxer;xxer xxer xxer
# -nd is not removed from words shorter than six characters: # -nd is not removed from words shorter than six characters:
xxxnd;xxxnd xxxnd xxxnd

View File

@ -0,0 +1,54 @@
package org.apache.lucene.analysis.en;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
/**
 * Simple tests for {@link EnglishMinimalStemFilter}
 */
public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by an {@link EnglishMinimalStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final EnglishMinimalStemFilter stemmer = new EnglishMinimalStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /**
   * Test some examples from various papers about this technique;
   * each row is { input term, expected stem }.
   */
  public void testExamples() throws IOException {
    final String[][] examples = {
      { "queries", "query" },
      { "phrases", "phrase" },
      { "corpus", "corpus" },
      { "stress", "stress" },
      { "kings", "king" },
      { "panels", "panel" },
      { "aerodynamics", "aerodynamic" },
      { "congress", "congress" },
      { "serious", "serious" },
    };
    for (final String[] example : examples) {
      checkOneTerm(analyzer, example[0], example[1]);
    }
  }
}

View File

@ -17,21 +17,22 @@ package org.apache.lucene.analysis.en;
* limitations under the License. * limitations under the License.
*/ */
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.Reader;
import java.io.InputStreamReader;
import java.io.StringReader; import java.io.StringReader;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/** /**
* Test the PorterStemFilter with Martin Porter's test data. * Test the PorterStemFilter with Martin Porter's test data.
*/ */
@ -41,26 +42,16 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
* The output should be the same as the string in output.txt * The output should be the same as the string in output.txt
*/ */
public void testPorterStemFilter() throws Exception { public void testPorterStemFilter() throws Exception {
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("")); Analyzer a = new ReusableAnalyzerBase() {
TokenStream filter = new PorterStemFilter(tokenizer); @Override
ZipFile zipFile = new ZipFile(getDataFile("porterTestData.zip")); protected TokenStreamComponents createComponents(String fieldName,
InputStream voc = zipFile.getInputStream(zipFile.getEntry("voc.txt")); Reader reader) {
InputStream out = zipFile.getInputStream(zipFile.getEntry("output.txt")); Tokenizer t = new KeywordTokenizer(reader);
BufferedReader vocReader = new BufferedReader(new InputStreamReader( return new TokenStreamComponents(t, new PorterStemFilter(t));
voc, "UTF-8"));
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
out, "UTF-8"));
String inputWord = null;
while ((inputWord = vocReader.readLine()) != null) {
String expectedWord = outputReader.readLine();
assertNotNull(expectedWord);
tokenizer.reset(new StringReader(inputWord));
filter.reset();
assertTokenStreamContents(filter, new String[] { expectedWord });
} }
vocReader.close(); };
outputReader.close();
zipFile.close(); assertVocabulary(a, getDataFile("porterTestData.zip"), "voc.txt", "output.txt");
} }
public void testWithKeywordAttribute() throws IOException { public void testWithKeywordAttribute() throws IOException {

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.es;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link SpanishLightStemFilter}
 */
public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by a {@link SpanishLightStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final SpanishLightStemFilter stemmer = new SpanishLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /** Checks the analyzer against the reference implementation's vocabulary (eslight.txt in eslighttestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("eslighttestdata.zip"), "eslight.txt");
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.fi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link FinnishLightStemFilter}
 */
public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by a {@link FinnishLightStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final FinnishLightStemFilter stemmer = new FinnishLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /** Checks the analyzer against the reference implementation's vocabulary (filight.txt in filighttestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
  }
}

View File

@ -0,0 +1,162 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link FrenchLightStemFilter}
 */
public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by a {@link FrenchLightStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final FrenchLightStemFilter stemmer = new FrenchLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /**
   * Test some examples from the paper; each row is { input term, expected stem }.
   * Rows are checked in the order listed.
   */
  public void testExamples() throws IOException {
    final String[][] examples = {
      { "chevaux", "cheval" },
      { "cheval", "cheval" },

      { "hiboux", "hibou" },
      { "hibou", "hibou" },

      { "chantés", "chant" },
      { "chanter", "chant" },
      { "chante", "chant" },
      { "chant", "chant" },

      { "baronnes", "baron" },
      { "barons", "baron" },
      { "baron", "baron" },

      { "peaux", "peau" },
      { "peau", "peau" },

      { "anneaux", "aneau" },
      { "anneau", "aneau" },

      { "neveux", "neveu" },
      { "neveu", "neveu" },

      { "affreux", "afreu" },
      { "affreuse", "afreu" },

      { "investissement", "investi" },
      { "investir", "investi" },

      { "assourdissant", "asourdi" },
      { "assourdir", "asourdi" },

      { "pratiquement", "pratiqu" },
      { "pratique", "pratiqu" },

      { "administrativement", "administratif" },
      { "administratif", "administratif" },

      { "justificatrice", "justifi" },
      { "justificateur", "justifi" },
      { "justifier", "justifi" },

      { "educatrice", "eduqu" },
      { "eduquer", "eduqu" },

      { "communicateur", "comuniqu" },
      { "communiquer", "comuniqu" },

      { "accompagnatrice", "acompagn" },
      { "accompagnateur", "acompagn" },

      { "administrateur", "administr" },
      { "administrer", "administr" },

      { "productrice", "product" },
      { "producteur", "product" },

      { "acheteuse", "achet" },
      { "acheteur", "achet" },

      { "planteur", "plant" },
      { "plante", "plant" },

      { "poreuse", "poreu" },
      { "poreux", "poreu" },

      { "plieuse", "plieu" },

      { "bijoutière", "bijouti" },
      { "bijoutier", "bijouti" },

      { "caissière", "caisi" },
      { "caissier", "caisi" },

      { "abrasive", "abrasif" },
      { "abrasif", "abrasif" },

      { "folle", "fou" },
      { "fou", "fou" },

      { "personnelle", "person" },
      { "personne", "person" },

      // algo bug: too short length
      // { "personnel", "person" },

      { "complète", "complet" },
      { "complet", "complet" },

      { "aromatique", "aromat" },

      { "faiblesse", "faibl" },
      { "faible", "faibl" },

      { "patinage", "patin" },
      { "patin", "patin" },

      { "sonorisation", "sono" },

      { "ritualisation", "rituel" },
      { "rituel", "rituel" },

      // algo bug: masked by rules above
      // { "colonisateur", "colon" },

      { "nomination", "nomin" },

      { "disposition", "dispos" },
      { "dispose", "dispos" },
    };
    for (final String[] example : examples) {
      checkOneTerm(analyzer, example[0], example[1]);
    }
  }

  /** Checks the analyzer against the reference implementation's vocabulary (frlight.txt in frlighttestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
  }
}

View File

@ -0,0 +1,62 @@
package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link FrenchMinimalStemFilter}
 */
public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by a {@link FrenchMinimalStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final FrenchMinimalStemFilter stemmer = new FrenchMinimalStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /** Test some examples from the paper; each row is { input term, expected stem }. */
  public void testExamples() throws IOException {
    final String[][] examples = {
      { "chevaux", "cheval" },
      { "hiboux", "hibou" },

      { "chantés", "chant" },
      { "chanter", "chant" },
      { "chante", "chant" },

      { "baronnes", "baron" },
      { "barons", "baron" },
      { "baron", "baron" },
    };
    for (final String[] example : examples) {
      checkOneTerm(analyzer, example[0], example[1]);
    }
  }

  /** Checks the analyzer against the reference implementation's vocabulary (frminimal.txt in frminimaltestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.hu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link HungarianLightStemFilter}
 */
public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by a {@link HungarianLightStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final HungarianLightStemFilter stemmer = new HungarianLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /** Checks the analyzer against the reference implementation's vocabulary (hulight.txt in hulighttestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.it;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link ItalianLightStemFilter}
 */
public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {

  /** Analysis chain under test: a {@link WhitespaceTokenizer} wrapped by an {@link ItalianLightStemFilter}. */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      final ItalianLightStemFilter stemmer = new ItalianLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmer);
    }
  };

  /** Checks the analyzer against the reference implementation's vocabulary (itlight.txt in itlighttestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("itlighttestdata.zip"), "itlight.txt");
  }
}

View File

@ -0,0 +1,95 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Simple tests for {@link PortugueseLightStemFilter}
 */
public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {

  /**
   * Analysis chain under test: {@link StandardTokenizer}, then
   * {@link LowerCaseFilter}, then {@link PortugueseLightStemFilter}.
   */
  private final Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      final TokenStream stemmed = new PortugueseLightStemFilter(
          new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer));
      return new TokenStreamComponents(tokenizer, stemmed);
    }
  };

  /**
   * Test the example from the paper "Assessing the impact of stemming accuracy
   * on information retrieval"
   */
  public void testExamples() throws IOException {
    final String sentence =
        "O debate político, pelo menos o que vem a público, parece, de modo nada "
        + "surpreendente, restrito a temas menores. Mas há, evidentemente, "
        + "grandes questões em jogo nas eleições que se aproximam.";
    final String[] expectedTerms = {
      "o", "debat", "politic", "pelo", "meno", "o", "que", "vem", "a",
      "public", "parec", "de", "modo", "nada", "surpreendent", "restrit",
      "a", "tema", "menor", "mas", "", "evident", "grand", "questa",
      "em", "jogo", "nas", "eleica", "que", "se", "aproximam"
    };
    assertAnalyzesTo(analyzer, sentence, expectedTerms);
  }

  /**
   * Test examples from the c implementation; each row is { input term, expected stem }.
   */
  public void testMoreExamples() throws IOException {
    final String[][] examples = {
      { "doutores", "doutor" },
      { "doutor", "doutor" },

      { "homens", "homem" },
      { "homem", "homem" },

      { "papéis", "papel" },
      { "papel", "papel" },

      { "normais", "normal" },
      { "normal", "normal" },

      { "lencóis", "lencol" },
      { "lencol", "lencol" },

      { "barris", "barril" },
      { "barril", "barril" },

      { "botões", "bota" },
      { "botão", "bota" },
    };
    for (final String[] example : examples) {
      checkOneTerm(analyzer, example[0], example[1]);
    }
  }

  /** Checks the analyzer against the reference implementation's vocabulary (ptlight.txt in ptlighttestdata.zip). */
  public void testVocabulary() throws IOException {
    assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
  }
}

View File

@ -0,0 +1,69 @@
package org.apache.lucene.analysis.pt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Exercises {@link PortugueseMinimalStemFilter} on a sample sentence and on
 * a bundled vocabulary file.
 */
public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
  /** StandardTokenizer, then LowerCaseFilter, then the filter under test. */
  private Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      TokenStream lowercased = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
      TokenStream stemmed = new PortugueseMinimalStemFilter(lowercased);
      return new TokenStreamComponents(tokenizer, stemmed);
    }
  };

  /**
   * Test the example from the paper "Assessing the impact of stemming accuracy
   * on information retrieval"
   */
  public void testExamples() throws IOException {
    String text =
        "O debate político, pelo menos o que vem a público, parece, de modo nada "
        + "surpreendente, restrito a temas menores. Mas há, evidentemente, "
        + "grandes questões em jogo nas eleições que se aproximam.";
    // One expected term per input word, in order; note that the expected
    // term for "há" is the empty string.
    String[] expectedTerms = {
      "o", "debate", "político", "pelo", "menos", "o", "que", "vem", "a",
      "público", "parece", "de", "modo", "nada", "surpreendente", "restrito",
      "a", "tema", "menor", "mas", "", "evidentemente", "grande", "questão",
      "em", "jogo", "na", "eleição", "que", "se", "aproximam"
    };
    assertAnalyzesTo(analyzer, text, expectedTerms);
  }

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(
        analyzer,
        getDataFile("ptminimaltestdata.zip"),
        "ptminimal.txt");
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Checks {@link RussianLightStemFilter} against a bundled vocabulary file.
 */
public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
  /** WhitespaceTokenizer feeding directly into the filter under test. */
  private Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      RussianLightStemFilter stemmed = new RussianLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmed);
    }
  };

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(
        analyzer,
        getDataFile("rulighttestdata.zip"),
        "rulight.txt");
  }
}

View File

@ -17,71 +17,35 @@ package org.apache.lucene.analysis.ru;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import java.io.BufferedReader; import java.io.IOException;
import java.io.File; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.Reader;
import java.io.FileInputStream;
import java.util.ArrayList; import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/** /**
* @deprecated Remove this test class (and its datafiles!) in Lucene 4.0 * @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
*/ */
@Deprecated @Deprecated
public class TestRussianStem extends LuceneTestCase public class TestRussianStem extends LuceneTestCase {
{ public void testStem() throws IOException {
private ArrayList<String> words = new ArrayList<String>(); Analyzer a = new ReusableAnalyzerBase() {
private ArrayList<String> stems = new ArrayList<String>();
public TestRussianStem(String name)
{
super(name);
}
/**
* @see TestCase#setUp()
*/
@Override @Override
protected void setUp() throws Exception { protected TokenStreamComponents createComponents(String fieldName,
super.setUp(); Reader reader) {
//System.out.println(new java.util.Date()); Tokenizer t = new KeywordTokenizer(reader);
String str; return new TokenStreamComponents(t, new RussianStemFilter(t));
// open and read words into an array list
BufferedReader inWords =
new BufferedReader(
new InputStreamReader(
getClass().getResourceAsStream("wordsUTF8.txt"),
"UTF-8"));
while ((str = inWords.readLine()) != null)
{
words.add(str);
} }
inWords.close(); };
InputStream voc = getClass().getResourceAsStream("wordsUTF8.txt");
// open and read stems into an array list InputStream out = getClass().getResourceAsStream("stemsUTF8.txt");
BufferedReader inStems = assertVocabulary(a, voc, out);
new BufferedReader( voc.close();
new InputStreamReader( out.close();
getClass().getResourceAsStream("stemsUTF8.txt"),
"UTF-8"));
while ((str = inStems.readLine()) != null)
{
stems.add(str);
} }
inStems.close();
}
public void testStem()
{
for (int i = 0; i < words.size(); i++)
{
//if ( (i % 100) == 0 ) System.err.println(i);
String realStem =
RussianStemmer.stemWord(
words.get(i));
assertEquals("unicode", stems.get(i), realStem);
}
}
} }

View File

@ -17,38 +17,21 @@ package org.apache.lucene.analysis.snowball;
* limitations under the License. * limitations under the License.
*/ */
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.Reader;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.util.LuceneTestCase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/** /**
* Test the snowball filters against the snowball data tests * Test the snowball filters against the snowball data tests
*/ */
public class TestSnowballVocab extends BaseTokenStreamTestCase { public class TestSnowballVocab extends LuceneTestCase {
private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
ZipFile zipFile = null;
@Override
protected void setUp() throws Exception {
super.setUp();
this.zipFile = new ZipFile(getDataFile("TestSnowballVocabData.zip"));
}
@Override
protected void tearDown() throws Exception {
this.zipFile.close();
this.zipFile = null;
super.tearDown();
}
/** /**
* Run all languages against their snowball vocabulary tests. * Run all languages against their snowball vocabulary tests.
*/ */
@ -82,25 +65,20 @@ public class TestSnowballVocab extends BaseTokenStreamTestCase {
* For the supplied language, run the stemmer against all strings in voc.txt * For the supplied language, run the stemmer against all strings in voc.txt
* The output should be the same as the string in output.txt * The output should be the same as the string in output.txt
*/ */
private void assertCorrectOutput(String snowballLanguage, String dataDirectory) private void assertCorrectOutput(final String snowballLanguage, String dataDirectory)
throws IOException { throws IOException {
if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage); if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage);
TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
InputStream voc = zipFile.getInputStream(zipFile.getEntry(dataDirectory + "/voc.txt")); Analyzer a = new ReusableAnalyzerBase() {
InputStream out = zipFile.getInputStream(zipFile.getEntry(dataDirectory + "/output.txt")); @Override
BufferedReader vocReader = new BufferedReader(new InputStreamReader( protected TokenStreamComponents createComponents(String fieldName,
voc, "UTF-8")); Reader reader) {
BufferedReader outputReader = new BufferedReader(new InputStreamReader( Tokenizer t = new KeywordTokenizer(reader);
out, "UTF-8")); return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage));
String inputWord = null;
while ((inputWord = vocReader.readLine()) != null) {
String expectedWord = outputReader.readLine();
assertNotNull(expectedWord);
tokenizer.reset(new StringReader(inputWord));
filter.reset();
assertTokenStreamContents(filter, new String[] {expectedWord});
} }
vocReader.close(); };
outputReader.close();
assertVocabulary(a, getDataFile("TestSnowballVocabData.zip"),
dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
} }
} }

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.sv;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
/**
 * Checks {@link SwedishLightStemFilter} against a bundled vocabulary file.
 */
public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
  /** WhitespaceTokenizer feeding directly into the filter under test. */
  private Analyzer analyzer = new ReusableAnalyzerBase() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      SwedishLightStemFilter stemmed = new SwedishLightStemFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stemmed);
    }
  };

  /** Test against a vocabulary from the reference impl */
  public void testVocabulary() throws IOException {
    assertVocabulary(
        analyzer,
        getDataFile("svlighttestdata.zip"),
        "svlight.txt");
  }
}

View File

@ -0,0 +1,83 @@
package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Assert;
/**
 * Utility class for doing vocabulary-based stemming tests.
 * <p>
 * Every overload feeds input words through the supplied analyzer one at a
 * time and checks, via {@link BaseTokenStreamTestCase#checkOneTermReuse},
 * that each produces the expected term. All data is decoded as UTF-8.
 */
public class VocabularyAssert {
  /**
   * Run a vocabulary test against two data files.
   * <p>
   * Line <i>i</i> of {@code voc} is the input word and line <i>i</i> of
   * {@code out} is its expected output. Lines of {@code out} beyond the end
   * of {@code voc} are not examined. Neither stream is closed by this
   * method; that is left to the caller.
   *
   * @param a analyzer under test
   * @param voc stream of input words, one per line
   * @param out stream of expected outputs, one per line
   * @throws IOException if reading either stream fails
   */
  public static void assertVocabulary(Analyzer a, InputStream voc, InputStream out)
      throws IOException {
    BufferedReader vocReader = new BufferedReader(
        new InputStreamReader(voc, "UTF-8"));
    BufferedReader outputReader = new BufferedReader(
        new InputStreamReader(out, "UTF-8"));
    String inputWord = null;
    while ((inputWord = vocReader.readLine()) != null) {
      String expectedWord = outputReader.readLine();
      // The output file ran out before the vocabulary file did.
      Assert.assertNotNull(
          "no expected output for input word: " + inputWord, expectedWord);
      BaseTokenStreamTestCase.checkOneTermReuse(a, inputWord, expectedWord);
    }
  }

  /**
   * Run a vocabulary test against one file: tab separated.
   * <p>
   * Each data line holds the input word, a tab, and the expected output.
   * Lines starting with {@code #} and blank lines are skipped. The stream is
   * not closed by this method; that is left to the caller.
   *
   * @param a analyzer under test
   * @param vocOut stream of tab-separated input/output pairs
   * @throws IOException if reading the stream fails
   */
  public static void assertVocabulary(Analyzer a, InputStream vocOut)
      throws IOException {
    BufferedReader vocReader = new BufferedReader(
        new InputStreamReader(vocOut, "UTF-8"));
    String inputLine = null;
    while ((inputLine = vocReader.readLine()) != null) {
      if (inputLine.startsWith("#") || inputLine.trim().length() == 0)
        continue; /* comment */
      String words[] = inputLine.split("\t");
      // Fail with the offending line instead of an anonymous
      // ArrayIndexOutOfBoundsException. split() drops trailing empty fields,
      // so a line whose output column is empty is reported here as well.
      Assert.assertTrue(
          "malformed vocabulary line (expected input<TAB>output): " + inputLine,
          words.length >= 2);
      BaseTokenStreamTestCase.checkOneTermReuse(a, words[0], words[1]);
    }
  }

  /**
   * Run a vocabulary test against two data files inside a zip file.
   *
   * @param a analyzer under test
   * @param zipFile zip archive holding both data files
   * @param voc name of the entry with input words, one per line
   * @param out name of the entry with expected outputs, one per line
   * @throws IOException if the archive or its entries cannot be read
   */
  public static void assertVocabulary(Analyzer a, File zipFile, String voc, String out)
      throws IOException {
    ZipFile zip = new ZipFile(zipFile);
    try {
      InputStream v = openEntry(zip, voc);
      InputStream o = openEntry(zip, out);
      assertVocabulary(a, v, o);
      v.close();
      o.close();
    } finally {
      // Runs on assertion failure too, so the archive handle is never leaked;
      // closing the ZipFile also closes any entry streams still open.
      zip.close();
    }
  }

  /**
   * Run a vocabulary test against a tab-separated data file inside a zip file.
   *
   * @param a analyzer under test
   * @param zipFile zip archive holding the data file
   * @param vocOut name of the entry with tab-separated input/output pairs
   * @throws IOException if the archive or its entry cannot be read
   */
  public static void assertVocabulary(Analyzer a, File zipFile, String vocOut)
      throws IOException {
    ZipFile zip = new ZipFile(zipFile);
    try {
      InputStream vo = openEntry(zip, vocOut);
      assertVocabulary(a, vo);
      vo.close();
    } finally {
      // Runs on assertion failure too, so the archive handle is never leaked;
      // closing the ZipFile also closes any entry streams still open.
      zip.close();
    }
  }

  /**
   * Opens the named entry of {@code zip}, failing the test with a descriptive
   * message (rather than a NullPointerException) when the entry is absent.
   */
  private static InputStream openEntry(ZipFile zip, String name) throws IOException {
    java.util.zip.ZipEntry entry = zip.getEntry(name);
    Assert.assertNotNull(
        "zip entry not found: " + name + " in " + zip.getName(), entry);
    return zip.getInputStream(entry);
  }
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
/**
 * Factory for {@link EnglishMinimalStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class EnglishMinimalStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link EnglishMinimalStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link EnglishMinimalStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new EnglishMinimalStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
/**
 * Factory for {@link FinnishLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class FinnishLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link FinnishLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link FinnishLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new FinnishLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
/**
 * Factory for {@link FrenchLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class FrenchLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link FrenchLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link FrenchLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new FrenchLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
/**
 * Factory for {@link FrenchMinimalStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class FrenchMinimalStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link FrenchMinimalStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link FrenchMinimalStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new FrenchMinimalStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanLightStemFilter;
/**
 * Factory for {@link GermanLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class GermanLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link GermanLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link GermanLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new GermanLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanMinimalStemFilter;
/**
 * Factory for {@link GermanMinimalStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class GermanMinimalStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link GermanMinimalStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link GermanMinimalStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new GermanMinimalStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hu.HungarianLightStemFilter;
/**
 * Factory for {@link HungarianLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class HungarianLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link HungarianLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link HungarianLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new HungarianLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.it.ItalianLightStemFilter;
/**
 * Factory for {@link ItalianLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class ItalianLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link ItalianLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link ItalianLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new ItalianLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pt.PortugueseLightStemFilter;
/**
 * Factory for {@link PortugueseLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class PortugueseLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link PortugueseLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link PortugueseLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new PortugueseLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pt.PortugueseMinimalStemFilter;
/**
 * Factory for {@link PortugueseMinimalStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class PortugueseMinimalStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link PortugueseMinimalStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link PortugueseMinimalStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new PortugueseMinimalStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianLightStemFilter;
/**
 * Factory for {@link RussianLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class RussianLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link RussianLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link RussianLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new RussianLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
/**
 * Factory for {@link SpanishLightStemFilter}.
 * <p>
 * Declares no state or configuration of its own beyond what
 * {@link BaseTokenFilterFactory} provides.
 */
public class SpanishLightStemFilterFactory extends BaseTokenFilterFactory {
/**
 * Wraps the given stream in a newly constructed {@link SpanishLightStemFilter}.
 *
 * @param input the token stream to be filtered
 * @return a new {@link SpanishLightStemFilter} constructed over {@code input}
 */
public TokenStream create(TokenStream input) {
return new SpanishLightStemFilter(input);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.sv.SwedishLightStemFilter;
/**
 * Factory for {@link SwedishLightStemFilter}.
 */
public class SwedishLightStemFilterFactory extends BaseTokenFilterFactory {
  /**
   * Wraps the given stream in a new {@link SwedishLightStemFilter}.
   *
   * @param input the token stream to be filtered
   * @return a {@link SwedishLightStemFilter} reading from {@code input}
   */
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new SwedishLightStemFilter(input);
    return stemmed;
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link EnglishMinimalStemFilterFactory} builds a working filter.
 */
public class TestEnglishMinimalStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "bricks" through the factory-built filter and expects the single token "brick". */
  public void testStemming() throws Exception {
    final String[] expected = { "brick" };
    final Reader input = new StringReader("bricks");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new EnglishMinimalStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link FinnishLightStemFilterFactory} builds a working filter.
 */
public class TestFinnishLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "aseistettujen" through the factory-built filter and expects the single token "aseistet". */
  public void testStemming() throws Exception {
    final String[] expected = { "aseistet" };
    final Reader input = new StringReader("aseistettujen");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new FinnishLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link FrenchLightStemFilterFactory} builds a working filter.
 */
public class TestFrenchLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "administrativement" through the factory-built filter and expects "administratif". */
  public void testStemming() throws Exception {
    final String[] expected = { "administratif" };
    final Reader input = new StringReader("administrativement");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new FrenchLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link FrenchMinimalStemFilterFactory} builds a working filter.
 */
public class TestFrenchMinimalStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "chevaux" through the factory-built filter and expects the single token "cheval". */
  public void testStemming() throws Exception {
    final String[] expected = { "cheval" };
    final Reader input = new StringReader("chevaux");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new FrenchMinimalStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link GermanLightStemFilterFactory} builds a working filter.
 */
public class TestGermanLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "häuser" through the factory-built filter and expects the single token "haus". */
  public void testStemming() throws Exception {
    final String[] expected = { "haus" };
    final Reader input = new StringReader("häuser");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new GermanLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link GermanMinimalStemFilterFactory} builds a working filter.
 */
public class TestGermanMinimalStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "bilder" through the factory-built filter and expects the single token "bild". */
  public void testStemming() throws Exception {
    final String[] expected = { "bild" };
    final Reader input = new StringReader("bilder");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new GermanMinimalStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link HungarianLightStemFilterFactory} builds a working filter.
 */
public class TestHungarianLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "házakat" through the factory-built filter and expects the single token "haz". */
  public void testStemming() throws Exception {
    final String[] expected = { "haz" };
    final Reader input = new StringReader("házakat");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new HungarianLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link ItalianLightStemFilterFactory} builds a working filter.
 */
public class TestItalianLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "ragazzo ragazzi" through the factory-built filter and expects "ragazz" twice. */
  public void testStemming() throws Exception {
    final String[] expected = { "ragazz", "ragazz" };
    final Reader input = new StringReader("ragazzo ragazzi");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new ItalianLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link PortugueseLightStemFilterFactory} builds a working filter.
 */
public class TestPortugueseLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "evidentemente" through the factory-built filter and expects the single token "evident". */
  public void testStemming() throws Exception {
    final String[] expected = { "evident" };
    final Reader input = new StringReader("evidentemente");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new PortugueseLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link PortugueseMinimalStemFilterFactory} builds a working filter.
 */
public class TestPortugueseMinimalStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "questões" through the factory-built filter and expects the single token "questão". */
  public void testStemming() throws Exception {
    final String[] expected = { "questão" };
    final Reader input = new StringReader("questões");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new PortugueseMinimalStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link RussianLightStemFilterFactory} builds a working filter.
 */
public class TestRussianLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "журналы" through the factory-built filter and expects the single token "журнал". */
  public void testStemming() throws Exception {
    final String[] expected = { "журнал" };
    final Reader input = new StringReader("журналы");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new RussianLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link SpanishLightStemFilterFactory} builds a working filter.
 */
public class TestSpanishLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "sociedades" through the factory-built filter and expects the single token "sociedad". */
  public void testStemming() throws Exception {
    final String[] expected = { "sociedad" };
    final Reader input = new StringReader("sociedades");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new SpanishLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}

View File

@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
 * Sanity check that {@link SwedishLightStemFilterFactory} builds a working filter.
 */
public class TestSwedishLightStemFilterFactory extends BaseTokenTestCase {
  /** Feeds "äpplen äpple" through the factory-built filter and expects "äppl" twice. */
  public void testStemming() throws Exception {
    final String[] expected = { "äppl", "äppl" };
    final Reader input = new StringReader("äpplen äpple");
    final TokenStream tokens = new WhitespaceTokenizer(DEFAULT_VERSION, input);
    final TokenStream stemmed = new SwedishLightStemFilterFactory().create(tokens);
    assertTokenStreamContents(stemmed, expected);
  }
}